From: Greg Kroah-Hartman Date: Fri, 17 Jul 2015 00:58:49 +0000 (-0700) Subject: 4.0-stable patches X-Git-Tag: v4.0.9~11 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=add1b9b215f9af24dff70d17f6b2e12d55840db8;p=thirdparty%2Fkernel%2Fstable-queue.git 4.0-stable patches added patches: arc-add-compiler-barrier-to-llsc-based-cmpxchg.patch arc-add-smp-barriers-around-atomics-per-documentation-atomic_ops.txt.patch arm64-do-not-attempt-to-use-init_mm-in-reset_context.patch arm64-entry-fix-context-tracking-for-el0_sp_pc.patch arm64-mm-fix-freeing-of-the-wrong-memmap-entries-with-sparsemem_vmemmap.patch arm64-vdso-work-around-broken-elf-toolchains-in-makefile.patch mei-me-wait-for-power-gating-exit-confirmation.patch mei-txe-reduce-suspend-resume-time.patch mm-kmemleak-allow-safe-memory-scanning-during-kmemleak-disabling.patch mm-kmemleak_alloc_percpu-should-follow-the-gfp-from-per_alloc.patch mm-thp-respect-mpol_preferred-policy-with-non-local-node.patch --- diff --git a/queue-4.0/arc-add-compiler-barrier-to-llsc-based-cmpxchg.patch b/queue-4.0/arc-add-compiler-barrier-to-llsc-based-cmpxchg.patch new file mode 100644 index 00000000000..1ddc114f218 --- /dev/null +++ b/queue-4.0/arc-add-compiler-barrier-to-llsc-based-cmpxchg.patch @@ -0,0 +1,60 @@ +From d57f727264f1425a94689bafc7e99e502cb135b5 Mon Sep 17 00:00:00 2001 +From: Vineet Gupta +Date: Thu, 13 Nov 2014 15:54:01 +0530 +Subject: ARC: add compiler barrier to LLSC based cmpxchg + +From: Vineet Gupta + +commit d57f727264f1425a94689bafc7e99e502cb135b5 upstream. + +When auditing cmpxchg call sites, Chuck noted that gcc was optimizing +away some of the desired LDs. + +| do { +| new = old = *ipi_data_ptr; +| new |= 1U << msg; +| } while (cmpxchg(ipi_data_ptr, old, new) != old); + +was generating to below + +| 8015cef8: ld r2,[r4,0] <-- First LD +| 8015cefc: bset r1,r2,r1 +| +| 8015cf00: llock r3,[r4] <-- atomic op +| 8015cf04: brne r3,r2,8015cf10 +| 8015cf08: scond r1,[r4] +| 8015cf0c: bnz 8015cf00 +| +| 8015cf10: brne r3,r2,8015cf00 <-- Branch doesn't go to orig LD + +Although this was fixed by adding a ACCESS_ONCE in this call site, it +seems safer (for now at least) to add compiler barrier to LLSC based +cmpxchg + +Reported-by: Chuck Jordan +Acked-by: Peter Zijlstra (Intel) +Signed-off-by: Vineet Gupta +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arc/include/asm/cmpxchg.h | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/arch/arc/include/asm/cmpxchg.h ++++ b/arch/arc/include/asm/cmpxchg.h +@@ -33,10 +33,11 @@ __cmpxchg(volatile void *ptr, unsigned l + " scond %3, [%1] \n" + " bnz 1b \n" + "2: \n" +- : "=&r"(prev) +- : "r"(ptr), "ir"(expected), +- "r"(new) /* can't be "ir". scond can't take limm for "b" */ +- : "cc"); ++ : "=&r"(prev) /* Early clobber, to prevent reg reuse */ ++ : "r"(ptr), /* Not "m": llock only supports reg direct addr mode */ ++ "ir"(expected), ++ "r"(new) /* can't be "ir". 
scond can't take LIMM for "b" */ ++ : "cc", "memory"); /* so that gcc knows memory is being written here */ + + smp_mb(); + diff --git a/queue-4.0/arc-add-smp-barriers-around-atomics-per-documentation-atomic_ops.txt.patch b/queue-4.0/arc-add-smp-barriers-around-atomics-per-documentation-atomic_ops.txt.patch new file mode 100644 index 00000000000..9a6bfb84316 --- /dev/null +++ b/queue-4.0/arc-add-smp-barriers-around-atomics-per-documentation-atomic_ops.txt.patch @@ -0,0 +1,289 @@ +From 2576c28e3f623ed401db7e6197241865328620ef Mon Sep 17 00:00:00 2001 +From: Vineet Gupta +Date: Thu, 20 Nov 2014 15:42:09 +0530 +Subject: ARC: add smp barriers around atomics per Documentation/atomic_ops.txt + +From: Vineet Gupta + +commit 2576c28e3f623ed401db7e6197241865328620ef upstream. + + - arch_spin_lock/unlock were lacking the ACQUIRE/RELEASE barriers + Since ARCv2 only provides load/load, store/store and all/all, we need + the full barrier + + - LLOCK/SCOND based atomics, bitops, cmpxchg, which return modified + values were lacking the explicit smp barriers. + + - Non LLOCK/SCOND varaints don't need the explicit barriers since that + is implicity provided by the spin locks used to implement the + critical section (the spin lock barriers in turn are also fixed in + this commit as explained above + +Cc: Paul E. McKenney +Acked-by: Peter Zijlstra (Intel) +Signed-off-by: Vineet Gupta +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arc/include/asm/atomic.h | 21 +++++++++++++++++++++ + arch/arc/include/asm/bitops.h | 19 +++++++++++++++++++ + arch/arc/include/asm/cmpxchg.h | 17 +++++++++++++++++ + arch/arc/include/asm/spinlock.h | 32 ++++++++++++++++++++++++++++++++ + 4 files changed, 89 insertions(+) + +--- a/arch/arc/include/asm/atomic.h ++++ b/arch/arc/include/asm/atomic.h +@@ -43,6 +43,12 @@ static inline int atomic_##op##_return(i + { \ + unsigned int temp; \ + \ ++ /* \ ++ * Explicit full memory barrier needed before/after as \ ++ * LLOCK/SCOND thmeselves don't provide any such semantics \ ++ */ \ ++ smp_mb(); \ ++ \ + __asm__ __volatile__( \ + "1: llock %0, [%1] \n" \ + " " #asm_op " %0, %0, %2 \n" \ +@@ -52,6 +58,8 @@ static inline int atomic_##op##_return(i + : "r"(&v->counter), "ir"(i) \ + : "cc"); \ + \ ++ smp_mb(); \ ++ \ + return temp; \ + } + +@@ -105,6 +113,9 @@ static inline int atomic_##op##_return(i + unsigned long flags; \ + unsigned long temp; \ + \ ++ /* \ ++ * spin lock/unlock provides the needed smp_mb() before/after \ ++ */ \ + atomic_ops_lock(flags); \ + temp = v->counter; \ + temp c_op i; \ +@@ -142,9 +153,19 @@ ATOMIC_OP(and, &=, and) + #define __atomic_add_unless(v, a, u) \ + ({ \ + int c, old; \ ++ \ ++ /* \ ++ * Explicit full memory barrier needed before/after as \ ++ * LLOCK/SCOND thmeselves don't provide any such semantics \ ++ */ \ ++ smp_mb(); \ ++ \ + c = atomic_read(v); \ + while (c != (u) && (old = atomic_cmpxchg((v), c, c + (a))) != c)\ + c = old; \ ++ \ ++ smp_mb(); \ ++ \ + c; \ + }) + +--- a/arch/arc/include/asm/bitops.h ++++ b/arch/arc/include/asm/bitops.h +@@ -103,6 +103,12 @@ static inline int test_and_set_bit(unsig + if (__builtin_constant_p(nr)) + nr &= 0x1f; + ++ /* ++ * Explicit full memory barrier needed before/after as ++ * LLOCK/SCOND themselves don't provide any such semantics ++ */ ++ smp_mb(); ++ + __asm__ __volatile__( + "1: llock %0, [%2] \n" + " bset %1, %0, %3 \n" +@@ -112,6 +118,8 @@ static inline int test_and_set_bit(unsig + : "r"(m), "ir"(nr) + : "cc"); + ++ smp_mb(); ++ + return (old & (1 << nr)) != 0; + } + +@@ -125,6 +133,8 @@ 
test_and_clear_bit(unsigned long nr, vol + if (__builtin_constant_p(nr)) + nr &= 0x1f; + ++ smp_mb(); ++ + __asm__ __volatile__( + "1: llock %0, [%2] \n" + " bclr %1, %0, %3 \n" +@@ -134,6 +144,8 @@ test_and_clear_bit(unsigned long nr, vol + : "r"(m), "ir"(nr) + : "cc"); + ++ smp_mb(); ++ + return (old & (1 << nr)) != 0; + } + +@@ -147,6 +159,8 @@ test_and_change_bit(unsigned long nr, vo + if (__builtin_constant_p(nr)) + nr &= 0x1f; + ++ smp_mb(); ++ + __asm__ __volatile__( + "1: llock %0, [%2] \n" + " bxor %1, %0, %3 \n" +@@ -156,6 +170,8 @@ test_and_change_bit(unsigned long nr, vo + : "r"(m), "ir"(nr) + : "cc"); + ++ smp_mb(); ++ + return (old & (1 << nr)) != 0; + } + +@@ -235,6 +251,9 @@ static inline int test_and_set_bit(unsig + if (__builtin_constant_p(nr)) + nr &= 0x1f; + ++ /* ++ * spin lock/unlock provide the needed smp_mb() before/after ++ */ + bitops_lock(flags); + + old = *m; +--- a/arch/arc/include/asm/cmpxchg.h ++++ b/arch/arc/include/asm/cmpxchg.h +@@ -10,6 +10,8 @@ + #define __ASM_ARC_CMPXCHG_H + + #include ++ ++#include + #include + + #ifdef CONFIG_ARC_HAS_LLSC +@@ -19,6 +21,12 @@ __cmpxchg(volatile void *ptr, unsigned l + { + unsigned long prev; + ++ /* ++ * Explicit full memory barrier needed before/after as ++ * LLOCK/SCOND thmeselves don't provide any such semantics ++ */ ++ smp_mb(); ++ + __asm__ __volatile__( + "1: llock %0, [%1] \n" + " brne %0, %2, 2f \n" +@@ -30,6 +38,8 @@ __cmpxchg(volatile void *ptr, unsigned l + "r"(new) /* can't be "ir". scond can't take limm for "b" */ + : "cc"); + ++ smp_mb(); ++ + return prev; + } + +@@ -42,6 +52,9 @@ __cmpxchg(volatile void *ptr, unsigned l + int prev; + volatile unsigned long *p = ptr; + ++ /* ++ * spin lock/unlock provide the needed smp_mb() before/after ++ */ + atomic_ops_lock(flags); + prev = *p; + if (prev == expected) +@@ -77,12 +90,16 @@ static inline unsigned long __xchg(unsig + + switch (size) { + case 4: ++ smp_mb(); ++ + __asm__ __volatile__( + " ex %0, [%1] \n" + : "+r"(val) + : "r"(ptr) + : "memory"); + ++ smp_mb(); ++ + return val; + } + return __xchg_bad_pointer(); +--- a/arch/arc/include/asm/spinlock.h ++++ b/arch/arc/include/asm/spinlock.h +@@ -22,24 +22,46 @@ static inline void arch_spin_lock(arch_s + { + unsigned int tmp = __ARCH_SPIN_LOCK_LOCKED__; + ++ /* ++ * This smp_mb() is technically superfluous, we only need the one ++ * after the lock for providing the ACQUIRE semantics. 
++ * However doing the "right" thing was regressing hackbench ++ * so keeping this, pending further investigation ++ */ ++ smp_mb(); ++ + __asm__ __volatile__( + "1: ex %0, [%1] \n" + " breq %0, %2, 1b \n" + : "+&r" (tmp) + : "r"(&(lock->slock)), "ir"(__ARCH_SPIN_LOCK_LOCKED__) + : "memory"); ++ ++ /* ++ * ACQUIRE barrier to ensure load/store after taking the lock ++ * don't "bleed-up" out of the critical section (leak-in is allowed) ++ * http://www.spinics.net/lists/kernel/msg2010409.html ++ * ++ * ARCv2 only has load-load, store-store and all-all barrier ++ * thus need the full all-all barrier ++ */ ++ smp_mb(); + } + + static inline int arch_spin_trylock(arch_spinlock_t *lock) + { + unsigned int tmp = __ARCH_SPIN_LOCK_LOCKED__; + ++ smp_mb(); ++ + __asm__ __volatile__( + "1: ex %0, [%1] \n" + : "+r" (tmp) + : "r"(&(lock->slock)) + : "memory"); + ++ smp_mb(); ++ + return (tmp == __ARCH_SPIN_LOCK_UNLOCKED__); + } + +@@ -47,12 +69,22 @@ static inline void arch_spin_unlock(arch + { + unsigned int tmp = __ARCH_SPIN_LOCK_UNLOCKED__; + ++ /* ++ * RELEASE barrier: given the instructions avail on ARCv2, full barrier ++ * is the only option ++ */ ++ smp_mb(); ++ + __asm__ __volatile__( + " ex %0, [%1] \n" + : "+r" (tmp) + : "r"(&(lock->slock)) + : "memory"); + ++ /* ++ * superfluous, but keeping for now - see pairing version in ++ * arch_spin_lock above ++ */ + smp_mb(); + } + diff --git a/queue-4.0/arm64-do-not-attempt-to-use-init_mm-in-reset_context.patch b/queue-4.0/arm64-do-not-attempt-to-use-init_mm-in-reset_context.patch new file mode 100644 index 00000000000..38323766cb2 --- /dev/null +++ b/queue-4.0/arm64-do-not-attempt-to-use-init_mm-in-reset_context.patch @@ -0,0 +1,39 @@ +From 565630d503ef24e44c252bed55571b3a0d68455f Mon Sep 17 00:00:00 2001 +From: Catalin Marinas +Date: Fri, 12 Jun 2015 11:24:41 +0100 +Subject: arm64: Do not attempt to use init_mm in reset_context() + +From: Catalin Marinas + +commit 565630d503ef24e44c252bed55571b3a0d68455f upstream. + +After secondary CPU boot or hotplug, the active_mm of the idle thread is +&init_mm. The init_mm.pgd (swapper_pg_dir) is only meant for TTBR1_EL1 +and must not be set in TTBR0_EL1. Since when active_mm == &init_mm the +TTBR0_EL1 is already set to the reserved value, there is no need to +perform any context reset. + +Signed-off-by: Catalin Marinas +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm64/mm/context.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/arch/arm64/mm/context.c ++++ b/arch/arm64/mm/context.c +@@ -92,6 +92,14 @@ static void reset_context(void *info) + unsigned int cpu = smp_processor_id(); + struct mm_struct *mm = current->active_mm; + ++ /* ++ * current->active_mm could be init_mm for the idle thread immediately ++ * after secondary CPU boot or hotplug. TTBR0_EL1 is already set to ++ * the reserved value, so no need to reset any context. ++ */ ++ if (mm == &init_mm) ++ return; ++ + smp_rmb(); + asid = cpu_last_asid + cpu; + diff --git a/queue-4.0/arm64-entry-fix-context-tracking-for-el0_sp_pc.patch b/queue-4.0/arm64-entry-fix-context-tracking-for-el0_sp_pc.patch new file mode 100644 index 00000000000..51844d3b1c8 --- /dev/null +++ b/queue-4.0/arm64-entry-fix-context-tracking-for-el0_sp_pc.patch @@ -0,0 +1,69 @@ +From 46b0567c851cf85d6ba6f23eef385ec9111d09bc Mon Sep 17 00:00:00 2001 +From: Mark Rutland +Date: Mon, 15 Jun 2015 16:40:27 +0100 +Subject: arm64: entry: fix context tracking for el0_sp_pc + +From: Mark Rutland + +commit 46b0567c851cf85d6ba6f23eef385ec9111d09bc upstream. 
+ +Commit 6c81fe7925cc4c42 ("arm64: enable context tracking") did not +update el0_sp_pc to use ct_user_exit, but this appears to have been +unintentional. In commit 6ab6463aeb5fbc75 ("arm64: adjust el0_sync so +that a function can be called") we made x0 available, and in the return +to userspace we call ct_user_enter in the kernel_exit macro. + +Due to this, we currently don't correctly inform RCU of the user->kernel +transition, and may erroneously account for time spent in the kernel as +if we were in an extended quiescent state when CONFIG_CONTEXT_TRACKING +is enabled. + +As we do record the kernel->user transition, a userspace application +making accesses from an unaligned stack pointer can demonstrate the +imbalance, provoking the following warning: + +------------[ cut here ]------------ +WARNING: CPU: 2 PID: 3660 at kernel/context_tracking.c:75 context_tracking_enter+0xd8/0xe4() +Modules linked in: +CPU: 2 PID: 3660 Comm: a.out Not tainted 4.1.0-rc7+ #8 +Hardware name: ARM Juno development board (r0) (DT) +Call trace: +[] dump_backtrace+0x0/0x124 +[] show_stack+0x10/0x1c +[] dump_stack+0x84/0xc8 +[] warn_slowpath_common+0x98/0xd0 +[] warn_slowpath_null+0x14/0x20 +[] context_tracking_enter+0xd4/0xe4 +[] preempt_schedule_irq+0xd4/0x114 +[] el1_preempt+0x4/0x28 +[] exit_files+0x38/0x4c +[] do_exit+0x430/0x978 +[] do_group_exit+0x40/0xd4 +[] get_signal+0x23c/0x4f4 +[] do_signal+0x1ac/0x518 +[] do_notify_resume+0x5c/0x68 +---[ end trace 963c192600337066 ]--- + +This patch adds the missing ct_user_exit to the el0_sp_pc entry path, +correcting the context tracking for this case. + +Signed-off-by: Mark Rutland +Acked-by: Will Deacon +Fixes: 6c81fe7925cc ("arm64: enable context tracking") +Signed-off-by: Catalin Marinas +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm64/kernel/entry.S | 1 + + 1 file changed, 1 insertion(+) + +--- a/arch/arm64/kernel/entry.S ++++ b/arch/arm64/kernel/entry.S +@@ -517,6 +517,7 @@ el0_sp_pc: + mrs x26, far_el1 + // enable interrupts before calling the main handler + enable_dbg_and_irq ++ ct_user_exit + mov x0, x26 + mov x1, x25 + mov x2, sp diff --git a/queue-4.0/arm64-mm-fix-freeing-of-the-wrong-memmap-entries-with-sparsemem_vmemmap.patch b/queue-4.0/arm64-mm-fix-freeing-of-the-wrong-memmap-entries-with-sparsemem_vmemmap.patch new file mode 100644 index 00000000000..0585b060726 --- /dev/null +++ b/queue-4.0/arm64-mm-fix-freeing-of-the-wrong-memmap-entries-with-sparsemem_vmemmap.patch @@ -0,0 +1,49 @@ +From b9bcc919931611498e856eae9bf66337330d04cc Mon Sep 17 00:00:00 2001 +From: Dave P Martin +Date: Tue, 16 Jun 2015 17:38:47 +0100 +Subject: arm64: mm: Fix freeing of the wrong memmap entries with !SPARSEMEM_VMEMMAP + +From: Dave P Martin + +commit b9bcc919931611498e856eae9bf66337330d04cc upstream. + +The memmap freeing code in free_unused_memmap() computes the end of +each memblock by adding the memblock size onto the base. However, +if SPARSEMEM is enabled then the value (start) used for the base +may already have been rounded downwards to work out which memmap +entries to free after the previous memblock. + +This may cause memmap entries that are in use to get freed. + +In general, you're not likely to hit this problem unless there +are at least 2 memblocks and one of them is not aligned to a +sparsemem section boundary. Note that carve-outs can increase +the number of memblocks by splitting the regions listed in the +device tree. 
+ +This problem doesn't occur with SPARSEMEM_VMEMMAP, because the +vmemmap code deals with freeing the unused regions of the memmap +instead of requiring the arch code to do it. + +This patch gets the memblock base out of the memblock directly when +computing the block end address to ensure the correct value is used. + +Signed-off-by: Dave Martin +Signed-off-by: Catalin Marinas +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm64/mm/init.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/arm64/mm/init.c ++++ b/arch/arm64/mm/init.c +@@ -260,7 +260,7 @@ static void __init free_unused_memmap(vo + * memmap entries are valid from the bank end aligned to + * MAX_ORDER_NR_PAGES. + */ +- prev_end = ALIGN(start + __phys_to_pfn(reg->size), ++ prev_end = ALIGN(__phys_to_pfn(reg->base + reg->size), + MAX_ORDER_NR_PAGES); + } + diff --git a/queue-4.0/arm64-vdso-work-around-broken-elf-toolchains-in-makefile.patch b/queue-4.0/arm64-vdso-work-around-broken-elf-toolchains-in-makefile.patch new file mode 100644 index 00000000000..0257ff05662 --- /dev/null +++ b/queue-4.0/arm64-vdso-work-around-broken-elf-toolchains-in-makefile.patch @@ -0,0 +1,45 @@ +From 6f1a6ae87c0c60d7c462ef8fd071f291aa7a9abb Mon Sep 17 00:00:00 2001 +From: Will Deacon +Date: Fri, 19 Jun 2015 13:56:33 +0100 +Subject: arm64: vdso: work-around broken ELF toolchains in Makefile + +From: Will Deacon + +commit 6f1a6ae87c0c60d7c462ef8fd071f291aa7a9abb upstream. + +When building the kernel with a bare-metal (ELF) toolchain, the -shared +option may not be passed down to collect2, resulting in silent corruption +of the vDSO image (in particular, the DYNAMIC section is omitted). + +The effect of this corruption is that the dynamic linker fails to find +the vDSO symbols and libc is instead used for the syscalls that we +intended to optimise (e.g. gettimeofday). Functionally, there is no +issue as the sigreturn trampoline is still intact and located by the +kernel. + +This patch fixes the problem by explicitly passing -shared to the linker +when building the vDSO. + +Reported-by: Szabolcs Nagy +Reported-by: James Greenlaigh +Signed-off-by: Will Deacon +Signed-off-by: Catalin Marinas +Signed-off-by: Greg Kroah-Hartman + +--- + arch/arm64/kernel/vdso/Makefile | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/arch/arm64/kernel/vdso/Makefile ++++ b/arch/arm64/kernel/vdso/Makefile +@@ -15,6 +15,10 @@ ccflags-y := -shared -fno-common -fno-bu + ccflags-y += -nostdlib -Wl,-soname=linux-vdso.so.1 \ + $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) + ++# Workaround for bare-metal (ELF) toolchains that neglect to pass -shared ++# down to collect2, resulting in silent corruption of the vDSO image. ++ccflags-y += -Wl,-shared ++ + obj-y += vdso.o + extra-y += vdso.lds vdso-offsets.h + CPPFLAGS_vdso.lds += -P -C -U$(ARCH) diff --git a/queue-4.0/mei-me-wait-for-power-gating-exit-confirmation.patch b/queue-4.0/mei-me-wait-for-power-gating-exit-confirmation.patch new file mode 100644 index 00000000000..1c9a5e75486 --- /dev/null +++ b/queue-4.0/mei-me-wait-for-power-gating-exit-confirmation.patch @@ -0,0 +1,232 @@ +From 3dc196eae1db548f05e53e5875ff87b8ff79f249 Mon Sep 17 00:00:00 2001 +From: Alexander Usyskin +Date: Sat, 13 Jun 2015 08:51:17 +0300 +Subject: mei: me: wait for power gating exit confirmation + +From: Alexander Usyskin + +commit 3dc196eae1db548f05e53e5875ff87b8ff79f249 upstream. + +Fix the hbm power gating state machine so it will wait till it receives +confirmation interrupt for the PG_ISOLATION_EXIT message. 
+ +In process of the suspend flow the devices first have to exit from the +power gating state (runtime pm resume). +If we do not handle the confirmation interrupt after sending +PG_ISOLATION_EXIT message, we may receive it already after the suspend +flow has changed the device state and interrupt will be interpreted as a +spurious event, consequently link reset will be invoked which will +prevent the device from completing the suspend flow + +kernel: [6603] mei_reset:136: mei_me 0000:00:16.0: powering down: end of reset +kernel: [476] mei_me_irq_thread_handler:643: mei_me 0000:00:16.0: function called after ISR to handle the interrupt processing. +kernel: mei_me 0000:00:16.0: FW not ready: resetting + +Cc: Gabriele Mazzotta +Link: https://bugzilla.kernel.org/show_bug.cgi?id=86241 +Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=770397 +Tested-by: Gabriele Mazzotta +Signed-off-by: Alexander Usyskin +Signed-off-by: Tomas Winkler +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/misc/mei/client.c | 2 - + drivers/misc/mei/hw-me.c | 59 +++++++++++++++++++++++++++++++++++++++++---- + drivers/misc/mei/hw-txe.c | 13 +++++++++ + drivers/misc/mei/mei_dev.h | 11 ++++++++ + 4 files changed, 80 insertions(+), 5 deletions(-) + +--- a/drivers/misc/mei/client.c ++++ b/drivers/misc/mei/client.c +@@ -573,7 +573,7 @@ void mei_host_client_init(struct work_st + bool mei_hbuf_acquire(struct mei_device *dev) + { + if (mei_pg_state(dev) == MEI_PG_ON || +- dev->pg_event == MEI_PG_EVENT_WAIT) { ++ mei_pg_in_transition(dev)) { + dev_dbg(dev->dev, "device is in pg\n"); + return false; + } +--- a/drivers/misc/mei/hw-me.c ++++ b/drivers/misc/mei/hw-me.c +@@ -629,11 +629,27 @@ int mei_me_pg_unset_sync(struct mei_devi + mutex_lock(&dev->device_lock); + + reply: +- if (dev->pg_event == MEI_PG_EVENT_RECEIVED) +- ret = mei_hbm_pg(dev, MEI_PG_ISOLATION_EXIT_RES_CMD); ++ if (dev->pg_event != MEI_PG_EVENT_RECEIVED) { ++ ret = -ETIME; ++ goto out; ++ } ++ ++ dev->pg_event = MEI_PG_EVENT_INTR_WAIT; ++ ret = mei_hbm_pg(dev, MEI_PG_ISOLATION_EXIT_RES_CMD); ++ if (ret) ++ return ret; ++ ++ mutex_unlock(&dev->device_lock); ++ wait_event_timeout(dev->wait_pg, ++ dev->pg_event == MEI_PG_EVENT_INTR_RECEIVED, timeout); ++ mutex_lock(&dev->device_lock); ++ ++ if (dev->pg_event == MEI_PG_EVENT_INTR_RECEIVED) ++ ret = 0; + else + ret = -ETIME; + ++out: + dev->pg_event = MEI_PG_EVENT_IDLE; + hw->pg_state = MEI_PG_OFF; + +@@ -641,6 +657,19 @@ reply: + } + + /** ++ * mei_me_pg_in_transition - is device now in pg transition ++ * ++ * @dev: the device structure ++ * ++ * Return: true if in pg transition, false otherwise ++ */ ++static bool mei_me_pg_in_transition(struct mei_device *dev) ++{ ++ return dev->pg_event >= MEI_PG_EVENT_WAIT && ++ dev->pg_event <= MEI_PG_EVENT_INTR_WAIT; ++} ++ ++/** + * mei_me_pg_is_enabled - detect if PG is supported by HW + * + * @dev: the device structure +@@ -672,6 +701,24 @@ notsupported: + } + + /** ++ * mei_me_pg_intr - perform pg processing in interrupt thread handler ++ * ++ * @dev: the device structure ++ */ ++static void mei_me_pg_intr(struct mei_device *dev) ++{ ++ struct mei_me_hw *hw = to_me_hw(dev); ++ ++ if (dev->pg_event != MEI_PG_EVENT_INTR_WAIT) ++ return; ++ ++ dev->pg_event = MEI_PG_EVENT_INTR_RECEIVED; ++ hw->pg_state = MEI_PG_OFF; ++ if (waitqueue_active(&dev->wait_pg)) ++ wake_up(&dev->wait_pg); ++} ++ ++/** + * mei_me_irq_quick_handler - The ISR of the MEI device + * + * @irq: The irq number +@@ -729,6 +776,8 @@ irqreturn_t mei_me_irq_thread_handler(in + goto end; + } + ++ 
mei_me_pg_intr(dev); ++ + /* check if we need to start the dev */ + if (!mei_host_is_ready(dev)) { + if (mei_hw_is_ready(dev)) { +@@ -765,9 +814,10 @@ irqreturn_t mei_me_irq_thread_handler(in + /* + * During PG handshake only allowed write is the replay to the + * PG exit message, so block calling write function +- * if the pg state is not idle ++ * if the pg event is in PG handshake + */ +- if (dev->pg_event == MEI_PG_EVENT_IDLE) { ++ if (dev->pg_event != MEI_PG_EVENT_WAIT && ++ dev->pg_event != MEI_PG_EVENT_RECEIVED) { + rets = mei_irq_write_handler(dev, &complete_list); + dev->hbuf_is_ready = mei_hbuf_is_ready(dev); + } +@@ -792,6 +842,7 @@ static const struct mei_hw_ops mei_me_hw + .hw_config = mei_me_hw_config, + .hw_start = mei_me_hw_start, + ++ .pg_in_transition = mei_me_pg_in_transition, + .pg_is_enabled = mei_me_pg_is_enabled, + + .intr_clear = mei_me_intr_clear, +--- a/drivers/misc/mei/hw-txe.c ++++ b/drivers/misc/mei/hw-txe.c +@@ -302,6 +302,18 @@ int mei_txe_aliveness_set_sync(struct me + } + + /** ++ * mei_txe_pg_in_transition - is device now in pg transition ++ * ++ * @dev: the device structure ++ * ++ * Return: true if in pg transition, false otherwise ++ */ ++static bool mei_txe_pg_in_transition(struct mei_device *dev) ++{ ++ return dev->pg_event == MEI_PG_EVENT_WAIT; ++} ++ ++/** + * mei_txe_pg_is_enabled - detect if PG is supported by HW + * + * @dev: the device structure +@@ -1138,6 +1150,7 @@ static const struct mei_hw_ops mei_txe_h + .hw_config = mei_txe_hw_config, + .hw_start = mei_txe_hw_start, + ++ .pg_in_transition = mei_txe_pg_in_transition, + .pg_is_enabled = mei_txe_pg_is_enabled, + + .intr_clear = mei_txe_intr_clear, +--- a/drivers/misc/mei/mei_dev.h ++++ b/drivers/misc/mei/mei_dev.h +@@ -269,6 +269,7 @@ struct mei_cl { + + * @fw_status : get fw status registers + * @pg_state : power gating state of the device ++ * @pg_in_transition : is device now in pg transition + * @pg_is_enabled : is power gating enabled + + * @intr_clear : clear pending interrupts +@@ -298,6 +299,7 @@ struct mei_hw_ops { + + int (*fw_status)(struct mei_device *dev, struct mei_fw_status *fw_sts); + enum mei_pg_state (*pg_state)(struct mei_device *dev); ++ bool (*pg_in_transition)(struct mei_device *dev); + bool (*pg_is_enabled)(struct mei_device *dev); + + void (*intr_clear)(struct mei_device *dev); +@@ -396,11 +398,15 @@ struct mei_cl_device { + * @MEI_PG_EVENT_IDLE: the driver is not in power gating transition + * @MEI_PG_EVENT_WAIT: the driver is waiting for a pg event to complete + * @MEI_PG_EVENT_RECEIVED: the driver received pg event ++ * @MEI_PG_EVENT_INTR_WAIT: the driver is waiting for a pg event interrupt ++ * @MEI_PG_EVENT_INTR_RECEIVED: the driver received pg event interrupt + */ + enum mei_pg_event { + MEI_PG_EVENT_IDLE, + MEI_PG_EVENT_WAIT, + MEI_PG_EVENT_RECEIVED, ++ MEI_PG_EVENT_INTR_WAIT, ++ MEI_PG_EVENT_INTR_RECEIVED, + }; + + /** +@@ -727,6 +733,11 @@ static inline enum mei_pg_state mei_pg_s + return dev->ops->pg_state(dev); + } + ++static inline bool mei_pg_in_transition(struct mei_device *dev) ++{ ++ return dev->ops->pg_in_transition(dev); ++} ++ + static inline bool mei_pg_is_enabled(struct mei_device *dev) + { + return dev->ops->pg_is_enabled(dev); diff --git a/queue-4.0/mei-txe-reduce-suspend-resume-time.patch b/queue-4.0/mei-txe-reduce-suspend-resume-time.patch new file mode 100644 index 00000000000..d98a1ed4653 --- /dev/null +++ b/queue-4.0/mei-txe-reduce-suspend-resume-time.patch @@ -0,0 +1,67 @@ +From fe292283c23329218e384bffc6cb4bfa3fd92277 Mon Sep 17 00:00:00 
2001 +From: Tomas Winkler +Date: Tue, 14 Apr 2015 10:27:26 +0300 +Subject: mei: txe: reduce suspend/resume time + +From: Tomas Winkler + +commit fe292283c23329218e384bffc6cb4bfa3fd92277 upstream. + +HW has to be in known state before the initialisation +sequence is started. The polling step for settling aliveness +was set to 200ms while in practise this can be done in up to 30msecs. + +Signed-off-by: Tomas Winkler +Signed-off-by: Barak Yoresh +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/misc/mei/hw-txe.c | 20 ++++++++++---------- + 1 file changed, 10 insertions(+), 10 deletions(-) + +--- a/drivers/misc/mei/hw-txe.c ++++ b/drivers/misc/mei/hw-txe.c +@@ -16,6 +16,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -218,26 +219,25 @@ static u32 mei_txe_aliveness_get(struct + * + * Polls for HICR_HOST_ALIVENESS_RESP.ALIVENESS_RESP to be set + * +- * Return: > 0 if the expected value was received, -ETIME otherwise ++ * Return: 0 if the expected value was received, -ETIME otherwise + */ + static int mei_txe_aliveness_poll(struct mei_device *dev, u32 expected) + { + struct mei_txe_hw *hw = to_txe_hw(dev); +- int t = 0; ++ ktime_t stop, start; + ++ start = ktime_get(); ++ stop = ktime_add(start, ms_to_ktime(SEC_ALIVENESS_WAIT_TIMEOUT)); + do { + hw->aliveness = mei_txe_aliveness_get(dev); + if (hw->aliveness == expected) { + dev->pg_event = MEI_PG_EVENT_IDLE; +- dev_dbg(dev->dev, +- "aliveness settled after %d msecs\n", t); +- return t; ++ dev_dbg(dev->dev, "aliveness settled after %lld usecs\n", ++ ktime_to_us(ktime_sub(ktime_get(), start))); ++ return 0; + } +- mutex_unlock(&dev->device_lock); +- msleep(MSEC_PER_SEC / 5); +- mutex_lock(&dev->device_lock); +- t += MSEC_PER_SEC / 5; +- } while (t < SEC_ALIVENESS_WAIT_TIMEOUT); ++ usleep_range(20, 50); ++ } while (ktime_before(ktime_get(), stop)); + + dev->pg_event = MEI_PG_EVENT_IDLE; + dev_err(dev->dev, "aliveness timed out\n"); diff --git a/queue-4.0/mm-kmemleak-allow-safe-memory-scanning-during-kmemleak-disabling.patch b/queue-4.0/mm-kmemleak-allow-safe-memory-scanning-during-kmemleak-disabling.patch new file mode 100644 index 00000000000..a3c810c6bb7 --- /dev/null +++ b/queue-4.0/mm-kmemleak-allow-safe-memory-scanning-during-kmemleak-disabling.patch @@ -0,0 +1,105 @@ +From c5f3b1a51a591c18c8b33983908e7fdda6ae417e Mon Sep 17 00:00:00 2001 +From: Catalin Marinas +Date: Wed, 24 Jun 2015 16:58:26 -0700 +Subject: mm: kmemleak: allow safe memory scanning during kmemleak disabling + +From: Catalin Marinas + +commit c5f3b1a51a591c18c8b33983908e7fdda6ae417e upstream. + +The kmemleak scanning thread can run for minutes. Callbacks like +kmemleak_free() are allowed during this time, the race being taken care +of by the object->lock spinlock. Such lock also prevents a memory block +from being freed or unmapped while it is being scanned by blocking the +kmemleak_free() -> ... -> __delete_object() function until the lock is +released in scan_object(). + +When a kmemleak error occurs (e.g. it fails to allocate its metadata), +kmemleak_enabled is set and __delete_object() is no longer called on +freed objects. If kmemleak_scan is running at the same time, +kmemleak_free() no longer waits for the object scanning to complete, +allowing the corresponding memory block to be freed or unmapped (in the +case of vfree()). This leads to kmemleak_scan potentially triggering a +page fault. 
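+
+The fix, shown in full in the diff below, boils down to a second enable
+flag and a strict ordering in the cleanup path; condensed (error and
+boot-time paths elided):
+
+  /* kmemleak_do_cleanup() */
+  stop_scan_thread();
+  /*
+   * Only after the scan thread has stopped is it safe to stop tracking
+   * frees; kthread_stop() orders this store after the scanner's last
+   * object access.
+   */
+  kmemleak_free_enabled = 0;
+
+  /* kmemleak_free() is now gated on the new flag, not kmemleak_enabled */
+  if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
+          delete_object_full((unsigned long)ptr);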
+ +This patch separates the kmemleak_free() enabling/disabling from the +overall kmemleak_enabled nob so that we can defer the disabling of the +object freeing tracking until the scanning thread completed. The +kmemleak_free_part() is deliberately ignored by this patch since this is +only called during boot before the scanning thread started. + +Signed-off-by: Catalin Marinas +Reported-by: Vignesh Radhakrishnan +Tested-by: Vignesh Radhakrishnan +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/kmemleak.c | 19 ++++++++++++++++--- + 1 file changed, 16 insertions(+), 3 deletions(-) + +--- a/mm/kmemleak.c ++++ b/mm/kmemleak.c +@@ -195,6 +195,8 @@ static struct kmem_cache *scan_area_cach + + /* set if tracing memory operations is enabled */ + static int kmemleak_enabled; ++/* same as above but only for the kmemleak_free() callback */ ++static int kmemleak_free_enabled; + /* set in the late_initcall if there were no errors */ + static int kmemleak_initialized; + /* enables or disables early logging of the memory operations */ +@@ -942,7 +944,7 @@ void __ref kmemleak_free(const void *ptr + { + pr_debug("%s(0x%p)\n", __func__, ptr); + +- if (kmemleak_enabled && ptr && !IS_ERR(ptr)) ++ if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) + delete_object_full((unsigned long)ptr); + else if (kmemleak_early_log) + log_early(KMEMLEAK_FREE, ptr, 0, 0); +@@ -982,7 +984,7 @@ void __ref kmemleak_free_percpu(const vo + + pr_debug("%s(0x%p)\n", __func__, ptr); + +- if (kmemleak_enabled && ptr && !IS_ERR(ptr)) ++ if (kmemleak_free_enabled && ptr && !IS_ERR(ptr)) + for_each_possible_cpu(cpu) + delete_object_full((unsigned long)per_cpu_ptr(ptr, + cpu)); +@@ -1750,6 +1752,13 @@ static void kmemleak_do_cleanup(struct w + mutex_lock(&scan_mutex); + stop_scan_thread(); + ++ /* ++ * Once the scan thread has stopped, it is safe to no longer track ++ * object freeing. Ordering of the scan thread stopping and the memory ++ * accesses below is guaranteed by the kthread_stop() function. ++ */ ++ kmemleak_free_enabled = 0; ++ + if (!kmemleak_found_leaks) + __kmemleak_do_cleanup(); + else +@@ -1776,6 +1785,8 @@ static void kmemleak_disable(void) + /* check whether it is too early for a kernel thread */ + if (kmemleak_initialized) + schedule_work(&cleanup_work); ++ else ++ kmemleak_free_enabled = 0; + + pr_info("Kernel memory leak detector disabled\n"); + } +@@ -1840,8 +1851,10 @@ void __init kmemleak_init(void) + if (kmemleak_error) { + local_irq_restore(flags); + return; +- } else ++ } else { + kmemleak_enabled = 1; ++ kmemleak_free_enabled = 1; ++ } + local_irq_restore(flags); + + /* diff --git a/queue-4.0/mm-kmemleak_alloc_percpu-should-follow-the-gfp-from-per_alloc.patch b/queue-4.0/mm-kmemleak_alloc_percpu-should-follow-the-gfp-from-per_alloc.patch new file mode 100644 index 00000000000..688294ece1c --- /dev/null +++ b/queue-4.0/mm-kmemleak_alloc_percpu-should-follow-the-gfp-from-per_alloc.patch @@ -0,0 +1,122 @@ +From 8a8c35fadfaf55629a37ef1a8ead1b8fb32581d2 Mon Sep 17 00:00:00 2001 +From: Larry Finger +Date: Wed, 24 Jun 2015 16:58:51 -0700 +Subject: mm: kmemleak_alloc_percpu() should follow the gfp from per_alloc() + +From: Larry Finger + +commit 8a8c35fadfaf55629a37ef1a8ead1b8fb32581d2 upstream. + +Beginning at commit d52d3997f843 ("ipv6: Create percpu rt6_info"), the +following INFO splat is logged: + + =============================== + [ INFO: suspicious RCU usage. 
] + 4.1.0-rc7-next-20150612 #1 Not tainted + ------------------------------- + kernel/sched/core.c:7318 Illegal context switch in RCU-bh read-side critical section! + other info that might help us debug this: + rcu_scheduler_active = 1, debug_locks = 0 + 3 locks held by systemd/1: + #0: (rtnl_mutex){+.+.+.}, at: [] rtnetlink_rcv+0x1f/0x40 + #1: (rcu_read_lock_bh){......}, at: [] ipv6_add_addr+0x62/0x540 + #2: (addrconf_hash_lock){+...+.}, at: [] ipv6_add_addr+0x184/0x540 + stack backtrace: + CPU: 0 PID: 1 Comm: systemd Not tainted 4.1.0-rc7-next-20150612 #1 + Hardware name: TOSHIBA TECRA A50-A/TECRA A50-A, BIOS Version 4.20 04/17/2014 + Call Trace: + dump_stack+0x4c/0x6e + lockdep_rcu_suspicious+0xe7/0x120 + ___might_sleep+0x1d5/0x1f0 + __might_sleep+0x4d/0x90 + kmem_cache_alloc+0x47/0x250 + create_object+0x39/0x2e0 + kmemleak_alloc_percpu+0x61/0xe0 + pcpu_alloc+0x370/0x630 + +Additional backtrace lines are truncated. In addition, the above splat +is followed by several "BUG: sleeping function called from invalid +context at mm/slub.c:1268" outputs. As suggested by Martin KaFai Lau, +these are the clue to the fix. Routine kmemleak_alloc_percpu() always +uses GFP_KERNEL for its allocations, whereas it should follow the gfp +from its callers. + +Reviewed-by: Catalin Marinas +Reviewed-by: Kamalesh Babulal +Acked-by: Martin KaFai Lau +Signed-off-by: Larry Finger +Cc: Martin KaFai Lau +Cc: Catalin Marinas +Cc: Tejun Heo +Cc: Christoph Lameter +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/kmemleak.h | 6 ++++-- + mm/kmemleak.c | 9 +++++---- + mm/percpu.c | 2 +- + 3 files changed, 10 insertions(+), 7 deletions(-) + +--- a/include/linux/kmemleak.h ++++ b/include/linux/kmemleak.h +@@ -28,7 +28,8 @@ + extern void kmemleak_init(void) __ref; + extern void kmemleak_alloc(const void *ptr, size_t size, int min_count, + gfp_t gfp) __ref; +-extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) __ref; ++extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, ++ gfp_t gfp) __ref; + extern void kmemleak_free(const void *ptr) __ref; + extern void kmemleak_free_part(const void *ptr, size_t size) __ref; + extern void kmemleak_free_percpu(const void __percpu *ptr) __ref; +@@ -71,7 +72,8 @@ static inline void kmemleak_alloc_recurs + gfp_t gfp) + { + } +-static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) ++static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, ++ gfp_t gfp) + { + } + static inline void kmemleak_free(const void *ptr) +--- a/mm/kmemleak.c ++++ b/mm/kmemleak.c +@@ -909,12 +909,13 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc); + * kmemleak_alloc_percpu - register a newly allocated __percpu object + * @ptr: __percpu pointer to beginning of the object + * @size: size of the object ++ * @gfp: flags used for kmemleak internal memory allocations + * + * This function is called from the kernel percpu allocator when a new object +- * (memory block) is allocated (alloc_percpu). It assumes GFP_KERNEL +- * allocation. ++ * (memory block) is allocated (alloc_percpu). 
+ */ +-void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) ++void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size, ++ gfp_t gfp) + { + unsigned int cpu; + +@@ -927,7 +928,7 @@ void __ref kmemleak_alloc_percpu(const v + if (kmemleak_enabled && ptr && !IS_ERR(ptr)) + for_each_possible_cpu(cpu) + create_object((unsigned long)per_cpu_ptr(ptr, cpu), +- size, 0, GFP_KERNEL); ++ size, 0, gfp); + else if (kmemleak_early_log) + log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0); + } +--- a/mm/percpu.c ++++ b/mm/percpu.c +@@ -1030,7 +1030,7 @@ area_found: + memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size); + + ptr = __addr_to_pcpu_ptr(chunk->base_addr + off); +- kmemleak_alloc_percpu(ptr, size); ++ kmemleak_alloc_percpu(ptr, size, gfp); + return ptr; + + fail_unlock: diff --git a/queue-4.0/mm-thp-respect-mpol_preferred-policy-with-non-local-node.patch b/queue-4.0/mm-thp-respect-mpol_preferred-policy-with-non-local-node.patch new file mode 100644 index 00000000000..8c53ff549bd --- /dev/null +++ b/queue-4.0/mm-thp-respect-mpol_preferred-policy-with-non-local-node.patch @@ -0,0 +1,156 @@ +From 0867a57c4f80a566dda1bac975b42fcd857cb489 Mon Sep 17 00:00:00 2001 +From: Vlastimil Babka +Date: Wed, 24 Jun 2015 16:58:48 -0700 +Subject: mm, thp: respect MPOL_PREFERRED policy with non-local node + +From: Vlastimil Babka + +commit 0867a57c4f80a566dda1bac975b42fcd857cb489 upstream. + +Since commit 077fcf116c8c ("mm/thp: allocate transparent hugepages on +local node"), we handle THP allocations on page fault in a special way - +for non-interleave memory policies, the allocation is only attempted on +the node local to the current CPU, if the policy's nodemask allows the +node. + +This is motivated by the assumption that THP benefits cannot offset the +cost of remote accesses, so it's better to fallback to base pages on the +local node (which might still be available, while huge pages are not due +to fragmentation) than to allocate huge pages on a remote node. + +The nodemask check prevents us from violating e.g. MPOL_BIND policies +where the local node is not among the allowed nodes. However, the +current implementation can still give surprising results for the +MPOL_PREFERRED policy when the preferred node is different than the +current CPU's local node. + +In such case we should honor the preferred node and not use the local +node, which is what this patch does. If hugepage allocation on the +preferred node fails, we fall back to base pages and don't try other +nodes, with the same motivation as is done for the local node hugepage +allocations. The patch also moves the MPOL_INTERLEAVE check around to +simplify the hugepage specific test. + +The difference can be demonstrated using in-tree transhuge-stress test +on the following 2-node machine where half memory on one node was +occupied to show the difference. 
+ +> numactl --hardware +available: 2 nodes (0-1) +node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 24 25 26 27 28 29 30 31 32 33 34 35 +node 0 size: 7878 MB +node 0 free: 3623 MB +node 1 cpus: 12 13 14 15 16 17 18 19 20 21 22 23 36 37 38 39 40 41 42 43 44 45 46 47 +node 1 size: 8045 MB +node 1 free: 7818 MB +node distances: +node 0 1 + 0: 10 21 + 1: 21 10 + +Before the patch: +> numactl -p0 -C0 ./transhuge-stress +transhuge-stress: 2.197 s/loop, 0.276 ms/page, 7249.168 MiB/s 7962 succeed, 0 failed, 1786 different pages + +> numactl -p0 -C12 ./transhuge-stress +transhuge-stress: 2.962 s/loop, 0.372 ms/page, 5376.172 MiB/s 7962 succeed, 0 failed, 3873 different pages + +Number of successful THP allocations corresponds to free memory on node 0 in +the first case and node 1 in the second case, i.e. -p parameter is ignored and +cpu binding "wins". + +After the patch: +> numactl -p0 -C0 ./transhuge-stress +transhuge-stress: 2.183 s/loop, 0.274 ms/page, 7295.516 MiB/s 7962 succeed, 0 failed, 1760 different pages + +> numactl -p0 -C12 ./transhuge-stress +transhuge-stress: 2.878 s/loop, 0.361 ms/page, 5533.638 MiB/s 7962 succeed, 0 failed, 1750 different pages + +> numactl -p1 -C0 ./transhuge-stress +transhuge-stress: 4.628 s/loop, 0.581 ms/page, 3440.893 MiB/s 7962 succeed, 0 failed, 3918 different pages + +The -p parameter is respected regardless of cpu binding. + +> numactl -C0 ./transhuge-stress +transhuge-stress: 2.202 s/loop, 0.277 ms/page, 7230.003 MiB/s 7962 succeed, 0 failed, 1750 different pages + +> numactl -C12 ./transhuge-stress +transhuge-stress: 3.020 s/loop, 0.379 ms/page, 5273.324 MiB/s 7962 succeed, 0 failed, 3916 different pages + +Without -p parameter, hugepage restriction to CPU-local node works as before. + +Fixes: 077fcf116c8c ("mm/thp: allocate transparent hugepages on local node") +Signed-off-by: Vlastimil Babka +Cc: Aneesh Kumar K.V +Acked-by: David Rientjes +Cc: Kirill A. Shutemov +Cc: Andrea Arcangeli +Cc: Michal Hocko +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/mempolicy.c | 38 ++++++++++++++++++++++---------------- + 1 file changed, 22 insertions(+), 16 deletions(-) + +--- a/mm/mempolicy.c ++++ b/mm/mempolicy.c +@@ -1971,35 +1971,41 @@ retry_cpuset: + pol = get_vma_policy(vma, addr); + cpuset_mems_cookie = read_mems_allowed_begin(); + +- if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage && +- pol->mode != MPOL_INTERLEAVE)) { ++ if (pol->mode == MPOL_INTERLEAVE) { ++ unsigned nid; ++ ++ nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); ++ mpol_cond_put(pol); ++ page = alloc_page_interleave(gfp, order, nid); ++ goto out; ++ } ++ ++ if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) { ++ int hpage_node = node; ++ + /* + * For hugepage allocation and non-interleave policy which +- * allows the current node, we only try to allocate from the +- * current node and don't fall back to other nodes, as the +- * cost of remote accesses would likely offset THP benefits. ++ * allows the current node (or other explicitly preferred ++ * node) we only try to allocate from the current/preferred ++ * node and don't fall back to other nodes, as the cost of ++ * remote accesses would likely offset THP benefits. + * + * If the policy is interleave, or does not allow the current + * node in its nodemask, we allocate the standard way. 
+ */ ++ if (pol->mode == MPOL_PREFERRED && ++ !(pol->flags & MPOL_F_LOCAL)) ++ hpage_node = pol->v.preferred_node; ++ + nmask = policy_nodemask(gfp, pol); +- if (!nmask || node_isset(node, *nmask)) { ++ if (!nmask || node_isset(hpage_node, *nmask)) { + mpol_cond_put(pol); +- page = alloc_pages_exact_node(node, ++ page = alloc_pages_exact_node(hpage_node, + gfp | __GFP_THISNODE, order); + goto out; + } + } + +- if (pol->mode == MPOL_INTERLEAVE) { +- unsigned nid; +- +- nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order); +- mpol_cond_put(pol); +- page = alloc_page_interleave(gfp, order, nid); +- goto out; +- } +- + nmask = policy_nodemask(gfp, pol); + zl = policy_zonelist(gfp, pol, node); + mpol_cond_put(pol); diff --git a/queue-4.0/series b/queue-4.0/series index f1e0d424d10..a76a213ca06 100644 --- a/queue-4.0/series +++ b/queue-4.0/series @@ -20,3 +20,14 @@ acpi-init-switch-over-platform-to-the-acpi-mode-later.patch acpi-pm-add-missing-pm_generic_complete-invocation.patch iio-accel-kxcjk-1013-add-the-kxcj9000-acpi-id.patch tools-selftests-fix-clean-target-with-make-3.81.patch +arc-add-smp-barriers-around-atomics-per-documentation-atomic_ops.txt.patch +arc-add-compiler-barrier-to-llsc-based-cmpxchg.patch +mei-me-wait-for-power-gating-exit-confirmation.patch +mei-txe-reduce-suspend-resume-time.patch +arm64-do-not-attempt-to-use-init_mm-in-reset_context.patch +arm64-entry-fix-context-tracking-for-el0_sp_pc.patch +arm64-mm-fix-freeing-of-the-wrong-memmap-entries-with-sparsemem_vmemmap.patch +arm64-vdso-work-around-broken-elf-toolchains-in-makefile.patch +mm-kmemleak-allow-safe-memory-scanning-during-kmemleak-disabling.patch +mm-kmemleak_alloc_percpu-should-follow-the-gfp-from-per_alloc.patch +mm-thp-respect-mpol_preferred-policy-with-non-local-node.patch