--- /dev/null
+From d57f727264f1425a94689bafc7e99e502cb135b5 Mon Sep 17 00:00:00 2001
+From: Vineet Gupta <vgupta@synopsys.com>
+Date: Thu, 13 Nov 2014 15:54:01 +0530
+Subject: ARC: add compiler barrier to LLSC based cmpxchg
+
+From: Vineet Gupta <vgupta@synopsys.com>
+
+commit d57f727264f1425a94689bafc7e99e502cb135b5 upstream.
+
+When auditing cmpxchg call sites, Chuck noted that gcc was optimizing
+away some of the desired LDs.
+
+| do {
+| new = old = *ipi_data_ptr;
+| new |= 1U << msg;
+| } while (cmpxchg(ipi_data_ptr, old, new) != old);
+
+was generating the code below
+
+| 8015cef8: ld r2,[r4,0] <-- First LD
+| 8015cefc: bset r1,r2,r1
+|
+| 8015cf00: llock r3,[r4] <-- atomic op
+| 8015cf04: brne r3,r2,8015cf10
+| 8015cf08: scond r1,[r4]
+| 8015cf0c: bnz 8015cf00
+|
+| 8015cf10: brne r3,r2,8015cf00 <-- Branch doesn't go to orig LD
+
+Although this was fixed by adding an ACCESS_ONCE() at this call site, it
+seems safer (for now at least) to add a compiler barrier to the LLSC based
+cmpxchg.
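+
+For background (not part of the upstream change itself), the effect of
+the new "memory" clobber can be seen with a generic sketch; it is the
+clobber, not the asm statement itself, that acts as a compiler barrier
+and forces gcc to re-read memory:
+
+| /* illustrative sketch only, not the patched kernel code */
+| unsigned int without_clobber(unsigned int *p)
+| {
+| 	unsigned int a = *p;
+|
+| 	asm volatile("" : :);		/* no "memory" clobber: gcc may
+| 					 * assume *p is unchanged and
+| 					 * reuse 'a' for the next read */
+| 	return a + *p;
+| }
+|
+| unsigned int with_clobber(unsigned int *p)
+| {
+| 	unsigned int a = *p;
+|
+| 	asm volatile("" ::: "memory");	/* compiler barrier */
+| 	return a + *p;			/* *p must be reloaded here */
+| }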
+
+Reported-by: Chuck Jordan <cjordan@synopsys.com>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arc/include/asm/cmpxchg.h | 9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+--- a/arch/arc/include/asm/cmpxchg.h
++++ b/arch/arc/include/asm/cmpxchg.h
+@@ -33,10 +33,11 @@ __cmpxchg(volatile void *ptr, unsigned l
+ " scond %3, [%1] \n"
+ " bnz 1b \n"
+ "2: \n"
+- : "=&r"(prev)
+- : "r"(ptr), "ir"(expected),
+- "r"(new) /* can't be "ir". scond can't take limm for "b" */
+- : "cc");
++ : "=&r"(prev) /* Early clobber, to prevent reg reuse */
++ : "r"(ptr), /* Not "m": llock only supports reg direct addr mode */
++ "ir"(expected),
++ "r"(new) /* can't be "ir". scond can't take LIMM for "b" */
++ : "cc", "memory"); /* so that gcc knows memory is being written here */
+
+ smp_mb();
+
--- /dev/null
+From 2576c28e3f623ed401db7e6197241865328620ef Mon Sep 17 00:00:00 2001
+From: Vineet Gupta <vgupta@synopsys.com>
+Date: Thu, 20 Nov 2014 15:42:09 +0530
+Subject: ARC: add smp barriers around atomics per Documentation/atomic_ops.txt
+
+From: Vineet Gupta <vgupta@synopsys.com>
+
+commit 2576c28e3f623ed401db7e6197241865328620ef upstream.
+
+ - arch_spin_lock/unlock were lacking the ACQUIRE/RELEASE barriers.
+   Since ARCv2 only provides load/load, store/store and all/all, we need
+   the full barrier.
+
+ - LLOCK/SCOND based atomics, bitops and cmpxchg, which return modified
+   values, were lacking the explicit smp barriers (see the sketch below).
+
+ - Non LLOCK/SCOND variants don't need the explicit barriers since that
+   is implicitly provided by the spin locks used to implement the
+   critical section (the spin lock barriers in turn are also fixed in
+   this commit as explained above).
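+
+The rule being implemented (per Documentation/atomic_ops.txt) can be
+sketched as follows; this is illustrative code, not part of the patch.
+An atomic operation that returns the modified value must behave as a
+full memory barrier on both sides, so the message-passing pattern below
+is ordered once the explicit smp_mb() calls are in place:
+
+| static atomic_t seq = ATOMIC_INIT(0);
+| static int data;
+|
+| static void writer(void)		/* runs on CPU 0 */
+| {
+| 	WRITE_ONCE(data, 1);
+| 	atomic_inc_return(&seq);	/* must imply smp_mb() before/after */
+| }
+|
+| static void reader(void)		/* runs on CPU 1 */
+| {
+| 	if (atomic_read(&seq) != 0) {
+| 		smp_rmb();
+| 		BUG_ON(READ_ONCE(data) != 1);	/* must see writer's store */
+| 	}
+| }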
+
+Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Vineet Gupta <vgupta@synopsys.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arc/include/asm/atomic.h | 21 +++++++++++++++++++++
+ arch/arc/include/asm/bitops.h | 19 +++++++++++++++++++
+ arch/arc/include/asm/cmpxchg.h | 17 +++++++++++++++++
+ arch/arc/include/asm/spinlock.h | 32 ++++++++++++++++++++++++++++++++
+ 4 files changed, 89 insertions(+)
+
+--- a/arch/arc/include/asm/atomic.h
++++ b/arch/arc/include/asm/atomic.h
+@@ -43,6 +43,12 @@ static inline int atomic_##op##_return(i
+ { \
+ unsigned int temp; \
+ \
++ /* \
++ * Explicit full memory barrier needed before/after as \
++ * LLOCK/SCOND thmeselves don't provide any such semantics \
++ */ \
++ smp_mb(); \
++ \
+ __asm__ __volatile__( \
+ "1: llock %0, [%1] \n" \
+ " " #asm_op " %0, %0, %2 \n" \
+@@ -52,6 +58,8 @@ static inline int atomic_##op##_return(i
+ : "r"(&v->counter), "ir"(i) \
+ : "cc"); \
+ \
++ smp_mb(); \
++ \
+ return temp; \
+ }
+
+@@ -105,6 +113,9 @@ static inline int atomic_##op##_return(i
+ unsigned long flags; \
+ unsigned long temp; \
+ \
++ /* \
++ * spin lock/unlock provides the needed smp_mb() before/after \
++ */ \
+ atomic_ops_lock(flags); \
+ temp = v->counter; \
+ temp c_op i; \
+@@ -142,9 +153,19 @@ ATOMIC_OP(and, &=, and)
+ #define __atomic_add_unless(v, a, u) \
+ ({ \
+ int c, old; \
++ \
++ /* \
++ * Explicit full memory barrier needed before/after as \
++ * LLOCK/SCOND thmeselves don't provide any such semantics \
++ */ \
++ smp_mb(); \
++ \
+ c = atomic_read(v); \
+ while (c != (u) && (old = atomic_cmpxchg((v), c, c + (a))) != c)\
+ c = old; \
++ \
++ smp_mb(); \
++ \
+ c; \
+ })
+
+--- a/arch/arc/include/asm/bitops.h
++++ b/arch/arc/include/asm/bitops.h
+@@ -103,6 +103,12 @@ static inline int test_and_set_bit(unsig
+ if (__builtin_constant_p(nr))
+ nr &= 0x1f;
+
++ /*
++ * Explicit full memory barrier needed before/after as
++ * LLOCK/SCOND themselves don't provide any such semantics
++ */
++ smp_mb();
++
+ __asm__ __volatile__(
+ "1: llock %0, [%2] \n"
+ " bset %1, %0, %3 \n"
+@@ -112,6 +118,8 @@ static inline int test_and_set_bit(unsig
+ : "r"(m), "ir"(nr)
+ : "cc");
+
++ smp_mb();
++
+ return (old & (1 << nr)) != 0;
+ }
+
+@@ -125,6 +133,8 @@ test_and_clear_bit(unsigned long nr, vol
+ if (__builtin_constant_p(nr))
+ nr &= 0x1f;
+
++ smp_mb();
++
+ __asm__ __volatile__(
+ "1: llock %0, [%2] \n"
+ " bclr %1, %0, %3 \n"
+@@ -134,6 +144,8 @@ test_and_clear_bit(unsigned long nr, vol
+ : "r"(m), "ir"(nr)
+ : "cc");
+
++ smp_mb();
++
+ return (old & (1 << nr)) != 0;
+ }
+
+@@ -147,6 +159,8 @@ test_and_change_bit(unsigned long nr, vo
+ if (__builtin_constant_p(nr))
+ nr &= 0x1f;
+
++ smp_mb();
++
+ __asm__ __volatile__(
+ "1: llock %0, [%2] \n"
+ " bxor %1, %0, %3 \n"
+@@ -156,6 +170,8 @@ test_and_change_bit(unsigned long nr, vo
+ : "r"(m), "ir"(nr)
+ : "cc");
+
++ smp_mb();
++
+ return (old & (1 << nr)) != 0;
+ }
+
+@@ -235,6 +251,9 @@ static inline int test_and_set_bit(unsig
+ if (__builtin_constant_p(nr))
+ nr &= 0x1f;
+
++ /*
++ * spin lock/unlock provide the needed smp_mb() before/after
++ */
+ bitops_lock(flags);
+
+ old = *m;
+--- a/arch/arc/include/asm/cmpxchg.h
++++ b/arch/arc/include/asm/cmpxchg.h
+@@ -10,6 +10,8 @@
+ #define __ASM_ARC_CMPXCHG_H
+
+ #include <linux/types.h>
++
++#include <asm/barrier.h>
+ #include <asm/smp.h>
+
+ #ifdef CONFIG_ARC_HAS_LLSC
+@@ -19,6 +21,12 @@ __cmpxchg(volatile void *ptr, unsigned l
+ {
+ unsigned long prev;
+
++ /*
++ * Explicit full memory barrier needed before/after as
++ * LLOCK/SCOND thmeselves don't provide any such semantics
++ */
++ smp_mb();
++
+ __asm__ __volatile__(
+ "1: llock %0, [%1] \n"
+ " brne %0, %2, 2f \n"
+@@ -30,6 +38,8 @@ __cmpxchg(volatile void *ptr, unsigned l
+ "r"(new) /* can't be "ir". scond can't take limm for "b" */
+ : "cc");
+
++ smp_mb();
++
+ return prev;
+ }
+
+@@ -42,6 +52,9 @@ __cmpxchg(volatile void *ptr, unsigned l
+ int prev;
+ volatile unsigned long *p = ptr;
+
++ /*
++ * spin lock/unlock provide the needed smp_mb() before/after
++ */
+ atomic_ops_lock(flags);
+ prev = *p;
+ if (prev == expected)
+@@ -77,12 +90,16 @@ static inline unsigned long __xchg(unsig
+
+ switch (size) {
+ case 4:
++ smp_mb();
++
+ __asm__ __volatile__(
+ " ex %0, [%1] \n"
+ : "+r"(val)
+ : "r"(ptr)
+ : "memory");
+
++ smp_mb();
++
+ return val;
+ }
+ return __xchg_bad_pointer();
+--- a/arch/arc/include/asm/spinlock.h
++++ b/arch/arc/include/asm/spinlock.h
+@@ -22,24 +22,46 @@ static inline void arch_spin_lock(arch_s
+ {
+ unsigned int tmp = __ARCH_SPIN_LOCK_LOCKED__;
+
++ /*
++ * This smp_mb() is technically superfluous, we only need the one
++ * after the lock for providing the ACQUIRE semantics.
++ * However doing the "right" thing was regressing hackbench
++ * so keeping this, pending further investigation
++ */
++ smp_mb();
++
+ __asm__ __volatile__(
+ "1: ex %0, [%1] \n"
+ " breq %0, %2, 1b \n"
+ : "+&r" (tmp)
+ : "r"(&(lock->slock)), "ir"(__ARCH_SPIN_LOCK_LOCKED__)
+ : "memory");
++
++ /*
++ * ACQUIRE barrier to ensure load/store after taking the lock
++ * don't "bleed-up" out of the critical section (leak-in is allowed)
++ * http://www.spinics.net/lists/kernel/msg2010409.html
++ *
++ * ARCv2 only has load-load, store-store and all-all barrier
++ * thus need the full all-all barrier
++ */
++ smp_mb();
+ }
+
+ static inline int arch_spin_trylock(arch_spinlock_t *lock)
+ {
+ unsigned int tmp = __ARCH_SPIN_LOCK_LOCKED__;
+
++ smp_mb();
++
+ __asm__ __volatile__(
+ "1: ex %0, [%1] \n"
+ : "+r" (tmp)
+ : "r"(&(lock->slock))
+ : "memory");
+
++ smp_mb();
++
+ return (tmp == __ARCH_SPIN_LOCK_UNLOCKED__);
+ }
+
+@@ -47,12 +69,22 @@ static inline void arch_spin_unlock(arch
+ {
+ unsigned int tmp = __ARCH_SPIN_LOCK_UNLOCKED__;
+
++ /*
++ * RELEASE barrier: given the instructions avail on ARCv2, full barrier
++ * is the only option
++ */
++ smp_mb();
++
+ __asm__ __volatile__(
+ " ex %0, [%1] \n"
+ : "+r" (tmp)
+ : "r"(&(lock->slock))
+ : "memory");
+
++ /*
++ * superfluous, but keeping for now - see pairing version in
++ * arch_spin_lock above
++ */
+ smp_mb();
+ }
+
--- /dev/null
+From 565630d503ef24e44c252bed55571b3a0d68455f Mon Sep 17 00:00:00 2001
+From: Catalin Marinas <catalin.marinas@arm.com>
+Date: Fri, 12 Jun 2015 11:24:41 +0100
+Subject: arm64: Do not attempt to use init_mm in reset_context()
+
+From: Catalin Marinas <catalin.marinas@arm.com>
+
+commit 565630d503ef24e44c252bed55571b3a0d68455f upstream.
+
+After secondary CPU boot or hotplug, the active_mm of the idle thread is
+&init_mm. The init_mm.pgd (swapper_pg_dir) is only meant for TTBR1_EL1
+and must not be set in TTBR0_EL1. Since TTBR0_EL1 is already set to the
+reserved value when active_mm == &init_mm, there is no need to perform
+any context reset.
+
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/mm/context.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/arch/arm64/mm/context.c
++++ b/arch/arm64/mm/context.c
+@@ -92,6 +92,14 @@ static void reset_context(void *info)
+ unsigned int cpu = smp_processor_id();
+ struct mm_struct *mm = current->active_mm;
+
++ /*
++ * current->active_mm could be init_mm for the idle thread immediately
++ * after secondary CPU boot or hotplug. TTBR0_EL1 is already set to
++ * the reserved value, so no need to reset any context.
++ */
++ if (mm == &init_mm)
++ return;
++
+ smp_rmb();
+ asid = cpu_last_asid + cpu;
+
--- /dev/null
+From 46b0567c851cf85d6ba6f23eef385ec9111d09bc Mon Sep 17 00:00:00 2001
+From: Mark Rutland <mark.rutland@arm.com>
+Date: Mon, 15 Jun 2015 16:40:27 +0100
+Subject: arm64: entry: fix context tracking for el0_sp_pc
+
+From: Mark Rutland <mark.rutland@arm.com>
+
+commit 46b0567c851cf85d6ba6f23eef385ec9111d09bc upstream.
+
+Commit 6c81fe7925cc4c42 ("arm64: enable context tracking") did not
+update el0_sp_pc to use ct_user_exit, but this appears to have been
+unintentional. In commit 6ab6463aeb5fbc75 ("arm64: adjust el0_sync so
+that a function can be called") we made x0 available, and in the return
+to userspace we call ct_user_enter in the kernel_exit macro.
+
+Due to this, we currently don't correctly inform RCU of the user->kernel
+transition, and may erroneously account for time spent in the kernel as
+if we were in an extended quiescent state when CONFIG_CONTEXT_TRACKING
+is enabled.
+
+As we do record the kernel->user transition, a userspace application
+making accesses from an unaligned stack pointer can demonstrate the
+imbalance, provoking the following warning:
+
+------------[ cut here ]------------
+WARNING: CPU: 2 PID: 3660 at kernel/context_tracking.c:75 context_tracking_enter+0xd8/0xe4()
+Modules linked in:
+CPU: 2 PID: 3660 Comm: a.out Not tainted 4.1.0-rc7+ #8
+Hardware name: ARM Juno development board (r0) (DT)
+Call trace:
+[<ffffffc000089914>] dump_backtrace+0x0/0x124
+[<ffffffc000089a48>] show_stack+0x10/0x1c
+[<ffffffc0005b3cbc>] dump_stack+0x84/0xc8
+[<ffffffc0000b3214>] warn_slowpath_common+0x98/0xd0
+[<ffffffc0000b330c>] warn_slowpath_null+0x14/0x20
+[<ffffffc00013ada4>] context_tracking_enter+0xd4/0xe4
+[<ffffffc0005b534c>] preempt_schedule_irq+0xd4/0x114
+[<ffffffc00008561c>] el1_preempt+0x4/0x28
+[<ffffffc0001b8040>] exit_files+0x38/0x4c
+[<ffffffc0000b5b94>] do_exit+0x430/0x978
+[<ffffffc0000b614c>] do_group_exit+0x40/0xd4
+[<ffffffc0000c0208>] get_signal+0x23c/0x4f4
+[<ffffffc0000890b4>] do_signal+0x1ac/0x518
+[<ffffffc000089650>] do_notify_resume+0x5c/0x68
+---[ end trace 963c192600337066 ]---
+
+This patch adds the missing ct_user_exit to the el0_sp_pc entry path,
+correcting the context tracking for this case.
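+
+A reproducer sketch for the trigger mentioned above (hypothetical and
+untested): knock the stack pointer off its required 16-byte alignment
+and then use it, so the resulting fault is taken via el0_sp_pc:
+
+| int main(void)
+| {
+| 	asm volatile(
+| 	"	mov	x0, sp		\n"
+| 	"	orr	x0, x0, #8	\n"	/* misalign SP */
+| 	"	mov	sp, x0		\n"
+| 	"	ldr	x1, [sp]	\n"	/* SP-based access -> fault */
+| 	::: "x0", "x1");
+| 	return 0;
+| }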
+
+Signed-off-by: Mark Rutland <mark.rutland@arm.com>
+Acked-by: Will Deacon <will.deacon@arm.com>
+Fixes: 6c81fe7925cc ("arm64: enable context tracking")
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/kernel/entry.S | 1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/arch/arm64/kernel/entry.S
++++ b/arch/arm64/kernel/entry.S
+@@ -517,6 +517,7 @@ el0_sp_pc:
+ mrs x26, far_el1
+ // enable interrupts before calling the main handler
+ enable_dbg_and_irq
++ ct_user_exit
+ mov x0, x26
+ mov x1, x25
+ mov x2, sp
--- /dev/null
+From b9bcc919931611498e856eae9bf66337330d04cc Mon Sep 17 00:00:00 2001
+From: Dave P Martin <Dave.Martin@arm.com>
+Date: Tue, 16 Jun 2015 17:38:47 +0100
+Subject: arm64: mm: Fix freeing of the wrong memmap entries with !SPARSEMEM_VMEMMAP
+
+From: Dave P Martin <Dave.Martin@arm.com>
+
+commit b9bcc919931611498e856eae9bf66337330d04cc upstream.
+
+The memmap freeing code in free_unused_memmap() computes the end of
+each memblock by adding the memblock size onto the base. However,
+if SPARSEMEM is enabled then the value (start) used for the base
+may already have been rounded downwards to work out which memmap
+entries to free after the previous memblock.
+
+This may cause memmap entries that are in use to get freed.
+
+In general, you're not likely to hit this problem unless there
+are at least 2 memblocks and one of them is not aligned to a
+sparsemem section boundary. Note that carve-outs can increase
+the number of memblocks by splitting the regions listed in the
+device tree.
+
+This problem doesn't occur with SPARSEMEM_VMEMMAP, because the
+vmemmap code deals with freeing the unused regions of the memmap
+instead of requiring the arch code to do it.
+
+This patch gets the memblock base out of the memblock directly when
+computing the block end address to ensure the correct value is used.
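+
+A worked example of the failure (hypothetical numbers: 4K pages and
+128MB sparsemem sections, i.e. a section spans 0x8000 pfns):
+
+|  memblock:  base = 0x88800000, size = 0x10000000 (256MB)
+|  start      = __phys_to_pfn(base)                  = 0x88800
+|  (the SPARSEMEM rounding may have lowered start to   0x88000)
+|
+|  old: prev_end = ALIGN(start + __phys_to_pfn(size), MAX_ORDER_NR_PAGES)
+|                = 0x98000   <-- 0x800 pfns short of the real block end
+|  new: prev_end = ALIGN(__phys_to_pfn(base + size), MAX_ORDER_NR_PAGES)
+|                = 0x98800
+|
+|  With the old computation, the memmap for pfns 0x98000-0x98800 (pages
+|  that are in use) could later be freed along with the gap before the
+|  next memblock.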
+
+Signed-off-by: Dave Martin <Dave.Martin@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/mm/init.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/arm64/mm/init.c
++++ b/arch/arm64/mm/init.c
+@@ -260,7 +260,7 @@ static void __init free_unused_memmap(vo
+ * memmap entries are valid from the bank end aligned to
+ * MAX_ORDER_NR_PAGES.
+ */
+- prev_end = ALIGN(start + __phys_to_pfn(reg->size),
++ prev_end = ALIGN(__phys_to_pfn(reg->base + reg->size),
+ MAX_ORDER_NR_PAGES);
+ }
+
--- /dev/null
+From 6f1a6ae87c0c60d7c462ef8fd071f291aa7a9abb Mon Sep 17 00:00:00 2001
+From: Will Deacon <will.deacon@arm.com>
+Date: Fri, 19 Jun 2015 13:56:33 +0100
+Subject: arm64: vdso: work-around broken ELF toolchains in Makefile
+
+From: Will Deacon <will.deacon@arm.com>
+
+commit 6f1a6ae87c0c60d7c462ef8fd071f291aa7a9abb upstream.
+
+When building the kernel with a bare-metal (ELF) toolchain, the -shared
+option may not be passed down to collect2, resulting in silent corruption
+of the vDSO image (in particular, the DYNAMIC section is omitted).
+
+The effect of this corruption is that the dynamic linker fails to find
+the vDSO symbols and libc is instead used for the syscalls that we
+intended to optimise (e.g. gettimeofday). Functionally, there is no
+issue as the sigreturn trampoline is still intact and located by the
+kernel.
+
+This patch fixes the problem by explicitly passing -shared to the linker
+when building the vDSO.
+
+Reported-by: Szabolcs Nagy <Szabolcs.Nagy@arm.com>
+Reported-by: James Greenhalgh <james.greenhalgh@arm.com>
+Signed-off-by: Will Deacon <will.deacon@arm.com>
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/arm64/kernel/vdso/Makefile | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/arch/arm64/kernel/vdso/Makefile
++++ b/arch/arm64/kernel/vdso/Makefile
+@@ -15,6 +15,10 @@ ccflags-y := -shared -fno-common -fno-bu
+ ccflags-y += -nostdlib -Wl,-soname=linux-vdso.so.1 \
+ $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
+
++# Workaround for bare-metal (ELF) toolchains that neglect to pass -shared
++# down to collect2, resulting in silent corruption of the vDSO image.
++ccflags-y += -Wl,-shared
++
+ obj-y += vdso.o
+ extra-y += vdso.lds vdso-offsets.h
+ CPPFLAGS_vdso.lds += -P -C -U$(ARCH)
--- /dev/null
+From 3dc196eae1db548f05e53e5875ff87b8ff79f249 Mon Sep 17 00:00:00 2001
+From: Alexander Usyskin <alexander.usyskin@intel.com>
+Date: Sat, 13 Jun 2015 08:51:17 +0300
+Subject: mei: me: wait for power gating exit confirmation
+
+From: Alexander Usyskin <alexander.usyskin@intel.com>
+
+commit 3dc196eae1db548f05e53e5875ff87b8ff79f249 upstream.
+
+Fix the hbm power gating state machine so that it waits until it receives
+the confirmation interrupt for the PG_ISOLATION_EXIT message.
+
+During the suspend flow the device first has to exit from the power gating
+state (runtime pm resume). If we do not handle the confirmation interrupt
+after sending the PG_ISOLATION_EXIT message, we may receive it only after
+the suspend flow has already changed the device state; the interrupt will
+then be interpreted as a spurious event, and consequently a link reset will
+be invoked, which will prevent the device from completing the suspend flow:
+
+kernel: [6603] mei_reset:136: mei_me 0000:00:16.0: powering down: end of reset
+kernel: [476] mei_me_irq_thread_handler:643: mei_me 0000:00:16.0: function called after ISR to handle the interrupt processing.
+kernel: mei_me 0000:00:16.0: FW not ready: resetting
+
+Cc: Gabriele Mazzotta <gabriele.mzt@gmail.com>
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=86241
+Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=770397
+Tested-by: Gabriele Mazzotta <gabriele.mzt@gmail.com>
+Signed-off-by: Alexander Usyskin <alexander.usyskin@intel.com>
+Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/misc/mei/client.c | 2 -
+ drivers/misc/mei/hw-me.c | 59 +++++++++++++++++++++++++++++++++++++++++----
+ drivers/misc/mei/hw-txe.c | 13 +++++++++
+ drivers/misc/mei/mei_dev.h | 11 ++++++++
+ 4 files changed, 80 insertions(+), 5 deletions(-)
+
+--- a/drivers/misc/mei/client.c
++++ b/drivers/misc/mei/client.c
+@@ -573,7 +573,7 @@ void mei_host_client_init(struct work_st
+ bool mei_hbuf_acquire(struct mei_device *dev)
+ {
+ if (mei_pg_state(dev) == MEI_PG_ON ||
+- dev->pg_event == MEI_PG_EVENT_WAIT) {
++ mei_pg_in_transition(dev)) {
+ dev_dbg(dev->dev, "device is in pg\n");
+ return false;
+ }
+--- a/drivers/misc/mei/hw-me.c
++++ b/drivers/misc/mei/hw-me.c
+@@ -629,11 +629,27 @@ int mei_me_pg_unset_sync(struct mei_devi
+ mutex_lock(&dev->device_lock);
+
+ reply:
+- if (dev->pg_event == MEI_PG_EVENT_RECEIVED)
+- ret = mei_hbm_pg(dev, MEI_PG_ISOLATION_EXIT_RES_CMD);
++ if (dev->pg_event != MEI_PG_EVENT_RECEIVED) {
++ ret = -ETIME;
++ goto out;
++ }
++
++ dev->pg_event = MEI_PG_EVENT_INTR_WAIT;
++ ret = mei_hbm_pg(dev, MEI_PG_ISOLATION_EXIT_RES_CMD);
++ if (ret)
++ return ret;
++
++ mutex_unlock(&dev->device_lock);
++ wait_event_timeout(dev->wait_pg,
++ dev->pg_event == MEI_PG_EVENT_INTR_RECEIVED, timeout);
++ mutex_lock(&dev->device_lock);
++
++ if (dev->pg_event == MEI_PG_EVENT_INTR_RECEIVED)
++ ret = 0;
+ else
+ ret = -ETIME;
+
++out:
+ dev->pg_event = MEI_PG_EVENT_IDLE;
+ hw->pg_state = MEI_PG_OFF;
+
+@@ -641,6 +657,19 @@ reply:
+ }
+
+ /**
++ * mei_me_pg_in_transition - is device now in pg transition
++ *
++ * @dev: the device structure
++ *
++ * Return: true if in pg transition, false otherwise
++ */
++static bool mei_me_pg_in_transition(struct mei_device *dev)
++{
++ return dev->pg_event >= MEI_PG_EVENT_WAIT &&
++ dev->pg_event <= MEI_PG_EVENT_INTR_WAIT;
++}
++
++/**
+ * mei_me_pg_is_enabled - detect if PG is supported by HW
+ *
+ * @dev: the device structure
+@@ -672,6 +701,24 @@ notsupported:
+ }
+
+ /**
++ * mei_me_pg_intr - perform pg processing in interrupt thread handler
++ *
++ * @dev: the device structure
++ */
++static void mei_me_pg_intr(struct mei_device *dev)
++{
++ struct mei_me_hw *hw = to_me_hw(dev);
++
++ if (dev->pg_event != MEI_PG_EVENT_INTR_WAIT)
++ return;
++
++ dev->pg_event = MEI_PG_EVENT_INTR_RECEIVED;
++ hw->pg_state = MEI_PG_OFF;
++ if (waitqueue_active(&dev->wait_pg))
++ wake_up(&dev->wait_pg);
++}
++
++/**
+ * mei_me_irq_quick_handler - The ISR of the MEI device
+ *
+ * @irq: The irq number
+@@ -729,6 +776,8 @@ irqreturn_t mei_me_irq_thread_handler(in
+ goto end;
+ }
+
++ mei_me_pg_intr(dev);
++
+ /* check if we need to start the dev */
+ if (!mei_host_is_ready(dev)) {
+ if (mei_hw_is_ready(dev)) {
+@@ -765,9 +814,10 @@ irqreturn_t mei_me_irq_thread_handler(in
+ /*
+ * During PG handshake only allowed write is the replay to the
+ * PG exit message, so block calling write function
+- * if the pg state is not idle
++ * if the pg event is in PG handshake
+ */
+- if (dev->pg_event == MEI_PG_EVENT_IDLE) {
++ if (dev->pg_event != MEI_PG_EVENT_WAIT &&
++ dev->pg_event != MEI_PG_EVENT_RECEIVED) {
+ rets = mei_irq_write_handler(dev, &complete_list);
+ dev->hbuf_is_ready = mei_hbuf_is_ready(dev);
+ }
+@@ -792,6 +842,7 @@ static const struct mei_hw_ops mei_me_hw
+ .hw_config = mei_me_hw_config,
+ .hw_start = mei_me_hw_start,
+
++ .pg_in_transition = mei_me_pg_in_transition,
+ .pg_is_enabled = mei_me_pg_is_enabled,
+
+ .intr_clear = mei_me_intr_clear,
+--- a/drivers/misc/mei/hw-txe.c
++++ b/drivers/misc/mei/hw-txe.c
+@@ -302,6 +302,18 @@ int mei_txe_aliveness_set_sync(struct me
+ }
+
+ /**
++ * mei_txe_pg_in_transition - is device now in pg transition
++ *
++ * @dev: the device structure
++ *
++ * Return: true if in pg transition, false otherwise
++ */
++static bool mei_txe_pg_in_transition(struct mei_device *dev)
++{
++ return dev->pg_event == MEI_PG_EVENT_WAIT;
++}
++
++/**
+ * mei_txe_pg_is_enabled - detect if PG is supported by HW
+ *
+ * @dev: the device structure
+@@ -1138,6 +1150,7 @@ static const struct mei_hw_ops mei_txe_h
+ .hw_config = mei_txe_hw_config,
+ .hw_start = mei_txe_hw_start,
+
++ .pg_in_transition = mei_txe_pg_in_transition,
+ .pg_is_enabled = mei_txe_pg_is_enabled,
+
+ .intr_clear = mei_txe_intr_clear,
+--- a/drivers/misc/mei/mei_dev.h
++++ b/drivers/misc/mei/mei_dev.h
+@@ -269,6 +269,7 @@ struct mei_cl {
+
+ * @fw_status : get fw status registers
+ * @pg_state : power gating state of the device
++ * @pg_in_transition : is device now in pg transition
+ * @pg_is_enabled : is power gating enabled
+
+ * @intr_clear : clear pending interrupts
+@@ -298,6 +299,7 @@ struct mei_hw_ops {
+
+ int (*fw_status)(struct mei_device *dev, struct mei_fw_status *fw_sts);
+ enum mei_pg_state (*pg_state)(struct mei_device *dev);
++ bool (*pg_in_transition)(struct mei_device *dev);
+ bool (*pg_is_enabled)(struct mei_device *dev);
+
+ void (*intr_clear)(struct mei_device *dev);
+@@ -396,11 +398,15 @@ struct mei_cl_device {
+ * @MEI_PG_EVENT_IDLE: the driver is not in power gating transition
+ * @MEI_PG_EVENT_WAIT: the driver is waiting for a pg event to complete
+ * @MEI_PG_EVENT_RECEIVED: the driver received pg event
++ * @MEI_PG_EVENT_INTR_WAIT: the driver is waiting for a pg event interrupt
++ * @MEI_PG_EVENT_INTR_RECEIVED: the driver received pg event interrupt
+ */
+ enum mei_pg_event {
+ MEI_PG_EVENT_IDLE,
+ MEI_PG_EVENT_WAIT,
+ MEI_PG_EVENT_RECEIVED,
++ MEI_PG_EVENT_INTR_WAIT,
++ MEI_PG_EVENT_INTR_RECEIVED,
+ };
+
+ /**
+@@ -727,6 +733,11 @@ static inline enum mei_pg_state mei_pg_s
+ return dev->ops->pg_state(dev);
+ }
+
++static inline bool mei_pg_in_transition(struct mei_device *dev)
++{
++ return dev->ops->pg_in_transition(dev);
++}
++
+ static inline bool mei_pg_is_enabled(struct mei_device *dev)
+ {
+ return dev->ops->pg_is_enabled(dev);
--- /dev/null
+From fe292283c23329218e384bffc6cb4bfa3fd92277 Mon Sep 17 00:00:00 2001
+From: Tomas Winkler <tomas.winkler@intel.com>
+Date: Tue, 14 Apr 2015 10:27:26 +0300
+Subject: mei: txe: reduce suspend/resume time
+
+From: Tomas Winkler <tomas.winkler@intel.com>
+
+commit fe292283c23329218e384bffc6cb4bfa3fd92277 upstream.
+
+HW has to be in a known state before the initialisation sequence is
+started. The polling step for settling aliveness was set to 200ms, while
+in practice this can be done in up to 30msecs.
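+
+The resulting polling loop follows the usual ktime-bounded pattern,
+sketched here in isolation (condition_settled() and TIMEOUT_MS are
+made-up placeholders, not driver symbols):
+
+| static int wait_for_settled(void)
+| {
+| 	ktime_t stop = ktime_add(ktime_get(), ms_to_ktime(TIMEOUT_MS));
+|
+| 	do {
+| 		if (condition_settled())	/* hypothetical helper */
+| 			return 0;		/* usually well under 30ms */
+| 		usleep_range(20, 50);		/* instead of msleep(200) */
+| 	} while (ktime_before(ktime_get(), stop));
+|
+| 	return -ETIME;
+| }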
+
+Signed-off-by: Tomas Winkler <tomas.winkler@intel.com>
+Signed-off-by: Barak Yoresh <barak.yoresh@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/misc/mei/hw-txe.c | 20 ++++++++++----------
+ 1 file changed, 10 insertions(+), 10 deletions(-)
+
+--- a/drivers/misc/mei/hw-txe.c
++++ b/drivers/misc/mei/hw-txe.c
+@@ -16,6 +16,7 @@
+
+ #include <linux/pci.h>
+ #include <linux/jiffies.h>
++#include <linux/ktime.h>
+ #include <linux/delay.h>
+ #include <linux/kthread.h>
+ #include <linux/irqreturn.h>
+@@ -218,26 +219,25 @@ static u32 mei_txe_aliveness_get(struct
+ *
+ * Polls for HICR_HOST_ALIVENESS_RESP.ALIVENESS_RESP to be set
+ *
+- * Return: > 0 if the expected value was received, -ETIME otherwise
++ * Return: 0 if the expected value was received, -ETIME otherwise
+ */
+ static int mei_txe_aliveness_poll(struct mei_device *dev, u32 expected)
+ {
+ struct mei_txe_hw *hw = to_txe_hw(dev);
+- int t = 0;
++ ktime_t stop, start;
+
++ start = ktime_get();
++ stop = ktime_add(start, ms_to_ktime(SEC_ALIVENESS_WAIT_TIMEOUT));
+ do {
+ hw->aliveness = mei_txe_aliveness_get(dev);
+ if (hw->aliveness == expected) {
+ dev->pg_event = MEI_PG_EVENT_IDLE;
+- dev_dbg(dev->dev,
+- "aliveness settled after %d msecs\n", t);
+- return t;
++ dev_dbg(dev->dev, "aliveness settled after %lld usecs\n",
++ ktime_to_us(ktime_sub(ktime_get(), start)));
++ return 0;
+ }
+- mutex_unlock(&dev->device_lock);
+- msleep(MSEC_PER_SEC / 5);
+- mutex_lock(&dev->device_lock);
+- t += MSEC_PER_SEC / 5;
+- } while (t < SEC_ALIVENESS_WAIT_TIMEOUT);
++ usleep_range(20, 50);
++ } while (ktime_before(ktime_get(), stop));
+
+ dev->pg_event = MEI_PG_EVENT_IDLE;
+ dev_err(dev->dev, "aliveness timed out\n");
--- /dev/null
+From c5f3b1a51a591c18c8b33983908e7fdda6ae417e Mon Sep 17 00:00:00 2001
+From: Catalin Marinas <catalin.marinas@arm.com>
+Date: Wed, 24 Jun 2015 16:58:26 -0700
+Subject: mm: kmemleak: allow safe memory scanning during kmemleak disabling
+
+From: Catalin Marinas <catalin.marinas@arm.com>
+
+commit c5f3b1a51a591c18c8b33983908e7fdda6ae417e upstream.
+
+The kmemleak scanning thread can run for minutes. Callbacks like
+kmemleak_free() are allowed during this time, the race being taken care
+of by the object->lock spinlock. Such lock also prevents a memory block
+from being freed or unmapped while it is being scanned by blocking the
+kmemleak_free() -> ... -> __delete_object() function until the lock is
+released in scan_object().
+
+When a kmemleak error occurs (e.g. it fails to allocate its metadata),
+kmemleak_enabled is cleared and __delete_object() is no longer called on
+freed objects. If kmemleak_scan is running at the same time,
+kmemleak_free() no longer waits for the object scanning to complete,
+allowing the corresponding memory block to be freed or unmapped (in the
+case of vfree()). This leads to kmemleak_scan potentially triggering a
+page fault.
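+
+Concretely, the race looks roughly like this (illustrative interleaving,
+not a verbatim trace):
+
+|   kmemleak_scan thread                other task
+|   --------------------                ----------
+|   scan_object(object)                 kmemleak_disable()  (error path)
+|     takes object->lock                vfree(ptr)   /* tracked by object */
+|     scan_block(ptr, ...)                kmemleak_free(ptr) returns early,
+|       ...                               without waiting on object->lock
+|       ...                               the vmalloc area is unmapped
+|     scan_block() keeps reading ptr  ->  page fault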
+
+This patch separates the kmemleak_free() enabling/disabling from the
+overall kmemleak_enabled knob so that we can defer the disabling of the
+object freeing tracking until the scanning thread has completed. The
+kmemleak_free_part() callback is deliberately ignored by this patch since
+it is only called during boot, before the scanning thread has started.
+
+Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
+Reported-by: Vignesh Radhakrishnan <vigneshr@codeaurora.org>
+Tested-by: Vignesh Radhakrishnan <vigneshr@codeaurora.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/kmemleak.c | 19 ++++++++++++++++---
+ 1 file changed, 16 insertions(+), 3 deletions(-)
+
+--- a/mm/kmemleak.c
++++ b/mm/kmemleak.c
+@@ -195,6 +195,8 @@ static struct kmem_cache *scan_area_cach
+
+ /* set if tracing memory operations is enabled */
+ static int kmemleak_enabled;
++/* same as above but only for the kmemleak_free() callback */
++static int kmemleak_free_enabled;
+ /* set in the late_initcall if there were no errors */
+ static int kmemleak_initialized;
+ /* enables or disables early logging of the memory operations */
+@@ -942,7 +944,7 @@ void __ref kmemleak_free(const void *ptr
+ {
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+- if (kmemleak_enabled && ptr && !IS_ERR(ptr))
++ if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
+ delete_object_full((unsigned long)ptr);
+ else if (kmemleak_early_log)
+ log_early(KMEMLEAK_FREE, ptr, 0, 0);
+@@ -982,7 +984,7 @@ void __ref kmemleak_free_percpu(const vo
+
+ pr_debug("%s(0x%p)\n", __func__, ptr);
+
+- if (kmemleak_enabled && ptr && !IS_ERR(ptr))
++ if (kmemleak_free_enabled && ptr && !IS_ERR(ptr))
+ for_each_possible_cpu(cpu)
+ delete_object_full((unsigned long)per_cpu_ptr(ptr,
+ cpu));
+@@ -1750,6 +1752,13 @@ static void kmemleak_do_cleanup(struct w
+ mutex_lock(&scan_mutex);
+ stop_scan_thread();
+
++ /*
++ * Once the scan thread has stopped, it is safe to no longer track
++ * object freeing. Ordering of the scan thread stopping and the memory
++ * accesses below is guaranteed by the kthread_stop() function.
++ */
++ kmemleak_free_enabled = 0;
++
+ if (!kmemleak_found_leaks)
+ __kmemleak_do_cleanup();
+ else
+@@ -1776,6 +1785,8 @@ static void kmemleak_disable(void)
+ /* check whether it is too early for a kernel thread */
+ if (kmemleak_initialized)
+ schedule_work(&cleanup_work);
++ else
++ kmemleak_free_enabled = 0;
+
+ pr_info("Kernel memory leak detector disabled\n");
+ }
+@@ -1840,8 +1851,10 @@ void __init kmemleak_init(void)
+ if (kmemleak_error) {
+ local_irq_restore(flags);
+ return;
+- } else
++ } else {
+ kmemleak_enabled = 1;
++ kmemleak_free_enabled = 1;
++ }
+ local_irq_restore(flags);
+
+ /*
--- /dev/null
+From 8a8c35fadfaf55629a37ef1a8ead1b8fb32581d2 Mon Sep 17 00:00:00 2001
+From: Larry Finger <Larry.Finger@lwfinger.net>
+Date: Wed, 24 Jun 2015 16:58:51 -0700
+Subject: mm: kmemleak_alloc_percpu() should follow the gfp from per_alloc()
+
+From: Larry Finger <Larry.Finger@lwfinger.net>
+
+commit 8a8c35fadfaf55629a37ef1a8ead1b8fb32581d2 upstream.
+
+Beginning at commit d52d3997f843 ("ipv6: Create percpu rt6_info"), the
+following INFO splat is logged:
+
+ ===============================
+ [ INFO: suspicious RCU usage. ]
+ 4.1.0-rc7-next-20150612 #1 Not tainted
+ -------------------------------
+ kernel/sched/core.c:7318 Illegal context switch in RCU-bh read-side critical section!
+ other info that might help us debug this:
+ rcu_scheduler_active = 1, debug_locks = 0
+ 3 locks held by systemd/1:
+ #0: (rtnl_mutex){+.+.+.}, at: [<ffffffff815f0c8f>] rtnetlink_rcv+0x1f/0x40
+ #1: (rcu_read_lock_bh){......}, at: [<ffffffff816a34e2>] ipv6_add_addr+0x62/0x540
+ #2: (addrconf_hash_lock){+...+.}, at: [<ffffffff816a3604>] ipv6_add_addr+0x184/0x540
+ stack backtrace:
+ CPU: 0 PID: 1 Comm: systemd Not tainted 4.1.0-rc7-next-20150612 #1
+ Hardware name: TOSHIBA TECRA A50-A/TECRA A50-A, BIOS Version 4.20 04/17/2014
+ Call Trace:
+ dump_stack+0x4c/0x6e
+ lockdep_rcu_suspicious+0xe7/0x120
+ ___might_sleep+0x1d5/0x1f0
+ __might_sleep+0x4d/0x90
+ kmem_cache_alloc+0x47/0x250
+ create_object+0x39/0x2e0
+ kmemleak_alloc_percpu+0x61/0xe0
+ pcpu_alloc+0x370/0x630
+
+Additional backtrace lines are truncated. In addition, the above splat
+is followed by several "BUG: sleeping function called from invalid
+context at mm/slub.c:1268" outputs. As suggested by Martin KaFai Lau,
+these are the clue to the fix. Routine kmemleak_alloc_percpu() always
+uses GFP_KERNEL for its allocations, whereas it should follow the gfp
+from its callers.
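+
+A sketch of the kind of caller that trips the splat (hypothetical code,
+modelled loosely on the ipv6 percpu rt6_info path, not quoted from it):
+
+| static int __percpu *percpu_alloc_in_atomic(void)
+| {
+| 	int __percpu *p;
+|
+| 	rcu_read_lock_bh();	/* atomic context, as in ipv6_add_addr() */
+| 	p = alloc_percpu_gfp(int, GFP_ATOMIC);
+| 	/*
+| 	 * Before this patch, kmemleak_alloc_percpu() ignored the GFP_ATOMIC
+| 	 * above and allocated its tracking metadata with GFP_KERNEL, which
+| 	 * may sleep -> the "Illegal context switch" splat.
+| 	 */
+| 	rcu_read_unlock_bh();
+| 	return p;
+| }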
+
+Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
+Reviewed-by: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
+Acked-by: Martin KaFai Lau <kafai@fb.com>
+Signed-off-by: Larry Finger <Larry.Finger@lwfinger.net>
+Cc: Martin KaFai Lau <kafai@fb.com>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Christoph Lameter <cl@linux-foundation.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/kmemleak.h | 6 ++++--
+ mm/kmemleak.c | 9 +++++----
+ mm/percpu.c | 2 +-
+ 3 files changed, 10 insertions(+), 7 deletions(-)
+
+--- a/include/linux/kmemleak.h
++++ b/include/linux/kmemleak.h
+@@ -28,7 +28,8 @@
+ extern void kmemleak_init(void) __ref;
+ extern void kmemleak_alloc(const void *ptr, size_t size, int min_count,
+ gfp_t gfp) __ref;
+-extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size) __ref;
++extern void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
++ gfp_t gfp) __ref;
+ extern void kmemleak_free(const void *ptr) __ref;
+ extern void kmemleak_free_part(const void *ptr, size_t size) __ref;
+ extern void kmemleak_free_percpu(const void __percpu *ptr) __ref;
+@@ -71,7 +72,8 @@ static inline void kmemleak_alloc_recurs
+ gfp_t gfp)
+ {
+ }
+-static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
++static inline void kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
++ gfp_t gfp)
+ {
+ }
+ static inline void kmemleak_free(const void *ptr)
+--- a/mm/kmemleak.c
++++ b/mm/kmemleak.c
+@@ -909,12 +909,13 @@ EXPORT_SYMBOL_GPL(kmemleak_alloc);
+ * kmemleak_alloc_percpu - register a newly allocated __percpu object
+ * @ptr: __percpu pointer to beginning of the object
+ * @size: size of the object
++ * @gfp: flags used for kmemleak internal memory allocations
+ *
+ * This function is called from the kernel percpu allocator when a new object
+- * (memory block) is allocated (alloc_percpu). It assumes GFP_KERNEL
+- * allocation.
++ * (memory block) is allocated (alloc_percpu).
+ */
+-void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size)
++void __ref kmemleak_alloc_percpu(const void __percpu *ptr, size_t size,
++ gfp_t gfp)
+ {
+ unsigned int cpu;
+
+@@ -927,7 +928,7 @@ void __ref kmemleak_alloc_percpu(const v
+ if (kmemleak_enabled && ptr && !IS_ERR(ptr))
+ for_each_possible_cpu(cpu)
+ create_object((unsigned long)per_cpu_ptr(ptr, cpu),
+- size, 0, GFP_KERNEL);
++ size, 0, gfp);
+ else if (kmemleak_early_log)
+ log_early(KMEMLEAK_ALLOC_PERCPU, ptr, size, 0);
+ }
+--- a/mm/percpu.c
++++ b/mm/percpu.c
+@@ -1030,7 +1030,7 @@ area_found:
+ memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
+
+ ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
+- kmemleak_alloc_percpu(ptr, size);
++ kmemleak_alloc_percpu(ptr, size, gfp);
+ return ptr;
+
+ fail_unlock:
--- /dev/null
+From 0867a57c4f80a566dda1bac975b42fcd857cb489 Mon Sep 17 00:00:00 2001
+From: Vlastimil Babka <vbabka@suse.cz>
+Date: Wed, 24 Jun 2015 16:58:48 -0700
+Subject: mm, thp: respect MPOL_PREFERRED policy with non-local node
+
+From: Vlastimil Babka <vbabka@suse.cz>
+
+commit 0867a57c4f80a566dda1bac975b42fcd857cb489 upstream.
+
+Since commit 077fcf116c8c ("mm/thp: allocate transparent hugepages on
+local node"), we handle THP allocations on page fault in a special way -
+for non-interleave memory policies, the allocation is only attempted on
+the node local to the current CPU, if the policy's nodemask allows the
+node.
+
+This is motivated by the assumption that THP benefits cannot offset the
+cost of remote accesses, so it's better to fallback to base pages on the
+local node (which might still be available, while huge pages are not due
+to fragmentation) than to allocate huge pages on a remote node.
+
+The nodemask check prevents us from violating e.g. MPOL_BIND policies
+where the local node is not among the allowed nodes. However, the
+current implementation can still give surprising results for the
+MPOL_PREFERRED policy when the preferred node is different from the
+current CPU's local node.
+
+In such a case we should honor the preferred node and not use the local
+node, which is what this patch does. If hugepage allocation on the
+preferred node fails, we fall back to base pages and don't try other
+nodes, with the same motivation as is done for the local node hugepage
+allocations. The patch also moves the MPOL_INTERLEAVE check around to
+simplify the hugepage specific test.
+
+The difference can be demonstrated using the in-tree transhuge-stress test
+on the following 2-node machine, where half the memory on one node was
+occupied to show the difference.
+
+> numactl --hardware
+available: 2 nodes (0-1)
+node 0 cpus: 0 1 2 3 4 5 6 7 8 9 10 11 24 25 26 27 28 29 30 31 32 33 34 35
+node 0 size: 7878 MB
+node 0 free: 3623 MB
+node 1 cpus: 12 13 14 15 16 17 18 19 20 21 22 23 36 37 38 39 40 41 42 43 44 45 46 47
+node 1 size: 8045 MB
+node 1 free: 7818 MB
+node distances:
+node 0 1
+ 0: 10 21
+ 1: 21 10
+
+Before the patch:
+> numactl -p0 -C0 ./transhuge-stress
+transhuge-stress: 2.197 s/loop, 0.276 ms/page, 7249.168 MiB/s 7962 succeed, 0 failed, 1786 different pages
+
+> numactl -p0 -C12 ./transhuge-stress
+transhuge-stress: 2.962 s/loop, 0.372 ms/page, 5376.172 MiB/s 7962 succeed, 0 failed, 3873 different pages
+
+The number of successful THP allocations corresponds to the free memory on
+node 0 in the first case and node 1 in the second case, i.e. the -p
+parameter is ignored and the cpu binding "wins".
+
+After the patch:
+> numactl -p0 -C0 ./transhuge-stress
+transhuge-stress: 2.183 s/loop, 0.274 ms/page, 7295.516 MiB/s 7962 succeed, 0 failed, 1760 different pages
+
+> numactl -p0 -C12 ./transhuge-stress
+transhuge-stress: 2.878 s/loop, 0.361 ms/page, 5533.638 MiB/s 7962 succeed, 0 failed, 1750 different pages
+
+> numactl -p1 -C0 ./transhuge-stress
+transhuge-stress: 4.628 s/loop, 0.581 ms/page, 3440.893 MiB/s 7962 succeed, 0 failed, 3918 different pages
+
+The -p parameter is respected regardless of cpu binding.
+
+> numactl -C0 ./transhuge-stress
+transhuge-stress: 2.202 s/loop, 0.277 ms/page, 7230.003 MiB/s 7962 succeed, 0 failed, 1750 different pages
+
+> numactl -C12 ./transhuge-stress
+transhuge-stress: 3.020 s/loop, 0.379 ms/page, 5273.324 MiB/s 7962 succeed, 0 failed, 3916 different pages
+
+Without the -p parameter, the hugepage restriction to the CPU-local node works as before.
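+
+For reference, "numactl -p0 -C12" corresponds roughly to the following
+setup in the test process (illustrative sketch, error handling omitted;
+not part of transhuge-stress itself):
+
+| #define _GNU_SOURCE
+| #include <sched.h>
+| #include <numaif.h>
+|
+| static void prefer_node0_run_on_cpu12(void)
+| {
+| 	unsigned long nodemask = 1UL << 0;	/* prefer node 0 */
+| 	cpu_set_t cpus;
+|
+| 	set_mempolicy(MPOL_PREFERRED, &nodemask, sizeof(nodemask) * 8);
+|
+| 	CPU_ZERO(&cpus);
+| 	CPU_SET(12, &cpus);			/* a node 1 CPU on this box */
+| 	sched_setaffinity(0, sizeof(cpus), &cpus);
+| }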
+
+Fixes: 077fcf116c8c ("mm/thp: allocate transparent hugepages on local node")
+Signed-off-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: Aneesh Kumar K.V <aneesh.kumar@linux.vnet.ibm.com>
+Acked-by: David Rientjes <rientjes@google.com>
+Cc: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Michal Hocko <mhocko@suse.cz>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/mempolicy.c | 38 ++++++++++++++++++++++----------------
+ 1 file changed, 22 insertions(+), 16 deletions(-)
+
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -1971,35 +1971,41 @@ retry_cpuset:
+ pol = get_vma_policy(vma, addr);
+ cpuset_mems_cookie = read_mems_allowed_begin();
+
+- if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage &&
+- pol->mode != MPOL_INTERLEAVE)) {
++ if (pol->mode == MPOL_INTERLEAVE) {
++ unsigned nid;
++
++ nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
++ mpol_cond_put(pol);
++ page = alloc_page_interleave(gfp, order, nid);
++ goto out;
++ }
++
++ if (unlikely(IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && hugepage)) {
++ int hpage_node = node;
++
+ /*
+ * For hugepage allocation and non-interleave policy which
+- * allows the current node, we only try to allocate from the
+- * current node and don't fall back to other nodes, as the
+- * cost of remote accesses would likely offset THP benefits.
++ * allows the current node (or other explicitly preferred
++ * node) we only try to allocate from the current/preferred
++ * node and don't fall back to other nodes, as the cost of
++ * remote accesses would likely offset THP benefits.
+ *
+ * If the policy is interleave, or does not allow the current
+ * node in its nodemask, we allocate the standard way.
+ */
++ if (pol->mode == MPOL_PREFERRED &&
++ !(pol->flags & MPOL_F_LOCAL))
++ hpage_node = pol->v.preferred_node;
++
+ nmask = policy_nodemask(gfp, pol);
+- if (!nmask || node_isset(node, *nmask)) {
++ if (!nmask || node_isset(hpage_node, *nmask)) {
+ mpol_cond_put(pol);
+- page = alloc_pages_exact_node(node,
++ page = alloc_pages_exact_node(hpage_node,
+ gfp | __GFP_THISNODE, order);
+ goto out;
+ }
+ }
+
+- if (pol->mode == MPOL_INTERLEAVE) {
+- unsigned nid;
+-
+- nid = interleave_nid(pol, vma, addr, PAGE_SHIFT + order);
+- mpol_cond_put(pol);
+- page = alloc_page_interleave(gfp, order, nid);
+- goto out;
+- }
+-
+ nmask = policy_nodemask(gfp, pol);
+ zl = policy_zonelist(gfp, pol, node);
+ mpol_cond_put(pol);
acpi-pm-add-missing-pm_generic_complete-invocation.patch
iio-accel-kxcjk-1013-add-the-kxcj9000-acpi-id.patch
tools-selftests-fix-clean-target-with-make-3.81.patch
+arc-add-smp-barriers-around-atomics-per-documentation-atomic_ops.txt.patch
+arc-add-compiler-barrier-to-llsc-based-cmpxchg.patch
+mei-me-wait-for-power-gating-exit-confirmation.patch
+mei-txe-reduce-suspend-resume-time.patch
+arm64-do-not-attempt-to-use-init_mm-in-reset_context.patch
+arm64-entry-fix-context-tracking-for-el0_sp_pc.patch
+arm64-mm-fix-freeing-of-the-wrong-memmap-entries-with-sparsemem_vmemmap.patch
+arm64-vdso-work-around-broken-elf-toolchains-in-makefile.patch
+mm-kmemleak-allow-safe-memory-scanning-during-kmemleak-disabling.patch
+mm-kmemleak_alloc_percpu-should-follow-the-gfp-from-per_alloc.patch
+mm-thp-respect-mpol_preferred-policy-with-non-local-node.patch