--- /dev/null
+From 6815bf3f233e0b10c99a758497d5d236063b010b Mon Sep 17 00:00:00 2001
+From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Date: Wed, 18 Dec 2013 17:08:52 -0800
+Subject: mm/compaction: respect ignore_skip_hint in update_pageblock_skip
+
+From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+
+commit 6815bf3f233e0b10c99a758497d5d236063b010b upstream.
+
+update_pageblock_skip() only fits compaction which tries to isolate
+by pageblock unit. If isolate_migratepages_range() is called by CMA, it
+tries to isolate regardless of pageblock unit and it doesn't reference
+get_pageblock_skip() because of ignore_skip_hint. We should also respect it
+in update_pageblock_skip() to prevent setting the wrong information.
+
+Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Reviewed-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Rafael Aquini <aquini@redhat.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/compaction.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/mm/compaction.c
++++ b/mm/compaction.c
+@@ -134,6 +134,10 @@ static void update_pageblock_skip(struct
+ bool migrate_scanner)
+ {
+ struct zone *zone = cc->zone;
++
++ if (cc->ignore_skip_hint)
++ return;
++
+ if (!page)
+ return;
+
--- /dev/null
+From 20841405940e7be0617612d521e206e4b6b325db Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@redhat.com>
+Date: Wed, 18 Dec 2013 17:08:44 -0800
+Subject: mm: fix TLB flush race between migration, and change_protection_range
+
+From: Rik van Riel <riel@redhat.com>
+
+commit 20841405940e7be0617612d521e206e4b6b325db upstream.
+
+There are a few subtle races, between change_protection_range (used by
+mprotect and change_prot_numa) on one side, and NUMA page migration and
+compaction on the other side.
+
+The basic race is that there is a time window between when the PTE gets
+made non-present (PROT_NONE or NUMA), and the TLB is flushed.
+
+During that time, a CPU may continue writing to the page.
+
+This is fine most of the time, however compaction or the NUMA migration
+code may come in, and migrate the page away.
+
+When that happens, the CPU may continue writing, through the cached
+translation, to what is no longer the current memory location of the
+process.
+
+This only affects x86, which has a somewhat optimistic pte_accessible.
+All other architectures appear to be safe, and will either always flush,
+or flush whenever there is a valid mapping, even with no permissions
+(SPARC).
+
+The basic race looks like this:
+
+CPU A CPU B CPU C
+
+ load TLB entry
+make entry PTE/PMD_NUMA
+ fault on entry
+ read/write old page
+ start migrating page
+ change PTE/PMD to new page
+ read/write old page [*]
+flush TLB
+ reload TLB from new entry
+ read/write new page
+ lose data
+
+[*] the old page may belong to a new user at this point!
+
+The obvious fix is to flush remote TLB entries, by making sure that
+pte_accessible is aware of the fact that PROT_NONE and PROT_NUMA memory may
+still be accessible if there is a TLB flush pending for the mm.
+
+This should fix both NUMA migration and compaction.
+
+[mgorman@suse.de: fix build]
+Signed-off-by: Rik van Riel <riel@redhat.com>
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Cc: Alex Thorlton <athorlton@sgi.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/sparc/include/asm/pgtable_64.h | 4 +--
+ arch/x86/include/asm/pgtable.h | 11 +++++++--
+ include/asm-generic/pgtable.h | 2 -
+ include/linux/mm_types.h | 44 ++++++++++++++++++++++++++++++++++++
+ kernel/fork.c | 1
+ mm/huge_memory.c | 7 +++++
+ mm/mprotect.c | 2 +
+ mm/pgtable-generic.c | 5 ++--
+ 8 files changed, 69 insertions(+), 7 deletions(-)
+
+--- a/arch/sparc/include/asm/pgtable_64.h
++++ b/arch/sparc/include/asm/pgtable_64.h
+@@ -616,7 +616,7 @@ static inline unsigned long pte_present(
+ }
+
+ #define pte_accessible pte_accessible
+-static inline unsigned long pte_accessible(pte_t a)
++static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a)
+ {
+ return pte_val(a) & _PAGE_VALID;
+ }
+@@ -806,7 +806,7 @@ static inline void __set_pte_at(struct m
+ * SUN4V NOTE: _PAGE_VALID is the same value in both the SUN4U
+ * and SUN4V pte layout, so this inline test is fine.
+ */
+- if (likely(mm != &init_mm) && pte_accessible(orig))
++ if (likely(mm != &init_mm) && pte_accessible(mm, orig))
+ tlb_batch_add(mm, addr, ptep, orig, fullmm);
+ }
+
+--- a/arch/x86/include/asm/pgtable.h
++++ b/arch/x86/include/asm/pgtable.h
+@@ -415,9 +415,16 @@ static inline int pte_present(pte_t a)
+ }
+
+ #define pte_accessible pte_accessible
+-static inline int pte_accessible(pte_t a)
++static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
+ {
+- return pte_flags(a) & _PAGE_PRESENT;
++ if (pte_flags(a) & _PAGE_PRESENT)
++ return true;
++
++ if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) &&
++ mm_tlb_flush_pending(mm))
++ return true;
++
++ return false;
+ }
+
+ static inline int pte_hidden(pte_t pte)
+--- a/include/asm-generic/pgtable.h
++++ b/include/asm-generic/pgtable.h
+@@ -220,7 +220,7 @@ static inline int pmd_same(pmd_t pmd_a,
+ #endif
+
+ #ifndef pte_accessible
+-# define pte_accessible(pte) ((void)(pte),1)
++# define pte_accessible(mm, pte) ((void)(pte), 1)
+ #endif
+
+ #ifndef flush_tlb_fix_spurious_fault
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -437,6 +437,14 @@ struct mm_struct {
+ */
+ int first_nid;
+ #endif
++#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
++ /*
++ * An operation with batched TLB flushing is going on. Anything that
++ * can move process memory needs to flush the TLB when moving a
++ * PROT_NONE or PROT_NUMA mapped page.
++ */
++ bool tlb_flush_pending;
++#endif
+ struct uprobes_state uprobes_state;
+ };
+
+@@ -457,4 +465,40 @@ static inline cpumask_t *mm_cpumask(stru
+ return mm->cpu_vm_mask_var;
+ }
+
++#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
++/*
++ * Memory barriers to keep this state in sync are graciously provided by
++ * the page table locks, outside of which no page table modifications happen.
++ * The barriers below prevent the compiler from re-ordering the instructions
++ * around the memory barriers that are already present in the code.
++ */
++static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
++{
++ barrier();
++ return mm->tlb_flush_pending;
++}
++static inline void set_tlb_flush_pending(struct mm_struct *mm)
++{
++ mm->tlb_flush_pending = true;
++ barrier();
++}
++/* Clearing is done after a TLB flush, which also provides a barrier. */
++static inline void clear_tlb_flush_pending(struct mm_struct *mm)
++{
++ barrier();
++ mm->tlb_flush_pending = false;
++}
++#else
++static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
++{
++ return false;
++}
++static inline void set_tlb_flush_pending(struct mm_struct *mm)
++{
++}
++static inline void clear_tlb_flush_pending(struct mm_struct *mm)
++{
++}
++#endif
++
+ #endif /* _LINUX_MM_TYPES_H */
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -544,6 +544,7 @@ static struct mm_struct *mm_init(struct
+ mm->cached_hole_size = ~0UL;
+ mm_init_aio(mm);
+ mm_init_owner(mm, p);
++ clear_tlb_flush_pending(mm);
+
+ if (likely(!mm_alloc_pgd(mm))) {
+ mm->def_flags = 0;
+--- a/mm/huge_memory.c
++++ b/mm/huge_memory.c
+@@ -1352,6 +1352,13 @@ int do_huge_pmd_numa_page(struct mm_stru
+ }
+
+ /*
++ * The page_table_lock above provides a memory barrier
++ * with change_protection_range.
++ */
++ if (mm_tlb_flush_pending(mm))
++ flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
++
++ /*
+ * Migrate the THP to the requested node, returns with page unlocked
+ * and pmd_numa cleared.
+ */
+--- a/mm/mprotect.c
++++ b/mm/mprotect.c
+@@ -206,6 +206,7 @@ static unsigned long change_protection_r
+ BUG_ON(addr >= end);
+ pgd = pgd_offset(mm, addr);
+ flush_cache_range(vma, addr, end);
++ set_tlb_flush_pending(mm);
+ do {
+ next = pgd_addr_end(addr, end);
+ if (pgd_none_or_clear_bad(pgd))
+@@ -217,6 +218,7 @@ static unsigned long change_protection_r
+ /* Only flush the TLB if we actually modified any entries: */
+ if (pages)
+ flush_tlb_range(vma, start, end);
++ clear_tlb_flush_pending(mm);
+
+ return pages;
+ }
+--- a/mm/pgtable-generic.c
++++ b/mm/pgtable-generic.c
+@@ -86,9 +86,10 @@ int pmdp_clear_flush_young(struct vm_are
+ pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
+ pte_t *ptep)
+ {
++ struct mm_struct *mm = (vma)->vm_mm;
+ pte_t pte;
+- pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
+- if (pte_accessible(pte))
++ pte = ptep_get_and_clear(mm, address, ptep);
++ if (pte_accessible(mm, pte))
+ flush_tlb_page(vma, address);
+ return pte;
+ }
--- /dev/null
+From 4eb919825e6c3c7fb3630d5621f6d11e98a18b3a Mon Sep 17 00:00:00 2001
+From: Rik van Riel <riel@redhat.com>
+Date: Thu, 2 Jan 2014 12:58:46 -0800
+Subject: mm: fix use-after-free in sys_remap_file_pages
+
+From: Rik van Riel <riel@redhat.com>
+
+commit 4eb919825e6c3c7fb3630d5621f6d11e98a18b3a upstream.
+
+remap_file_pages calls mmap_region, which may merge the VMA with other
+existing VMAs, and free "vma". This can lead to a use-after-free bug.
+Avoid the bug by remembering vm_flags before calling mmap_region, and
+not trying to dereference vma later.
+
+Signed-off-by: Rik van Riel <riel@redhat.com>
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Cc: PaX Team <pageexec@freemail.hu>
+Cc: Kees Cook <keescook@chromium.org>
+Cc: Michel Lespinasse <walken@google.com>
+Cc: Cyrill Gorcunov <gorcunov@openvz.org>
+Cc: Hugh Dickins <hughd@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/fremap.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/mm/fremap.c
++++ b/mm/fremap.c
+@@ -203,9 +203,10 @@ get_write_lock:
+ if (mapping_cap_account_dirty(mapping)) {
+ unsigned long addr;
+ struct file *file = get_file(vma->vm_file);
++ /* mmap_region may free vma; grab the info now */
++ vm_flags = vma->vm_flags;
+
+- addr = mmap_region(file, start, size,
+- vma->vm_flags, pgoff);
++ addr = mmap_region(file, start, size, vm_flags, pgoff);
+ fput(file);
+ if (IS_ERR_VALUE(addr)) {
+ err = addr;
+@@ -213,7 +214,7 @@ get_write_lock:
+ BUG_ON(addr != start);
+ err = 0;
+ }
+- goto out;
++ goto out_freed;
+ }
+ mutex_lock(&mapping->i_mmap_mutex);
+ flush_dcache_mmap_lock(mapping);
+@@ -248,6 +249,7 @@ get_write_lock:
+ out:
+ if (vma)
+ vm_flags = vma->vm_flags;
++out_freed:
+ if (likely(!has_write_lock))
+ up_read(&mm->mmap_sem);
+ else
--- /dev/null
+From 98398c32f6687ee1e1f3ae084effb4b75adb0747 Mon Sep 17 00:00:00 2001
+From: Jianguo Wu <wujianguo@huawei.com>
+Date: Wed, 18 Dec 2013 17:08:59 -0800
+Subject: mm/hugetlb: check for pte NULL pointer in __page_check_address()
+
+From: Jianguo Wu <wujianguo@huawei.com>
+
+commit 98398c32f6687ee1e1f3ae084effb4b75adb0747 upstream.
+
+In __page_check_address(), if address's pud is not present,
+huge_pte_offset() will return NULL, so we should check the return value.
+
+Signed-off-by: Jianguo Wu <wujianguo@huawei.com>
+Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: qiuxishi <qiuxishi@huawei.com>
+Cc: Hanjun Guo <guohanjun@huawei.com>
+Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/rmap.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/mm/rmap.c
++++ b/mm/rmap.c
+@@ -600,7 +600,11 @@ pte_t *__page_check_address(struct page
+ spinlock_t *ptl;
+
+ if (unlikely(PageHuge(page))) {
++ /* when pud is not present, pte will be NULL */
+ pte = huge_pte_offset(mm, address);
++ if (!pte)
++ return NULL;
++
+ ptl = &mm->page_table_lock;
+ goto check;
+ }
--- /dev/null
+From a3e0f9e47d5ef7858a26cc12d90ad5146e802d47 Mon Sep 17 00:00:00 2001
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Date: Thu, 2 Jan 2014 12:58:51 -0800
+Subject: mm/memory-failure.c: transfer page count from head page to tail page after split thp
+
+From: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+
+commit a3e0f9e47d5ef7858a26cc12d90ad5146e802d47 upstream.
+
+Memory failures on thp tail pages cause kernel panic like below:
+
+ mce: [Hardware Error]: Machine check events logged
+ MCE exception done on CPU 7
+ BUG: unable to handle kernel NULL pointer dereference at 0000000000000058
+ IP: [<ffffffff811b7cd1>] dequeue_hwpoisoned_huge_page+0x131/0x1e0
+ PGD bae42067 PUD ba47d067 PMD 0
+ Oops: 0000 [#1] SMP
+ ...
+ CPU: 7 PID: 128 Comm: kworker/7:2 Tainted: G M O 3.13.0-rc4-131217-1558-00003-g83b7df08e462 #25
+ ...
+ Call Trace:
+ me_huge_page+0x3e/0x50
+ memory_failure+0x4bb/0xc20
+ mce_process_work+0x3e/0x70
+ process_one_work+0x171/0x420
+ worker_thread+0x11b/0x3a0
+ ? manage_workers.isra.25+0x2b0/0x2b0
+ kthread+0xe4/0x100
+ ? kthread_create_on_node+0x190/0x190
+ ret_from_fork+0x7c/0xb0
+ ? kthread_create_on_node+0x190/0x190
+ ...
+ RIP dequeue_hwpoisoned_huge_page+0x131/0x1e0
+ CR2: 0000000000000058
+
+The reason for this problem is as follows:
+ - when we have a memory error on a thp tail page, the memory error
+ handler grabs a refcount of the head page to keep the thp under us.
+ - Before unmapping the error page from processes, we split the thp,
+ where page refcounts of both of head/tail pages don't change.
+ - Then we call try_to_unmap() over the error page (which was a tail
+ page before). We didn't pin the error page to handle the memory error,
+ this error page is freed and removed from LRU list.
+ - We never have the error page on LRU list, so the first page state
+ check returns "unknown page," then we move to the second check
+ with the saved page flag.
+ - The saved page flag has PG_tail set, so the second page state check
+ returns "hugepage."
+ - We call me_huge_page() for freed error page, then we hit the above panic.
+
+The root cause is that we didn't move refcount from the head page to the
+tail page after split thp. So this patch suggests to do this.
+
+This panic was introduced by commit 524fca1e73 ("HWPOISON: fix
+misjudgement of page_action() for errors on mlocked pages"). Note that we
+did have the same refcount problem before this commit, but it was just
+ignored because we had only first page state check which returned "unknown
+page." The commit changed the refcount problem from "doesn't work" to
+"kernel panic."
+
+Signed-off-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Reviewed-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
+Cc: Andi Kleen <andi@firstfloor.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/memory-failure.c | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -936,6 +936,16 @@ static int hwpoison_user_mappings(struct
+ BUG_ON(!PageHWPoison(p));
+ return SWAP_FAIL;
+ }
++ /*
++ * We pinned the head page for hwpoison handling,
++ * now we split the thp and we are interested in
++ * the hwpoisoned raw page, so move the refcount
++ * to it.
++ */
++ if (hpage != p) {
++ put_page(hpage);
++ get_page(p);
++ }
+ /* THP is split, so ppage should be the real poisoned page. */
+ ppage = p;
+ }
--- /dev/null
+From b0e5fd7359f1ce8db4ccb862b3aa80d2f2cbf4d0 Mon Sep 17 00:00:00 2001
+From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Date: Wed, 18 Dec 2013 17:08:51 -0800
+Subject: mm/mempolicy: correct putback method for isolate pages if failed
+
+From: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+
+commit b0e5fd7359f1ce8db4ccb862b3aa80d2f2cbf4d0 upstream.
+
+queue_pages_range() isolates hugetlbfs pages and putback_lru_pages()
+can't handle these. We should change it to putback_movable_pages().
+
+Naoya said that it is worth going into stable, because it can break
+in-use hugepage list.
+
+Signed-off-by: Joonsoo Kim <iamjoonsoo.kim@lge.com>
+Acked-by: Rafael Aquini <aquini@redhat.com>
+Reviewed-by: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
+Reviewed-by: Wanpeng Li <liwanp@linux.vnet.ibm.com>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Wanpeng Li <liwanp@linux.vnet.ibm.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Rik van Riel <riel@redhat.com>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Cc: Zhang Yanfei <zhangyanfei@cn.fujitsu.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ mm/mempolicy.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/mempolicy.c
++++ b/mm/mempolicy.c
+@@ -1271,7 +1271,7 @@ static long do_mbind(unsigned long start
+ if (nr_failed && (flags & MPOL_MF_STRICT))
+ err = -EIO;
+ } else
+- putback_lru_pages(&pagelist);
++ putback_movable_pages(&pagelist);
+
+ up_write(&mm->mmap_sem);
+ mpol_out:
--- /dev/null
+From af2c1401e6f9177483be4fad876d0073669df9df Mon Sep 17 00:00:00 2001
+From: Mel Gorman <mgorman@suse.de>
+Date: Wed, 18 Dec 2013 17:08:45 -0800
+Subject: mm: numa: guarantee that tlb_flush_pending updates are visible before page table updates
+
+From: Mel Gorman <mgorman@suse.de>
+
+commit af2c1401e6f9177483be4fad876d0073669df9df upstream.
+
+According to documentation on barriers, stores issued before a LOCK can
+complete after the lock implying that it's possible tlb_flush_pending
+can be visible after a page table update. As per revised documentation,
+this patch adds a smp_mb__before_spinlock to guarantee the correct
+ordering.
+
+Signed-off-by: Mel Gorman <mgorman@suse.de>
+Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
+Reviewed-by: Rik van Riel <riel@redhat.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/mm_types.h | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/include/linux/mm_types.h
++++ b/include/linux/mm_types.h
+@@ -480,7 +480,12 @@ static inline bool mm_tlb_flush_pending(
+ static inline void set_tlb_flush_pending(struct mm_struct *mm)
+ {
+ mm->tlb_flush_pending = true;
+- barrier();
++
++ /*
++ * Guarantee that the tlb_flush_pending store does not leak into the
++ * critical section updating the page tables
++ */
++ smp_mb__before_spinlock();
+ }
+ /* Clearing is done after a TLB flush, which also provides a barrier. */
+ static inline void clear_tlb_flush_pending(struct mm_struct *mm)
--- /dev/null
+From e0acd0a68ec7dbf6b7a81a87a867ebd7ac9b76c4 Mon Sep 17 00:00:00 2001
+From: Oleg Nesterov <oleg@redhat.com>
+Date: Mon, 12 Aug 2013 18:14:00 +0200
+Subject: sched: fix the theoretical signal_wake_up() vs schedule() race
+
+From: Oleg Nesterov <oleg@redhat.com>
+
+commit e0acd0a68ec7dbf6b7a81a87a867ebd7ac9b76c4 upstream.
+
+This is only theoretical, but after try_to_wake_up(p) was changed
+to check p->state under p->pi_lock the code like
+
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+
+can miss a signal. This is the special case of wait-for-condition,
+it relies on try_to_wake_up/schedule interaction and thus it does
+not need mb() between __set_current_state() and if(signal_pending).
+
+However, this __set_current_state() can move into the critical
+section protected by rq->lock, now that try_to_wake_up() takes
+another lock we need to ensure that it can't be reordered with
+"if (signal_pending(current))" check inside that section.
+
+The patch is actually one-liner, it simply adds smp_wmb() before
+spin_lock_irq(rq->lock). This is what try_to_wake_up() already
+does by the same reason.
+
+We turn this wmb() into the new helper, smp_mb__before_spinlock(),
+for better documentation and to allow the architectures to change
+the default implementation.
+
+While at it, kill smp_mb__after_lock(), it has no callers.
+
+Perhaps we can also add smp_mb__before/after_spinunlock() for
+prepare_to_wait().
+
+Signed-off-by: Oleg Nesterov <oleg@redhat.com>
+Acked-by: Peter Zijlstra <peterz@infradead.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ arch/x86/include/asm/spinlock.h | 4 ----
+ include/linux/spinlock.h | 14 +++++++++++---
+ kernel/sched/core.c | 14 +++++++++++++-
+ 3 files changed, 24 insertions(+), 8 deletions(-)
+
+--- a/arch/x86/include/asm/spinlock.h
++++ b/arch/x86/include/asm/spinlock.h
+@@ -233,8 +233,4 @@ static inline void arch_write_unlock(arc
+ #define arch_read_relax(lock) cpu_relax()
+ #define arch_write_relax(lock) cpu_relax()
+
+-/* The {read|write|spin}_lock() on x86 are full memory barriers. */
+-static inline void smp_mb__after_lock(void) { }
+-#define ARCH_HAS_SMP_MB_AFTER_LOCK
+-
+ #endif /* _ASM_X86_SPINLOCK_H */
+--- a/include/linux/spinlock.h
++++ b/include/linux/spinlock.h
+@@ -117,9 +117,17 @@ do { \
+ #endif /*arch_spin_is_contended*/
+ #endif
+
+-/* The lock does not imply full memory barrier. */
+-#ifndef ARCH_HAS_SMP_MB_AFTER_LOCK
+-static inline void smp_mb__after_lock(void) { smp_mb(); }
++/*
++ * Despite its name it doesn't necessarily has to be a full barrier.
++ * It should only guarantee that a STORE before the critical section
++ * can not be reordered with a LOAD inside this section.
++ * spin_lock() is the one-way barrier, this LOAD can not escape out
++ * of the region. So the default implementation simply ensures that
++ * a STORE can not move into the critical section, smp_wmb() should
++ * serialize it with another STORE done by spin_lock().
++ */
++#ifndef smp_mb__before_spinlock
++#define smp_mb__before_spinlock() smp_wmb()
+ #endif
+
+ /**
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -1487,7 +1487,13 @@ try_to_wake_up(struct task_struct *p, un
+ unsigned long flags;
+ int cpu, success = 0;
+
+- smp_wmb();
++ /*
++ * If we are going to wake up a thread waiting for CONDITION we
++ * need to ensure that CONDITION=1 done by the caller can not be
++ * reordered with p->state check below. This pairs with mb() in
++ * set_current_state() the waiting thread does.
++ */
++ smp_mb__before_spinlock();
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ if (!(p->state & state))
+ goto out;
+@@ -2966,6 +2972,12 @@ need_resched:
+ if (sched_feat(HRTICK))
+ hrtick_clear(rq);
+
++ /*
++ * Make sure that signal_pending_state()->signal_pending() below
++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
++ * done by the caller to avoid the race with signal_wake_up().
++ */
++ smp_mb__before_spinlock();
+ raw_spin_lock_irq(&rq->lock);
+
+ switch_count = &prev->nivcsw;
mm-clear-pmd_numa-before-invalidating.patch
mm-numa-ensure-anon_vma-is-locked-to-prevent-parallel-thp-splits.patch
mm-numa-avoid-unnecessary-work-on-the-failure-path.patch
+sched-fix-the-theoretical-signal_wake_up-vs-schedule-race.patch
+mm-fix-tlb-flush-race-between-migration-and-change_protection_range.patch
+mm-numa-guarantee-that-tlb_flush_pending-updates-are-visible-before-page-table-updates.patch
+mm-mempolicy-correct-putback-method-for-isolate-pages-if-failed.patch
+mm-compaction-respect-ignore_skip_hint-in-update_pageblock_skip.patch
+mm-hugetlb-check-for-pte-null-pointer-in-__page_check_address.patch
+mm-fix-use-after-free-in-sys_remap_file_pages.patch
+mm-memory-failure.c-transfer-page-count-from-head-page-to-tail-page-after-split-thp.patch