+++ /dev/null
-From 20841405940e7be0617612d521e206e4b6b325db Mon Sep 17 00:00:00 2001
-From: Rik van Riel <riel@redhat.com>
-Date: Wed, 18 Dec 2013 17:08:44 -0800
-Subject: mm: fix TLB flush race between migration, and change_protection_range
-
-From: Rik van Riel <riel@redhat.com>
-
-commit 20841405940e7be0617612d521e206e4b6b325db upstream.
-
-There are a few subtle races between change_protection_range (used by
-mprotect and change_prot_numa) on one side, and NUMA page migration and
-compaction on the other side.
-
-The basic race is that there is a time window between when the PTE gets
-made non-present (PROT_NONE or NUMA) and when the TLB is flushed.
-
-During that time, a CPU may continue writing to the page.
-
-This is fine most of the time; however, compaction or the NUMA migration
-code may come in and migrate the page away.
-
-When that happens, the CPU may continue writing, through the cached
-translation, to what is no longer the current memory location of the
-process.
-
-This only affects x86, which has a somewhat optimistic pte_accessible.
-All other architectures appear to be safe, and will either always flush,
-or flush whenever there is a valid mapping, even with no permissions
-(SPARC).
-
-The basic race looks like this:
-
-CPU A                   CPU B                   CPU C
-
-                                                load TLB entry
-make entry PTE/PMD_NUMA
-                        fault on entry
-                                                read/write old page
-                        start migrating page
-                        change PTE/PMD to new page
-                                                read/write old page [*]
-flush TLB
-                                                reload TLB from new entry
-                                                read/write new page
-                                                lose data
-
-[*] the old page may belong to a new user at this point!
-
-The obvious fix is to flush remote TLB entries, by making sure that
-pte_accessible is aware of the fact that PROT_NONE and PROT_NUMA memory
-may still be accessible if there is a TLB flush pending for the mm.
-
-This should fix both NUMA migration and compaction.
-
-[mgorman@suse.de: fix build]
-Signed-off-by: Rik van Riel <riel@redhat.com>
-Signed-off-by: Mel Gorman <mgorman@suse.de>
-Cc: Alex Thorlton <athorlton@sgi.com>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- arch/sparc/include/asm/pgtable_64.h | 4 +--
- arch/x86/include/asm/pgtable.h | 11 +++++++--
- include/asm-generic/pgtable.h | 2 -
- include/linux/mm_types.h | 44 ++++++++++++++++++++++++++++++++++++
- kernel/fork.c | 1
- mm/huge_memory.c | 7 +++++
- mm/mprotect.c | 2 +
- mm/pgtable-generic.c | 5 ++--
- 8 files changed, 69 insertions(+), 7 deletions(-)
-
---- a/arch/sparc/include/asm/pgtable_64.h
-+++ b/arch/sparc/include/asm/pgtable_64.h
-@@ -616,7 +616,7 @@ static inline unsigned long pte_present(
- }
-
- #define pte_accessible pte_accessible
--static inline unsigned long pte_accessible(pte_t a)
-+static inline unsigned long pte_accessible(struct mm_struct *mm, pte_t a)
- {
- return pte_val(a) & _PAGE_VALID;
- }
-@@ -806,7 +806,7 @@ static inline void __set_pte_at(struct m
- * SUN4V NOTE: _PAGE_VALID is the same value in both the SUN4U
- * and SUN4V pte layout, so this inline test is fine.
- */
-- if (likely(mm != &init_mm) && pte_accessible(orig))
-+ if (likely(mm != &init_mm) && pte_accessible(mm, orig))
- tlb_batch_add(mm, addr, ptep, orig, fullmm);
- }
-
---- a/arch/x86/include/asm/pgtable.h
-+++ b/arch/x86/include/asm/pgtable.h
-@@ -452,9 +452,16 @@ static inline int pte_present(pte_t a)
- }
-
- #define pte_accessible pte_accessible
--static inline int pte_accessible(pte_t a)
-+static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
- {
-- return pte_flags(a) & _PAGE_PRESENT;
-+ if (pte_flags(a) & _PAGE_PRESENT)
-+ return true;
-+
-+ if ((pte_flags(a) & (_PAGE_PROTNONE | _PAGE_NUMA)) &&
-+ mm_tlb_flush_pending(mm))
-+ return true;
-+
-+ return false;
- }
-
- static inline int pte_hidden(pte_t pte)
---- a/include/asm-generic/pgtable.h
-+++ b/include/asm-generic/pgtable.h
-@@ -217,7 +217,7 @@ static inline int pmd_same(pmd_t pmd_a,
- #endif
-
- #ifndef pte_accessible
--# define pte_accessible(pte) ((void)(pte),1)
-+# define pte_accessible(mm, pte) ((void)(pte), 1)
- #endif
-
- #ifndef flush_tlb_fix_spurious_fault
---- a/include/linux/mm_types.h
-+++ b/include/linux/mm_types.h
-@@ -435,6 +435,14 @@ struct mm_struct {
- */
- int first_nid;
- #endif
-+#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
-+ /*
-+ * An operation with batched TLB flushing is going on. Anything that
-+ * can move process memory needs to flush the TLB when moving a
-+ * PROT_NONE or PROT_NUMA mapped page.
-+ */
-+ bool tlb_flush_pending;
-+#endif
- struct uprobes_state uprobes_state;
- };
-
-@@ -455,4 +463,40 @@ static inline cpumask_t *mm_cpumask(stru
- return mm->cpu_vm_mask_var;
- }
-
-+#if defined(CONFIG_NUMA_BALANCING) || defined(CONFIG_COMPACTION)
-+/*
-+ * Memory barriers to keep this state in sync are graciously provided by
-+ * the page table locks, outside of which no page table modifications happen.
-+ * The barriers below prevent the compiler from re-ordering the instructions
-+ * around the memory barriers that are already present in the code.
-+ */
-+static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
-+{
-+ barrier();
-+ return mm->tlb_flush_pending;
-+}
-+static inline void set_tlb_flush_pending(struct mm_struct *mm)
-+{
-+ mm->tlb_flush_pending = true;
-+ barrier();
-+}
-+/* Clearing is done after a TLB flush, which also provides a barrier. */
-+static inline void clear_tlb_flush_pending(struct mm_struct *mm)
-+{
-+ barrier();
-+ mm->tlb_flush_pending = false;
-+}
-+#else
-+static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
-+{
-+ return false;
-+}
-+static inline void set_tlb_flush_pending(struct mm_struct *mm)
-+{
-+}
-+static inline void clear_tlb_flush_pending(struct mm_struct *mm)
-+{
-+}
-+#endif
-+
- #endif /* _LINUX_MM_TYPES_H */
---- a/kernel/fork.c
-+++ b/kernel/fork.c
-@@ -537,6 +537,7 @@ static struct mm_struct *mm_init(struct
- spin_lock_init(&mm->page_table_lock);
- mm_init_aio(mm);
- mm_init_owner(mm, p);
-+ clear_tlb_flush_pending(mm);
-
- if (likely(!mm_alloc_pgd(mm))) {
- mm->def_flags = 0;
---- a/mm/huge_memory.c
-+++ b/mm/huge_memory.c
-@@ -1342,6 +1342,13 @@ int do_huge_pmd_numa_page(struct mm_stru
- }
-
- /*
-+ * The page_table_lock above provides a memory barrier
-+ * with change_protection_range.
-+ */
-+ if (mm_tlb_flush_pending(mm))
-+ flush_tlb_range(vma, haddr, haddr + HPAGE_PMD_SIZE);
-+
-+ /*
- * Migrate the THP to the requested node, returns with page unlocked
- * and pmd_numa cleared.
- */
---- a/mm/mprotect.c
-+++ b/mm/mprotect.c
-@@ -209,6 +209,7 @@ static unsigned long change_protection_r
- BUG_ON(addr >= end);
- pgd = pgd_offset(mm, addr);
- flush_cache_range(vma, addr, end);
-+ set_tlb_flush_pending(mm);
- do {
- next = pgd_addr_end(addr, end);
- if (pgd_none_or_clear_bad(pgd))
-@@ -220,6 +221,7 @@ static unsigned long change_protection_r
- /* Only flush the TLB if we actually modified any entries: */
- if (pages)
- flush_tlb_range(vma, start, end);
-+ clear_tlb_flush_pending(mm);
-
- return pages;
- }
---- a/mm/pgtable-generic.c
-+++ b/mm/pgtable-generic.c
-@@ -110,9 +110,10 @@ int pmdp_clear_flush_young(struct vm_are
- pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
- pte_t *ptep)
- {
-+ struct mm_struct *mm = (vma)->vm_mm;
- pte_t pte;
-- pte = ptep_get_and_clear((vma)->vm_mm, address, ptep);
-- if (pte_accessible(pte))
-+ pte = ptep_get_and_clear(mm, address, ptep);
-+ if (pte_accessible(mm, pte))
- flush_tlb_page(vma, address);
- return pte;
- }
+++ /dev/null
-From af2c1401e6f9177483be4fad876d0073669df9df Mon Sep 17 00:00:00 2001
-From: Mel Gorman <mgorman@suse.de>
-Date: Wed, 18 Dec 2013 17:08:45 -0800
-Subject: mm: numa: guarantee that tlb_flush_pending updates are visible before page table updates
-
-From: Mel Gorman <mgorman@suse.de>
-
-commit af2c1401e6f9177483be4fad876d0073669df9df upstream.
-
-According to the documentation on barriers, stores issued before a LOCK
-can complete after the lock, implying that it is possible for
-tlb_flush_pending to be visible after a page table update. As per the
-revised documentation, this patch adds an smp_mb__before_spinlock to
-guarantee the correct ordering.
-
-Signed-off-by: Mel Gorman <mgorman@suse.de>
-Acked-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
-Reviewed-by: Rik van Riel <riel@redhat.com>
-Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
-Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
-
----
- include/linux/mm_types.h | 7 ++++++-
- 1 file changed, 6 insertions(+), 1 deletion(-)
-
---- a/include/linux/mm_types.h
-+++ b/include/linux/mm_types.h
-@@ -478,7 +478,12 @@ static inline bool mm_tlb_flush_pending(
- static inline void set_tlb_flush_pending(struct mm_struct *mm)
- {
- mm->tlb_flush_pending = true;
-- barrier();
-+
-+ /*
-+ * Guarantee that the tlb_flush_pending store does not leak into the
-+ * critical section updating the page tables
-+ */
-+ smp_mb__before_spinlock();
- }
- /* Clearing is done after a TLB flush, which also provides a barrier. */
- static inline void clear_tlb_flush_pending(struct mm_struct *mm)