git.ipfire.org Git - thirdparty/kernel/stable-queue.git/blob

   1 From 3ea277194daaeaa84ce75180ec7c7a2075027a68 Mon Sep 17 00:00:00 2001
   2 From: Mel Gorman <mgorman@suse.de>
   3 Date: Wed, 2 Aug 2017 13:31:52 -0700
   4 Subject: mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries
   5
   6 From: Mel Gorman <mgorman@suse.de>
   7
   8 commit 3ea277194daaeaa84ce75180ec7c7a2075027a68 upstream.
   9
  10 Nadav Amit identified a theoretical race between page reclaim and
  11 mprotect due to TLB flushes being batched outside of the PTL being held.
  12
  13 He described the race as follows:
  14
  15         CPU0                            CPU1
  16         ----                            ----
  17                                         user accesses memory using RW PTE
  18                                         [PTE now cached in TLB]
  19         try_to_unmap_one()
  20         ==> ptep_get_and_clear()
  21         ==> set_tlb_ubc_flush_pending()
  22                                         mprotect(addr, PROT_READ)
  23                                         ==> change_pte_range()
  24                                         ==> [ PTE non-present - no flush ]
  25
  26                                         user writes using cached RW PTE
  27         ...
  28
  29         try_to_unmap_flush()
  30
  31 The same type of race exists for reads when protecting for PROT_NONE and
  32 also exists for operations that can leave an old TLB entry behind such
  33 as munmap, mremap and madvise.
  34
  35 For some operations like mprotect, it's not necessarily a data integrity
  36 issue but it is a correctness issue as there is a window where an
  37 mprotect that limits access still allows access.  For munmap, it's
  38 potentially a data integrity issue although the race is massive as an
  39 munmap, mmap and return to userspace must all complete between the
  40 window when reclaim drops the PTL and flushes the TLB.  However, it's
  41 theoretically possible so handle this issue by flushing the mm if
  42 reclaim is potentially currently batching TLB flushes.
  43
  44 Other instances where a flush is required for a present pte should be ok
  45 as either the page lock is held preventing parallel reclaim or a page
  46 reference count is elevated preventing a parallel free leading to
  47 corruption.  In the case of page_mkclean there isn't an obvious path
  48 that userspace could take advantage of without using the operations that
  49 are guarded by this patch.  Other users such as gup should be ok as a
  50 race with reclaim looks just at PTEs.  Huge page variants should be ok
  51 as they don't race with reclaim.  mincore only looks at PTEs.  userfault also
  52 should be ok as if a parallel reclaim takes place, it will either fault
  53 the page back in or read some of the data before the flush occurs
  54 triggering a fault.
  55
  56 Note that a variant of this patch was acked by Andy Lutomirski but this
  57 was for the x86 parts on top of his PCID work which didn't make the 4.13
  58 merge window as expected.  His ack is dropped from this version and
  59 there will be a follow-on patch on top of PCID that will include his
  60 ack.
  61
  62 [akpm@linux-foundation.org: tweak comments]
  63 [akpm@linux-foundation.org: fix spello]
  64 Link: http://lkml.kernel.org/r/20170717155523.emckq2esjro6hf3z@suse.de
  65 Reported-by: Nadav Amit <nadav.amit@gmail.com>
  66 Signed-off-by: Mel Gorman <mgorman@suse.de>
  67 Cc: Andy Lutomirski <luto@kernel.org>
  68 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
  69 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
  70 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
  71
  72 ---
  73  include/linux/mm_types.h |    4 ++++
  74  mm/internal.h            |    5 ++++-
  75  mm/madvise.c             |    2 ++
  76  mm/memory.c              |    1 +
  77  mm/mprotect.c            |    1 +
  78  mm/mremap.c              |    1 +
  79  mm/rmap.c                |   36 ++++++++++++++++++++++++++++++++++++
  80  7 files changed, 49 insertions(+), 1 deletion(-)
  81
  82 --- a/include/linux/mm_types.h
  83 +++ b/include/linux/mm_types.h
  84 @@ -508,6 +508,10 @@ struct mm_struct {
  85          */
  86         bool tlb_flush_pending;
  87  #endif
  88 +#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
  89 +       /* See flush_tlb_batched_pending() */
  90 +       bool tlb_flush_batched;
  91 +#endif
  92         struct uprobes_state uprobes_state;
  93  #ifdef CONFIG_X86_INTEL_MPX
  94         /* address of the bounds directory */
  95 --- a/mm/internal.h
  96 +++ b/mm/internal.h
  97 @@ -472,6 +472,7 @@ struct tlbflush_unmap_batch;
  98  #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
  99  void try_to_unmap_flush(void);
 100  void try_to_unmap_flush_dirty(void);
 101 +void flush_tlb_batched_pending(struct mm_struct *mm);
 102  #else
 103  static inline void try_to_unmap_flush(void)
 104  {
 105 @@ -479,7 +480,9 @@ static inline void try_to_unmap_flush(vo
 106  static inline void try_to_unmap_flush_dirty(void)
 107  {
 108  }
 109 -
 110 +static inline void flush_tlb_batched_pending(struct mm_struct *mm)
 111 +{
 112 +}
 113  #endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
 114
 115  extern const struct trace_print_flags pageflag_names[];
 116 --- a/mm/madvise.c
 117 +++ b/mm/madvise.c
 118 @@ -21,6 +21,7 @@
 119  #include <linux/swap.h>
 120  #include <linux/swapops.h>
 121  #include <linux/mmu_notifier.h>
 122 +#include "internal.h"
 123
 124  #include <asm/tlb.h>
 125
 126 @@ -282,6 +283,7 @@ static int madvise_free_pte_range(pmd_t
 127                 return 0;
 128
 129         orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 130 +       flush_tlb_batched_pending(mm);
 131         arch_enter_lazy_mmu_mode();
 132         for (; addr != end; pte++, addr += PAGE_SIZE) {
 133                 ptent = *pte;
 134 --- a/mm/memory.c
 135 +++ b/mm/memory.c
 136 @@ -1124,6 +1124,7 @@ again:
 137         init_rss_vec(rss);
 138         start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 139         pte = start_pte;
 140 +       flush_tlb_batched_pending(mm);
 141         arch_enter_lazy_mmu_mode();
 142         do {
 143                 pte_t ptent = *pte;
 144 --- a/mm/mprotect.c
 145 +++ b/mm/mprotect.c
 146 @@ -74,6 +74,7 @@ static unsigned long change_pte_range(st
 147         if (!pte)
 148                 return 0;
 149
 150 +       flush_tlb_batched_pending(vma->vm_mm);
 151         arch_enter_lazy_mmu_mode();
 152         do {
 153                 oldpte = *pte;
 154 --- a/mm/mremap.c
 155 +++ b/mm/mremap.c
 156 @@ -142,6 +142,7 @@ static void move_ptes(struct vm_area_str
 157         new_ptl = pte_lockptr(mm, new_pmd);
 158         if (new_ptl != old_ptl)
 159                 spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
 160 +       flush_tlb_batched_pending(vma->vm_mm);
 161         arch_enter_lazy_mmu_mode();
 162
 163         for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
 164 --- a/mm/rmap.c
 165 +++ b/mm/rmap.c
 166 @@ -617,6 +617,13 @@ static void set_tlb_ubc_flush_pending(st
 167         tlb_ubc->flush_required = true;
 168
 169         /*
 170 +        * Ensure compiler does not re-order the setting of tlb_flush_batched
 171 +        * before the PTE is cleared.
 172 +        */
 173 +       barrier();
 174 +       mm->tlb_flush_batched = true;
 175 +
 176 +       /*
 177          * If the PTE was dirty then it's best to assume it's writable. The
 178          * caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
 179          * before the page is queued for IO.
 180 @@ -643,6 +650,35 @@ static bool should_defer_flush(struct mm
 181
 182         return should_defer;
 183  }
 184 +
 185 +/*
 186 + * Reclaim unmaps pages under the PTL but does not flush the TLB prior to
 187 + * releasing the PTL if TLB flushes are batched. It's possible for a parallel
 188 + * operation such as mprotect or munmap to race between reclaim unmapping
 189 + * the page and flushing the TLB. If this race occurs, it potentially allows
 190 + * access to data via a stale TLB entry. Tracking all mm's that have TLB
 191 + * batching in flight would be expensive during reclaim so instead track
 192 + * whether TLB batching occurred in the past and if so then do a flush here
 193 + * if required. This will cost one additional flush per reclaim cycle paid
 194 + * by the first operation at risk such as mprotect and munmap.
 195 + *
 196 + * This must be called under the PTL so that an access to tlb_flush_batched
 197 + * that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
 198 + * via the PTL.
 199 + */
 200 +void flush_tlb_batched_pending(struct mm_struct *mm)
 201 +{
 202 +       if (mm->tlb_flush_batched) {
 203 +               flush_tlb_mm(mm);
 204 +
 205 +               /*
 206 +                * Do not allow the compiler to re-order the clearing of
 207 +                * tlb_flush_batched before the tlb is flushed.
 208 +                */
 209 +               barrier();
 210 +               mm->tlb_flush_batched = false;
 211 +       }
 212 +}
 213  #else
 214  static void set_tlb_ubc_flush_pending(struct mm_struct *mm,
 215                 struct page *page, bool writable)