1 From 7a30df49f63ad92318ddf1f7498d1129a77dd4bd Mon Sep 17 00:00:00 2001
2 From: Yang Shi <yang.shi@linux.alibaba.com>
3 Date: Thu, 13 Jun 2019 15:56:05 -0700
4 Subject: mm: mmu_gather: remove __tlb_reset_range() for force flush
6 From: Yang Shi <yang.shi@linux.alibaba.com>
8 commit 7a30df49f63ad92318ddf1f7498d1129a77dd4bd upstream.
10 A few new fields were added to mmu_gather to make TLB flush smarter for
11 huge page by telling what level of page table is changed.
13 __tlb_reset_range() is used to reset all these page table state to
14 unchanged, which is called by TLB flush for parallel mapping changes for
15 the same range under non-exclusive lock (i.e. read mmap_sem).
17 Before commit dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in
18 munmap"), the syscalls (e.g. MADV_DONTNEED, MADV_FREE) which may update
19 PTEs in parallel don't remove page tables. But, the aforementioned
20 commit may do munmap() under read mmap_sem and free page tables. This
21 may result in program hang on aarch64 reported by Jan Stancek. The
22 problem could be reproduced by his test program with slightly modified
27 static int map_size = 4096;
28 static int num_iter = 500;
29 static long threads_total;
31 static void *distant_area;
33 void *map_write_unmap(void *ptr)
36 unsigned char *map_address;
39 for (i = 0; i < num_iter; i++) {
40 map_address = mmap(distant_area, (size_t) map_size, PROT_WRITE | PROT_READ,
41 MAP_SHARED | MAP_ANONYMOUS, -1, 0);
42 if (map_address == MAP_FAILED) {
47 for (j = 0; j < map_size; j++)
50 if (munmap(map_address, map_size) == -1) {
59 void *dummy(void *ptr)
68 /* hint for mmap in map_write_unmap() */
69 distant_area = mmap(0, DISTANT_MMAP_SIZE, PROT_WRITE | PROT_READ,
70 MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
71 munmap(distant_area, (size_t)DISTANT_MMAP_SIZE);
72 distant_area += DISTANT_MMAP_SIZE / 2;
75 pthread_create(&thid[0], NULL, map_write_unmap, NULL);
76 pthread_create(&thid[1], NULL, dummy, NULL);
78 pthread_join(thid[0], NULL);
79 pthread_join(thid[1], NULL);
84 The program may bring in parallel execution like below:
88 downgrade_write(&mm->mmap_sem);
91 inc_tlb_flush_pending(tlb->mm);
97 madvise(thread_stack, 8M, MADV_DONTNEED)
100 inc_tlb_flush_pending(tlb->mm);
103 if (mm_tlb_flush_nested(tlb->mm))
106 __tlb_reset_range() would reset freed_tables and cleared_* bits, but this
107 may cause inconsistency for munmap() which does free page tables. Then it
108 may result in some architectures, e.g. aarch64, not flushing the TLB
109 completely as expected, leaving stale TLB entries behind.
111 Use fullmm flush since it yields much better performance on aarch64 and
112 non-fullmm doesn't yield a significant difference on x86.
114 The original proposed fix came from Jan Stancek who mainly debugged this
115 issue, I just wrapped up everything together.
117 Jan's testing results:
119 v5.2-rc2-24-gbec7550cca10
120 --------------------------
126 v5.2-rc2-24-gbec7550cca10 + "mm: mmu_gather: remove __tlb_reset_range() for force flush"
127 ---------------------------------------------------------------------------------------
133 [akpm@linux-foundation.org: coding-style fixes]
134 Link: http://lkml.kernel.org/r/1558322252-113575-1-git-send-email-yang.shi@linux.alibaba.com
135 Fixes: dd2283f2605e ("mm: mmap: zap pages with read mmap_sem in munmap")
136 Signed-off-by: Yang Shi <yang.shi@linux.alibaba.com>
137 Signed-off-by: Jan Stancek <jstancek@redhat.com>
138 Reported-by: Jan Stancek <jstancek@redhat.com>
139 Tested-by: Jan Stancek <jstancek@redhat.com>
140 Suggested-by: Will Deacon <will.deacon@arm.com>
141 Tested-by: Will Deacon <will.deacon@arm.com>
142 Acked-by: Will Deacon <will.deacon@arm.com>
143 Cc: Peter Zijlstra <peterz@infradead.org>
144 Cc: Nick Piggin <npiggin@gmail.com>
145 Cc: "Aneesh Kumar K.V" <aneesh.kumar@linux.ibm.com>
146 Cc: Nadav Amit <namit@vmware.com>
147 Cc: Minchan Kim <minchan@kernel.org>
148 Cc: Mel Gorman <mgorman@suse.de>
149 Cc: <stable@vger.kernel.org> [4.20+]
150 Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
151 Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
152 Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
155 mm/mmu_gather.c | 24 +++++++++++++++++++-----
156 1 file changed, 19 insertions(+), 5 deletions(-)
158 --- a/mm/mmu_gather.c
159 +++ b/mm/mmu_gather.c
160 @@ -93,8 +93,17 @@ void arch_tlb_finish_mmu(struct mmu_gath
161 struct mmu_gather_batch *batch, *next;
165 + * The aarch64 yields better performance with fullmm by
166 + * avoiding multiple CPUs spamming TLBI messages at the
169 + * On x86 non-fullmm doesn't yield significant difference
173 __tlb_reset_range(tlb);
174 - __tlb_adjust_range(tlb, start, end - start);
175 + tlb->freed_tables = 1;
179 @@ -249,10 +258,15 @@ void tlb_finish_mmu(struct mmu_gather *t
182 * If there are parallel threads are doing PTE changes on same range
183 - * under non-exclusive lock(e.g., mmap_sem read-side) but defer TLB
184 - * flush by batching, a thread has stable TLB entry can fail to flush
185 - * the TLB by observing pte_none|!pte_dirty, for example so flush TLB
186 - * forcefully if we detect parallel PTE batching threads.
187 + * under non-exclusive lock (e.g., mmap_sem read-side) but defer TLB
188 + * flush by batching, one thread may end up seeing inconsistent PTEs
189 + * and result in having stale TLB entries. So flush TLB forcefully
190 + * if we detect parallel PTE batching threads.
192 + * However, some syscalls, e.g. munmap(), may free page tables, this
193 + * needs force flush everything in the given range. Otherwise this
194 + * may result in having stale TLB entries for some architectures,
195 + * e.g. aarch64, that could specify flush what level TLB.
197 bool force = mm_tlb_flush_nested(tlb->mm);