mm/mmu_gather.c

   1 #include <linux/gfp.h>
   2 #include <linux/highmem.h>
   3 #include <linux/kernel.h>
   4 #include <linux/mmdebug.h>
   5 #include <linux/mm_types.h>
   6 #include <linux/mm_inline.h>
   7 #include <linux/pagemap.h>
   8 #include <linux/rcupdate.h>
   9 #include <linux/smp.h>
  10 #include <linux/swap.h>
  11
  12 #include <asm/pgalloc.h>
  13 #include <asm/tlb.h>
  14
  15 #ifndef CONFIG_MMU_GATHER_NO_GATHER
  16
  17 static bool tlb_next_batch(struct mmu_gather *tlb)
  18 {
  19         struct mmu_gather_batch *batch;
  20
  21         batch = tlb->active;
  22         if (batch->next) {
  23                 tlb->active = batch->next;
  24                 return true;
  25         }
  26
  27         if (tlb->batch_count == MAX_GATHER_BATCH_COUNT)
  28                 return false;
  29
  30         batch = (void *)__get_free_pages(GFP_NOWAIT | __GFP_NOWARN, 0);
  31         if (!batch)
  32                 return false;
  33
  34         tlb->batch_count++;
  35         batch->next = NULL;
  36         batch->nr   = 0;
  37         batch->max  = MAX_GATHER_BATCH;
  38
  39         tlb->active->next = batch;
  40         tlb->active = batch;
  41
  42         return true;
  43 }
  44
  45 static void tlb_batch_pages_flush(struct mmu_gather *tlb)
  46 {
  47         struct mmu_gather_batch *batch;
  48
  49         for (batch = &tlb->local; batch && batch->nr; batch = batch->next) {
  50                 struct page **pages = batch->pages;
  51
  52                 do {
  53                         /*
  54                          * limit free batch count when PAGE_SIZE > 4K
  55                          */
  56                         unsigned int nr = min(512U, batch->nr);
  57
  58                         free_pages_and_swap_cache(pages, nr);
  59                         pages += nr;
  60                         batch->nr -= nr;
  61
  62                         cond_resched();
  63                 } while (batch->nr);
  64         }
  65         tlb->active = &tlb->local;
  66 }
  67
  68 static void tlb_batch_list_free(struct mmu_gather *tlb)
  69 {
  70         struct mmu_gather_batch *batch, *next;
  71
  72         for (batch = tlb->local.next; batch; batch = next) {
  73                 next = batch->next;
  74                 free_pages((unsigned long)batch, 0);
  75         }
  76         tlb->local.next = NULL;
  77 }
  78
  79 bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_size)
  80 {
  81         struct mmu_gather_batch *batch;
  82
  83         VM_BUG_ON(!tlb->end);
  84
  85 #ifdef CONFIG_MMU_GATHER_PAGE_SIZE
  86         VM_WARN_ON(tlb->page_size != page_size);
  87 #endif
  88
  89         batch = tlb->active;
  90         /*
  91          * Add the page and check if we are full. If so
  92          * force a flush.
  93          */
  94         batch->pages[batch->nr++] = page;
  95         if (batch->nr == batch->max) {
  96                 if (!tlb_next_batch(tlb))
  97                         return true;
  98                 batch = tlb->active;
  99         }
 100         VM_BUG_ON_PAGE(batch->nr > batch->max, page);
 101
 102         return false;
 103 }
 104
 105 #endif /* MMU_GATHER_NO_GATHER */
 106
 107 #ifdef CONFIG_MMU_GATHER_TABLE_FREE
 108
 109 static void __tlb_remove_table_free(struct mmu_table_batch *batch)
 110 {
 111         int i;
 112
 113         for (i = 0; i < batch->nr; i++)
 114                 __tlb_remove_table(batch->tables[i]);
 115
 116         free_page((unsigned long)batch);
 117 }
 118
 119 #ifdef CONFIG_MMU_GATHER_RCU_TABLE_FREE
 120
 121 /*
 122  * Semi RCU freeing of the page directories.
 123  *
 124  * This is needed by some architectures to implement software pagetable walkers.
 125  *
 * gup_fast() and other software pagetable walkers do a lockless page-table
 * walk and therefore need some synchronization with the freeing of the page
 * directories. The chosen means to accomplish that is by disabling IRQs over
 * the walk.
 130  *
 131  * Architectures that use IPIs to flush TLBs will then automagically DTRT,
 132  * since we unlink the page, flush TLBs, free the page. Since the disabling of
 133  * IRQs delays the completion of the TLB flush we can never observe an already
 134  * freed page.
 135  *
 136  * Architectures that do not have this (PPC) need to delay the freeing by some
 137  * other means, this is that means.
 138  *
 139  * What we do is batch the freed directory pages (tables) and RCU free them.
 140  * We use the sched RCU variant, as that guarantees that IRQ/preempt disabling
 141  * holds off grace periods.
 142  *
 143  * However, in order to batch these pages we need to allocate storage, this
 144  * allocation is deep inside the MM code and can thus easily fail on memory
 145  * pressure. To guarantee progress we fall back to single table freeing, see
 146  * the implementation of tlb_remove_table_one().
 147  *
 148  */
 149
 150 static void tlb_remove_table_smp_sync(void *arg)
 151 {
 152         /* Simply deliver the interrupt */
 153 }
 154
 155 static void tlb_remove_table_sync_one(void)
 156 {
 157         /*
 158          * This isn't an RCU grace period and hence the page-tables cannot be
 159          * assumed to be actually RCU-freed.
 160          *
 161          * It is however sufficient for software page-table walkers that rely on
 162          * IRQ disabling.
 163          */
 164         smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
 165 }
 166
 167 static void tlb_remove_table_rcu(struct rcu_head *head)
 168 {
 169         __tlb_remove_table_free(container_of(head, struct mmu_table_batch, rcu));
 170 }
 171
 172 static void tlb_remove_table_free(struct mmu_table_batch *batch)
 173 {
 174         call_rcu(&batch->rcu, tlb_remove_table_rcu);
 175 }
 176
 177 #else /* !CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 178
 179 static void tlb_remove_table_sync_one(void) { }
 180
 181 static void tlb_remove_table_free(struct mmu_table_batch *batch)
 182 {
 183         __tlb_remove_table_free(batch);
 184 }
 185
 186 #endif /* CONFIG_MMU_GATHER_RCU_TABLE_FREE */
 187
 188 /*
 189  * If we want tlb_remove_table() to imply TLB invalidates.
 190  */
 191 static inline void tlb_table_invalidate(struct mmu_gather *tlb)
 192 {
 193         if (tlb_needs_table_invalidate()) {
 194                 /*
 195                  * Invalidate page-table caches used by hardware walkers. Then
 196                  * we still need to RCU-sched wait while freeing the pages
 197                  * because software walkers can still be in-flight.
 198                  */
 199                 tlb_flush_mmu_tlbonly(tlb);
 200         }
 201 }
 202
 203 static void tlb_remove_table_one(void *table)
 204 {
 205         tlb_remove_table_sync_one();
 206         __tlb_remove_table(table);
 207 }
 208
 209 static void tlb_table_flush(struct mmu_gather *tlb)
 210 {
 211         struct mmu_table_batch **batch = &tlb->batch;
 212
 213         if (*batch) {
 214                 tlb_table_invalidate(tlb);
 215                 tlb_remove_table_free(*batch);
 216                 *batch = NULL;
 217         }
 218 }
 219
 220 void tlb_remove_table(struct mmu_gather *tlb, void *table)
 221 {
 222         struct mmu_table_batch **batch = &tlb->batch;
 223
 224         if (*batch == NULL) {
 225                 *batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
 226                 if (*batch == NULL) {
 227                         tlb_table_invalidate(tlb);
 228                         tlb_remove_table_one(table);
 229                         return;
 230                 }
 231                 (*batch)->nr = 0;
 232         }
 233
 234         (*batch)->tables[(*batch)->nr++] = table;
 235         if ((*batch)->nr == MAX_TABLE_BATCH)
 236                 tlb_table_flush(tlb);
 237 }
 238
 239 static inline void tlb_table_init(struct mmu_gather *tlb)
 240 {
 241         tlb->batch = NULL;
 242 }
 243
 244 #else /* !CONFIG_MMU_GATHER_TABLE_FREE */
 245
 246 static inline void tlb_table_flush(struct mmu_gather *tlb) { }
 247 static inline void tlb_table_init(struct mmu_gather *tlb) { }
 248
 249 #endif /* CONFIG_MMU_GATHER_TABLE_FREE */
 250
 251 static void tlb_flush_mmu_free(struct mmu_gather *tlb)
 252 {
 253         tlb_table_flush(tlb);
 254 #ifndef CONFIG_MMU_GATHER_NO_GATHER
 255         tlb_batch_pages_flush(tlb);
 256 #endif
 257 }
 258
 259 void tlb_flush_mmu(struct mmu_gather *tlb)
 260 {
 261         tlb_flush_mmu_tlbonly(tlb);
 262         tlb_flush_mmu_free(tlb);
 263 }
 264
 265 static void __tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm,
 266                              bool fullmm)
 267 {
 268         tlb->mm = mm;
 269         tlb->fullmm = fullmm;
 270
 271 #ifndef CONFIG_MMU_GATHER_NO_GATHER
 272         tlb->need_flush_all = 0;
 273         tlb->local.next = NULL;
 274         tlb->local.nr   = 0;
 275         tlb->local.max  = ARRAY_SIZE(tlb->__pages);
 276         tlb->active     = &tlb->local;
 277         tlb->batch_count = 0;
 278 #endif
 279
 280         tlb_table_init(tlb);
 281 #ifdef CONFIG_MMU_GATHER_PAGE_SIZE
 282         tlb->page_size = 0;
 283 #endif
 284
 285         __tlb_reset_range(tlb);
 286         inc_tlb_flush_pending(tlb->mm);
 287 }
 288
 289 /**
 290  * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down
 291  * @tlb: the mmu_gather structure to initialize
 292  * @mm: the mm_struct of the target address space
 293  *
 294  * Called to initialize an (on-stack) mmu_gather structure for page-table
 295  * tear-down from @mm.
 296  */
 297 void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm)
 298 {
 299         __tlb_gather_mmu(tlb, mm, false);
 300 }
 301
 302 /**
 303  * tlb_gather_mmu_fullmm - initialize an mmu_gather structure for page-table tear-down
 304  * @tlb: the mmu_gather structure to initialize
 305  * @mm: the mm_struct of the target address space
 306  *
 307  * In this case, @mm is without users and we're going to destroy the
 308  * full address space (exit/execve).
 309  *
 310  * Called to initialize an (on-stack) mmu_gather structure for page-table
 311  * tear-down from @mm.
 312  */
 313 void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm)
 314 {
 315         __tlb_gather_mmu(tlb, mm, true);
 316 }
 317
 318 /**
 319  * tlb_finish_mmu - finish an mmu_gather structure
 320  * @tlb: the mmu_gather structure to finish
 321  *
 322  * Called at the end of the shootdown operation to free up any resources that
 323  * were required.
 324  */
 325 void tlb_finish_mmu(struct mmu_gather *tlb)
 326 {
 327         /*
 328          * If there are parallel threads are doing PTE changes on same range
 329          * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
 330          * flush by batching, one thread may end up seeing inconsistent PTEs
 331          * and result in having stale TLB entries.  So flush TLB forcefully
 332          * if we detect parallel PTE batching threads.
 333          *
 334          * However, some syscalls, e.g. munmap(), may free page tables, this
 335          * needs force flush everything in the given range. Otherwise this
 336          * may result in having stale TLB entries for some architectures,
 337          * e.g. aarch64, that could specify flush what level TLB.
 338          */
 339         if (mm_tlb_flush_nested(tlb->mm)) {
 340                 /*
 341                  * The aarch64 yields better performance with fullmm by
 342                  * avoiding multiple CPUs spamming TLBI messages at the
 343                  * same time.
 344                  *
 345                  * On x86 non-fullmm doesn't yield significant difference
 346                  * against fullmm.
 347                  */
 348                 tlb->fullmm = 1;
 349                 __tlb_reset_range(tlb);
 350                 tlb->freed_tables = 1;
 351         }
 352
 353         tlb_flush_mmu(tlb);
 354
 355 #ifndef CONFIG_MMU_GATHER_NO_GATHER
 356         tlb_batch_list_free(tlb);
 357 #endif
 358         dec_tlb_flush_pending(tlb->mm);
 359 }