From 3484b2de9499df23c4604a513b36f96326ae81ad Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 6 Aug 2014 16:07:14 -0700
Subject: mm: rearrange zone fields into read-only, page alloc, statistics and page reclaim lines

From: Mel Gorman <mgorman@suse.de>

commit 3484b2de9499df23c4604a513b36f96326ae81ad upstream.
The arrangement of struct zone has changed over time and now it has
reached the point where there is some inappropriate sharing going on.

o The zone->node field is shared with the zone lock and zone->node is
  accessed frequently from the page allocator due to the fair zone
  allocation policy

o span_seqlock is almost never used but shares a line with free_area

o Some zone statistics share a cache line with the LRU lock so
  reclaim-intensive and allocator-intensive workloads can bounce the
  cache line on zone statistics updates (see the user-space sketch below)
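The bouncing described in the last point is easy to reproduce outside the
kernel. The following is a minimal user-space sketch, not kernel code: two
threads each hammer their own counter, first with both counters sharing one
(assumed 64-byte) cache line, then with padding pushing them onto separate
lines, which is the same separation this patch applies to the allocator-hot,
reclaim-hot and statistics parts of struct zone. The struct names, iteration
count and line size are made up for illustration.

/*
 * Illustrative only -- not part of this patch.  Two threads increment
 * independent counters.  In struct shared_line the counters sit on one
 * cache line and keep stealing it from each other; in struct padded_lines
 * each counter gets its own 64-byte line.
 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define LINE	64
#define ITERS	(100UL * 1000 * 1000)

struct shared_line {
	volatile unsigned long a;
	volatile unsigned long b;		/* same cache line as a */
};

struct padded_lines {
	volatile unsigned long a;
	char pad[LINE - sizeof(unsigned long)];	/* push b onto the next line */
	volatile unsigned long b;
} __attribute__((aligned(LINE)));

static void *bump(void *p)
{
	volatile unsigned long *ctr = p;
	unsigned long i;

	for (i = 0; i < ITERS; i++)
		(*ctr)++;
	return NULL;
}

static double run(volatile unsigned long *a, volatile unsigned long *b)
{
	pthread_t t1, t2;
	struct timespec s, e;

	clock_gettime(CLOCK_MONOTONIC, &s);
	pthread_create(&t1, NULL, bump, (void *)a);
	pthread_create(&t2, NULL, bump, (void *)b);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	clock_gettime(CLOCK_MONOTONIC, &e);
	return (e.tv_sec - s.tv_sec) + (e.tv_nsec - s.tv_nsec) / 1e9;
}

int main(void)
{
	static struct shared_line shared;
	static struct padded_lines padded;

	printf("same cache line:      %.2fs\n", run(&shared.a, &shared.b));
	printf("separate cache lines: %.2fs\n", run(&padded.a, &padded.b));
	return 0;
}

Built with something like "gcc -O2 -pthread falseshare.c", the padded variant
is typically several times faster on a multi-core machine; the rearrangement
below aims for the same effect on the fields that reclaim and the allocator
write to concurrently.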
This patch rearranges struct zone to put read-only and read-mostly
fields together and then splits the page allocator intensive fields, the
zone statistics and the page reclaim intensive fields into their own
cache lines. Note that the type of lowmem_reserve changes due to the
watermark calculations being signed and avoiding a signed/unsigned
conversion.
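The type change matters because __zone_watermark_ok() deliberately works with
a free_pages value that may have gone negative due to per-cpu counter drift,
and after this patch it compares against z->lowmem_reserve[] directly rather
than through the local signed copy it used to make (see the mm/page_alloc.c
hunk below). The following stand-alone sketch, with made-up values and helper
names, shows the signed/unsigned conversion that a signed lowmem_reserve
avoids:

/*
 * Illustrative only -- not the kernel function.  It mimics the shape of
 * the __zone_watermark_ok() test to show why lowmem_reserve has to be
 * signed once it is used directly in the comparison.
 */
#include <stdio.h>
#include <stdbool.h>

static bool watermark_ok_unsigned(long free_pages, long min, unsigned long reserve)
{
	/*
	 * min + reserve is unsigned long, so a negative free_pages is
	 * converted to a huge unsigned value and the check wrongly passes.
	 */
	return !(free_pages <= min + reserve);
}

static bool watermark_ok_signed(long free_pages, long min, long reserve)
{
	/* All-signed arithmetic: a negative free_pages correctly fails. */
	return !(free_pages <= min + reserve);
}

int main(void)
{
	long free_pages = -64;	/* drifted below zero, allowed by design */
	long min = 1024;

	printf("unsigned reserve: %s\n",
	       watermark_ok_unsigned(free_pages, min, 256) ? "ok (wrong)" : "below watermark");
	printf("signed reserve:   %s\n",
	       watermark_ok_signed(free_pages, min, 256) ? "ok (wrong)" : "below watermark");
	return 0;
}

The old code sidestepped the conversion by first copying the array element
into the local "long lowmem_reserve" that this patch removes; once the array
element is used directly in the comparison, the array itself has to be signed.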
On the test configuration I used the overall size of struct zone shrunk
by one cache line. On smaller machines, this is not likely to be
noticeable. However, on a 4-node NUMA machine running tiobench the
system CPU overhead is reduced by this patch.

                        vanilla      patched
System                 65336.22     58350.98
Elapsed                27553.52     27282.02
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mmzone.h | 205 +++++++++++++++++++++++++------------------------
 3 files changed, 110 insertions(+), 106 deletions(-)
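As orientation for the large include/linux/mmzone.h hunk below, here is a
condensed user-space mockup of the section order the patch ends up with
(read-mostly, allocator write-intensive, reclaim write-intensive, statistics),
with each group padded onto its own cache line. The 64-byte line size, the
PAD() macro and the heavily abbreviated field list are illustrative stand-ins
for the real ZONE_PADDING()/____cacheline_internodealigned_in_smp machinery,
not the actual struct zone:

/*
 * Illustrative mockup only.  The zero-length aligned array is the same
 * GNU C trick the kernel's struct zone_padding uses: it occupies no
 * space but forces the next field onto a fresh cache line.
 */
#include <stdio.h>
#include <stddef.h>

#define CACHELINE 64
#define PAD(name) char name[0] __attribute__((aligned(CACHELINE)));

struct zone_mock {
	/* Read-mostly fields */
	unsigned long watermark[3];
	long lowmem_reserve[4];
	unsigned int inactive_ratio;

	PAD(_pad1_)
	/* Write-intensive fields used from the page allocator */
	int lock;			/* stands in for spinlock_t */
	unsigned long free_area[11];	/* stands in for struct free_area[MAX_ORDER] */
	unsigned long flags;

	PAD(_pad2_)
	/* Write-intensive fields used by page reclaim */
	int lru_lock;
	unsigned long pages_scanned;

	PAD(_pad3_)
	/* Zone statistics */
	long vm_stat[40];		/* stands in for atomic_long_t[] */
};

int main(void)
{
	printf("allocator group offset:   %zu\n", offsetof(struct zone_mock, lock));
	printf("reclaim group offset:     %zu\n", offsetof(struct zone_mock, lru_lock));
	printf("vm_stat offset:           %zu\n", offsetof(struct zone_mock, vm_stat));
	printf("sizeof(struct zone_mock): %zu\n", sizeof(struct zone_mock));
	return 0;
}

All three offsets land on 64-byte boundaries, so writes from the allocator,
writes from reclaim and statistics updates dirty different cache lines while
the read-mostly data at the front of the structure stays clean.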
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -321,19 +321,12 @@ enum zone_type {
#ifndef __GENERATING_BOUNDS_H
- /* Fields commonly accessed by the page allocator */
+ /* Read-mostly fields */
/* zone watermarks, access with *_wmark_pages(zone) macros */
unsigned long watermark[NR_WMARK];
- * When free pages are below this point, additional steps are taken
- * when reading the number of free pages to avoid per-cpu counter
- * drift allowing watermarks to be breached
- unsigned long percpu_drift_mark;
* We don't know if the memory that we're going to allocate will be freeable
* or/and it will be released eventually, so to avoid totally wasting several
* GB of ram we must reserve some of the lower zone memory (otherwise we risk
@@ -341,41 +334,26 @@ struct zone {
* on the higher zones). This array is recalculated at runtime if the
* sysctl_lowmem_reserve_ratio sysctl changes.
- unsigned long lowmem_reserve[MAX_NR_ZONES];
- * This is a per-zone reserve of pages that should not be
- * considered dirtyable memory.
- unsigned long dirty_balance_reserve;
+ long lowmem_reserve[MAX_NR_ZONES];
- * zone reclaim becomes active if more unmapped pages exist.
+ * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+ * this zone's LRU. Maintained by the pageout code.
- unsigned long min_unmapped_pages;
- unsigned long min_slab_pages;
+ unsigned int inactive_ratio;
+ struct pglist_data *zone_pgdat;
struct per_cpu_pageset __percpu *pageset;
- * free areas of different sizes
+ * This is a per-zone reserve of pages that should not be
+ * considered dirtyable memory.
-#if defined CONFIG_COMPACTION || defined CONFIG_CMA
- /* Set to true when the PG_migrate_skip bits should be cleared */
- bool compact_blockskip_flush;
- /* pfn where compaction free scanner should start */
- unsigned long compact_cached_free_pfn;
- /* pfn where async and sync compaction migration scanner should start */
- unsigned long compact_cached_migrate_pfn[2];
-#ifdef CONFIG_MEMORY_HOTPLUG
- /* see spanned/present_pages for more description */
- seqlock_t span_seqlock;
- struct free_area free_area[MAX_ORDER];
+ unsigned long dirty_balance_reserve;
#ifndef CONFIG_SPARSEMEM
@@ -385,71 +363,14 @@ struct zone {
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
-#ifdef CONFIG_COMPACTION
- * On compaction failure, 1<<compact_defer_shift compactions
- * are skipped before trying again. The number attempted since
- * last failure is tracked with compact_considered.
- unsigned int compact_considered;
- unsigned int compact_defer_shift;
- int compact_order_failed;
- ZONE_PADDING(_pad1_)
- /* Fields commonly accessed by the page reclaim scanner */
- spinlock_t lru_lock;
- struct lruvec lruvec;
- unsigned long pages_scanned; /* since last reclaim */
- unsigned long flags; /* zone flags, see below */
- /* Zone statistics */
- atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
- * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
- * this zone's LRU. Maintained by the pageout code.
- unsigned int inactive_ratio;
- ZONE_PADDING(_pad2_)
- /* Rarely used or read-mostly fields */
- * wait_table -- the array holding the hash table
- * wait_table_hash_nr_entries -- the size of the hash table array
- * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
- * The purpose of all these is to keep track of the people
- * waiting for a page to become available and make them
- * runnable again when possible. The trouble is that this
- * consumes a lot of space, especially when so few things
- * wait on pages at a given time. So instead of using
- * per-page waitqueues, we use a waitqueue hash table.
- * The bucket discipline is to sleep on the same queue when
- * colliding and wake all in that wait queue when removing.
- * When something wakes, it must check to be sure its page is
- * truly available, a la thundering herd. The cost of a
- * collision is great, but given the expected load of the
- * table, they should be so rare as to be outweighed by the
- * benefits from the saved space.
- * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
- * primary users of these fields, and in mm/page_alloc.c
- * free_area_init_core() performs the initialization of them.
+ * zone reclaim becomes active if more unmapped pages exist.
- wait_queue_head_t * wait_table;
- unsigned long wait_table_hash_nr_entries;
- unsigned long wait_table_bits;
+ unsigned long min_unmapped_pages;
+ unsigned long min_slab_pages;
+#endif /* CONFIG_NUMA */
- * Discontig memory support fields.
- struct pglist_data *zone_pgdat;
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
@@ -495,9 +416,11 @@ struct zone {
* adjust_managed_page_count() should be used instead of directly
* touching zone->managed_pages and totalram_pages.
+ unsigned long managed_pages;
unsigned long spanned_pages;
unsigned long present_pages;
- unsigned long managed_pages;
* Number of MIGRATE_RESEVE page block. To maintain for just
@@ -505,10 +428,92 @@ struct zone {
int nr_migrate_reserve_block;
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /* see spanned/present_pages for more description */
+ seqlock_t span_seqlock;
- * rarely used fields:
+ * wait_table -- the array holding the hash table
+ * wait_table_hash_nr_entries -- the size of the hash table array
+ * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
+ * The purpose of all these is to keep track of the people
+ * waiting for a page to become available and make them
+ * runnable again when possible. The trouble is that this
+ * consumes a lot of space, especially when so few things
+ * wait on pages at a given time. So instead of using
+ * per-page waitqueues, we use a waitqueue hash table.
+ * The bucket discipline is to sleep on the same queue when
+ * colliding and wake all in that wait queue when removing.
+ * When something wakes, it must check to be sure its page is
+ * truly available, a la thundering herd. The cost of a
+ * collision is great, but given the expected load of the
+ * table, they should be so rare as to be outweighed by the
+ * benefits from the saved space.
+ * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
+ * primary users of these fields, and in mm/page_alloc.c
+ * free_area_init_core() performs the initialization of them.
+ wait_queue_head_t *wait_table;
+ unsigned long wait_table_hash_nr_entries;
+ unsigned long wait_table_bits;
+ ZONE_PADDING(_pad1_)
+ /* Write-intensive fields used from the page allocator */
+ /* free areas of different sizes */
+ struct free_area free_area[MAX_ORDER];
+ /* zone flags, see below */
+ unsigned long flags;
+ ZONE_PADDING(_pad2_)
+ /* Write-intensive fields used by page reclaim */
+ /* Fields commonly accessed by the page reclaim scanner */
+ spinlock_t lru_lock;
+ unsigned long pages_scanned; /* since last reclaim */
+ struct lruvec lruvec;
+ * When free pages are below this point, additional steps are taken
+ * when reading the number of free pages to avoid per-cpu counter
+ * drift allowing watermarks to be breached
+ unsigned long percpu_drift_mark;
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+ /* pfn where compaction free scanner should start */
+ unsigned long compact_cached_free_pfn;
+ /* pfn where async and sync compaction migration scanner should start */
+ unsigned long compact_cached_migrate_pfn[2];
+#ifdef CONFIG_COMPACTION
+ * On compaction failure, 1<<compact_defer_shift compactions
+ * are skipped before trying again. The number attempted since
+ * last failure is tracked with compact_considered.
+ unsigned int compact_considered;
+ unsigned int compact_defer_shift;
+ int compact_order_failed;
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+ /* Set to true when the PG_migrate_skip bits should be cleared */
+ bool compact_blockskip_flush;
+ ZONE_PADDING(_pad3_)
+ /* Zone statistics */
+ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1710,7 +1710,6 @@ static bool __zone_watermark_ok(struct z
/* free_pages my go negative - that's OK */
- long lowmem_reserve = z->lowmem_reserve[classzone_idx];
@@ -1725,7 +1724,7 @@ static bool __zone_watermark_ok(struct z
free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
- if (free_pages - free_cma <= min + lowmem_reserve)
+ if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
@@ -3257,7 +3256,7 @@ void show_free_areas(unsigned int filter
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
- printk(" %lu", zone->lowmem_reserve[i]);
+ printk(" %ld", zone->lowmem_reserve[i]);
@@ -5585,7 +5584,7 @@ static void calculate_totalreserve_pages
for_each_online_pgdat(pgdat) {
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
- unsigned long max = 0;
/* Find valid and maximum lowmem_reserve in the zone */
for (j = i; j < MAX_NR_ZONES; j++) {
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1065,10 +1065,10 @@ static void zoneinfo_show_print(struct s
zone_page_state(zone, i));
- "\n protection: (%lu",
+ "\n protection: (%ld",
zone->lowmem_reserve[0]);
for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
- seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+ seq_printf(m, ", %ld", zone->lowmem_reserve[i]);