From 3484b2de9499df23c4604a513b36f96326ae81ad Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 6 Aug 2014 16:07:14 -0700
Subject: mm: rearrange zone fields into read-only, page alloc, statistics and page reclaim lines

From: Mel Gorman <mgorman@suse.de>

commit 3484b2de9499df23c4604a513b36f96326ae81ad upstream.
The arrangement of struct zone has changed over time and now it has
reached the point where there is some inappropriate sharing going on.

o The zone->node field is shared with the zone lock and zone->node is
  accessed frequently from the page allocator due to the fair zone
  allocation policy

o span_seqlock is almost never used but shares a line with free_area

o Some zone statistics share a cache line with the LRU lock so
  reclaim-intensive and allocator-intensive workloads can bounce the
  cache line on zone statistics updates (see the user-space sketch below)
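The bouncing described in the last point is easy to reproduce outside the
kernel. The following is a minimal user-space sketch, not kernel code: two
threads each hammer their own counter, first with both counters sharing one
(assumed 64-byte) cache line, then with padding pushing them onto separate
lines, which is the same separation this patch applies to the allocator-hot,
reclaim-hot and statistics parts of struct zone. The struct names, iteration
count and line size are made up for illustration.

/*
 * Illustrative only -- not part of this patch.  Two threads increment
 * independent counters.  In struct shared_line the counters sit on one
 * cache line and keep stealing it from each other; in struct padded_lines
 * each counter gets its own 64-byte line.
 */
#include <pthread.h>
#include <stdio.h>
#include <time.h>

#define LINE	64
#define ITERS	(100UL * 1000 * 1000)

struct shared_line {
	volatile unsigned long a;
	volatile unsigned long b;		/* same cache line as a */
};

struct padded_lines {
	volatile unsigned long a;
	char pad[LINE - sizeof(unsigned long)];	/* push b onto the next line */
	volatile unsigned long b;
} __attribute__((aligned(LINE)));

static void *bump(void *p)
{
	volatile unsigned long *ctr = p;
	unsigned long i;

	for (i = 0; i < ITERS; i++)
		(*ctr)++;
	return NULL;
}

static double run(volatile unsigned long *a, volatile unsigned long *b)
{
	pthread_t t1, t2;
	struct timespec s, e;

	clock_gettime(CLOCK_MONOTONIC, &s);
	pthread_create(&t1, NULL, bump, (void *)a);
	pthread_create(&t2, NULL, bump, (void *)b);
	pthread_join(t1, NULL);
	pthread_join(t2, NULL);
	clock_gettime(CLOCK_MONOTONIC, &e);
	return (e.tv_sec - s.tv_sec) + (e.tv_nsec - s.tv_nsec) / 1e9;
}

int main(void)
{
	static struct shared_line shared;
	static struct padded_lines padded;

	printf("same cache line:      %.2fs\n", run(&shared.a, &shared.b));
	printf("separate cache lines: %.2fs\n", run(&padded.a, &padded.b));
	return 0;
}

Built with something like "gcc -O2 -pthread falseshare.c", the padded variant
is typically several times faster on a multi-core machine; the rearrangement
below aims for the same effect on the fields that reclaim and the allocator
write to concurrently.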
This patch rearranges struct zone to put read-only and read-mostly
fields together and then splits the page allocator intensive fields, the
zone statistics and the page reclaim intensive fields into their own
cache lines. Note that the type of lowmem_reserve changes due to the
watermark calculations being signed and avoiding a signed/unsigned
conversion.
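The type change matters because __zone_watermark_ok() deliberately works with
a free_pages value that may have gone negative due to per-cpu counter drift,
and after this patch it compares against z->lowmem_reserve[] directly rather
than through the local signed copy it used to make (see the mm/page_alloc.c
hunk below). The following stand-alone sketch, with made-up values and helper
names, shows the signed/unsigned conversion that a signed lowmem_reserve
avoids:

/*
 * Illustrative only -- not the kernel function.  It mimics the shape of
 * the __zone_watermark_ok() test to show why lowmem_reserve has to be
 * signed once it is used directly in the comparison.
 */
#include <stdio.h>
#include <stdbool.h>

static bool watermark_ok_unsigned(long free_pages, long min, unsigned long reserve)
{
	/*
	 * min + reserve is unsigned long, so a negative free_pages is
	 * converted to a huge unsigned value and the check wrongly passes.
	 */
	return !(free_pages <= min + reserve);
}

static bool watermark_ok_signed(long free_pages, long min, long reserve)
{
	/* All-signed arithmetic: a negative free_pages correctly fails. */
	return !(free_pages <= min + reserve);
}

int main(void)
{
	long free_pages = -64;	/* drifted below zero, allowed by design */
	long min = 1024;

	printf("unsigned reserve: %s\n",
	       watermark_ok_unsigned(free_pages, min, 256) ? "ok (wrong)" : "below watermark");
	printf("signed reserve:   %s\n",
	       watermark_ok_signed(free_pages, min, 256) ? "ok (wrong)" : "below watermark");
	return 0;
}

The old code sidestepped the conversion by first copying the array element
into the local "long lowmem_reserve" that this patch removes; once the array
element is used directly in the comparison, the array itself has to be signed.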
On the test configuration I used the overall size of struct zone shrunk
by one cache line. On smaller machines, this is not likely to be
noticeable. However, on a 4-node NUMA machine running tiobench the
system CPU overhead is reduced by this patch.

                        vanilla      patched
System                 65336.22     58350.98
Elapsed                27553.52     27282.02
Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mmzone.h | 205 +++++++++++++++++++++++++------------------------
 3 files changed, 110 insertions(+), 106 deletions(-)
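As orientation for the large include/linux/mmzone.h hunk below, here is a
condensed user-space mockup of the section order the patch ends up with
(read-mostly, allocator write-intensive, reclaim write-intensive, statistics),
with each group padded onto its own cache line. The 64-byte line size, the
PAD() macro and the heavily abbreviated field list are illustrative stand-ins
for the real ZONE_PADDING()/____cacheline_internodealigned_in_smp machinery,
not the actual struct zone:

/*
 * Illustrative mockup only.  The zero-length aligned array is the same
 * GNU C trick the kernel's struct zone_padding uses: it occupies no
 * space but forces the next field onto a fresh cache line.
 */
#include <stdio.h>
#include <stddef.h>

#define CACHELINE 64
#define PAD(name) char name[0] __attribute__((aligned(CACHELINE)));

struct zone_mock {
	/* Read-mostly fields */
	unsigned long watermark[3];
	long lowmem_reserve[4];
	unsigned int inactive_ratio;

	PAD(_pad1_)
	/* Write-intensive fields used from the page allocator */
	int lock;			/* stands in for spinlock_t */
	unsigned long free_area[11];	/* stands in for struct free_area[MAX_ORDER] */
	unsigned long flags;

	PAD(_pad2_)
	/* Write-intensive fields used by page reclaim */
	int lru_lock;
	unsigned long pages_scanned;

	PAD(_pad3_)
	/* Zone statistics */
	long vm_stat[40];		/* stands in for atomic_long_t[] */
};

int main(void)
{
	printf("allocator group offset:   %zu\n", offsetof(struct zone_mock, lock));
	printf("reclaim group offset:     %zu\n", offsetof(struct zone_mock, lru_lock));
	printf("vm_stat offset:           %zu\n", offsetof(struct zone_mock, vm_stat));
	printf("sizeof(struct zone_mock): %zu\n", sizeof(struct zone_mock));
	return 0;
}

All three offsets land on 64-byte boundaries, so writes from the allocator,
writes from reclaim and statistics updates dirty different cache lines while
the read-mostly data at the front of the structure stays clean.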
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -321,19 +321,12 @@ enum zone_type {
#ifndef __GENERATING_BOUNDS_H
- /* Fields commonly accessed by the page allocator */
+ /* Read-mostly fields */
/* zone watermarks, access with *_wmark_pages(zone) macros */
unsigned long watermark[NR_WMARK];
- * When free pages are below this point, additional steps are taken
- * when reading the number of free pages to avoid per-cpu counter
- * drift allowing watermarks to be breached
- unsigned long percpu_drift_mark;
* We don't know if the memory that we're going to allocate will be freeable
* or/and it will be released eventually, so to avoid totally wasting several
* GB of ram we must reserve some of the lower zone memory (otherwise we risk
@@ -341,41 +334,26 @@ struct zone {
* on the higher zones). This array is recalculated at runtime if the
* sysctl_lowmem_reserve_ratio sysctl changes.
- unsigned long lowmem_reserve[MAX_NR_ZONES];
- * This is a per-zone reserve of pages that should not be
- * considered dirtyable memory.
- unsigned long dirty_balance_reserve;
+ long lowmem_reserve[MAX_NR_ZONES];
- * zone reclaim becomes active if more unmapped pages exist.
+ * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+ * this zone's LRU. Maintained by the pageout code.
- unsigned long min_unmapped_pages;
- unsigned long min_slab_pages;
+ unsigned int inactive_ratio;
+ struct pglist_data *zone_pgdat;
struct per_cpu_pageset __percpu *pageset;
- * free areas of different sizes
+ * This is a per-zone reserve of pages that should not be
+ * considered dirtyable memory.
-#if defined CONFIG_COMPACTION || defined CONFIG_CMA
- /* Set to true when the PG_migrate_skip bits should be cleared */
- bool compact_blockskip_flush;
- /* pfn where compaction free scanner should start */
- unsigned long compact_cached_free_pfn;
- /* pfn where async and sync compaction migration scanner should start */
- unsigned long compact_cached_migrate_pfn[2];
-#ifdef CONFIG_MEMORY_HOTPLUG
- /* see spanned/present_pages for more description */
- seqlock_t span_seqlock;
- struct free_area free_area[MAX_ORDER];
+ unsigned long dirty_balance_reserve;
#ifndef CONFIG_SPARSEMEM
@@ -385,71 +363,14 @@ struct zone {
unsigned long *pageblock_flags;
#endif /* CONFIG_SPARSEMEM */
-#ifdef CONFIG_COMPACTION
- * On compaction failure, 1<<compact_defer_shift compactions
- * are skipped before trying again. The number attempted since
- * last failure is tracked with compact_considered.
- unsigned int compact_considered;
- unsigned int compact_defer_shift;
- int compact_order_failed;
- ZONE_PADDING(_pad1_)
- /* Fields commonly accessed by the page reclaim scanner */
- spinlock_t lru_lock;
- struct lruvec lruvec;
- unsigned long pages_scanned; /* since last reclaim */
- unsigned long flags; /* zone flags, see below */
- /* Zone statistics */
- atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
- * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
- * this zone's LRU. Maintained by the pageout code.
- unsigned int inactive_ratio;
- ZONE_PADDING(_pad2_)
- /* Rarely used or read-mostly fields */
- * wait_table -- the array holding the hash table
- * wait_table_hash_nr_entries -- the size of the hash table array
- * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
- * The purpose of all these is to keep track of the people
- * waiting for a page to become available and make them
- * runnable again when possible. The trouble is that this
- * consumes a lot of space, especially when so few things
- * wait on pages at a given time. So instead of using
- * per-page waitqueues, we use a waitqueue hash table.
- * The bucket discipline is to sleep on the same queue when
- * colliding and wake all in that wait queue when removing.
- * When something wakes, it must check to be sure its page is
- * truly available, a la thundering herd. The cost of a
- * collision is great, but given the expected load of the
- * table, they should be so rare as to be outweighed by the
- * benefits from the saved space.
- * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
- * primary users of these fields, and in mm/page_alloc.c
- * free_area_init_core() performs the initialization of them.
+ * zone reclaim becomes active if more unmapped pages exist.
- wait_queue_head_t * wait_table;
- unsigned long wait_table_hash_nr_entries;
- unsigned long wait_table_bits;
+ unsigned long min_unmapped_pages;
+ unsigned long min_slab_pages;
+#endif /* CONFIG_NUMA */
- * Discontig memory support fields.
- struct pglist_data *zone_pgdat;
/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
unsigned long zone_start_pfn;
@@ -495,9 +416,11 @@ struct zone {
* adjust_managed_page_count() should be used instead of directly
* touching zone->managed_pages and totalram_pages.
+ unsigned long managed_pages;
unsigned long spanned_pages;
unsigned long present_pages;
- unsigned long managed_pages;
* Number of MIGRATE_RESEVE page block. To maintain for just
@@ -505,10 +428,92 @@ struct zone {
int nr_migrate_reserve_block;
+#ifdef CONFIG_MEMORY_HOTPLUG
+ /* see spanned/present_pages for more description */
+ seqlock_t span_seqlock;
- * rarely used fields:
+ * wait_table -- the array holding the hash table
+ * wait_table_hash_nr_entries -- the size of the hash table array
+ * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
+ * The purpose of all these is to keep track of the people
+ * waiting for a page to become available and make them
+ * runnable again when possible. The trouble is that this
+ * consumes a lot of space, especially when so few things
+ * wait on pages at a given time. So instead of using
+ * per-page waitqueues, we use a waitqueue hash table.
+ * The bucket discipline is to sleep on the same queue when
+ * colliding and wake all in that wait queue when removing.
+ * When something wakes, it must check to be sure its page is
+ * truly available, a la thundering herd. The cost of a
+ * collision is great, but given the expected load of the
+ * table, they should be so rare as to be outweighed by the
+ * benefits from the saved space.
+ * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
+ * primary users of these fields, and in mm/page_alloc.c
+ * free_area_init_core() performs the initialization of them.
+ wait_queue_head_t *wait_table;
+ unsigned long wait_table_hash_nr_entries;
+ unsigned long wait_table_bits;
+ ZONE_PADDING(_pad1_)
+ /* Write-intensive fields used from the page allocator */
+ /* free areas of different sizes */
+ struct free_area free_area[MAX_ORDER];
+ /* zone flags, see below */
+ unsigned long flags;
+ ZONE_PADDING(_pad2_)
+ /* Write-intensive fields used by page reclaim */
+ /* Fields commonly accessed by the page reclaim scanner */
+ spinlock_t lru_lock;
+ unsigned long pages_scanned; /* since last reclaim */
+ struct lruvec lruvec;
+ * When free pages are below this point, additional steps are taken
+ * when reading the number of free pages to avoid per-cpu counter
+ * drift allowing watermarks to be breached
+ unsigned long percpu_drift_mark;
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+ /* pfn where compaction free scanner should start */
+ unsigned long compact_cached_free_pfn;
+ /* pfn where async and sync compaction migration scanner should start */
+ unsigned long compact_cached_migrate_pfn[2];
+#ifdef CONFIG_COMPACTION
+ * On compaction failure, 1<<compact_defer_shift compactions
+ * are skipped before trying again. The number attempted since
+ * last failure is tracked with compact_considered.
+ unsigned int compact_considered;
+ unsigned int compact_defer_shift;
+ int compact_order_failed;
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+ /* Set to true when the PG_migrate_skip bits should be cleared */
+ bool compact_blockskip_flush;
+ ZONE_PADDING(_pad3_)
+ /* Zone statistics */
+ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
} ____cacheline_internodealigned_in_smp;
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1710,7 +1710,6 @@ static bool __zone_watermark_ok(struct z
/* free_pages my go negative - that's OK */
- long lowmem_reserve = z->lowmem_reserve[classzone_idx];
@@ -1725,7 +1724,7 @@ static bool __zone_watermark_ok(struct z
free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
- if (free_pages - free_cma <= min + lowmem_reserve)
+ if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
for (o = 0; o < order; o++) {
/* At the next order, this order's pages become unavailable */
@@ -3257,7 +3256,7 @@ void show_free_areas(unsigned int filter
printk("lowmem_reserve[]:");
for (i = 0; i < MAX_NR_ZONES; i++)
- printk(" %lu", zone->lowmem_reserve[i]);
+ printk(" %ld", zone->lowmem_reserve[i]);
@@ -5585,7 +5584,7 @@ static void calculate_totalreserve_pages
for_each_online_pgdat(pgdat) {
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
- unsigned long max = 0;
/* Find valid and maximum lowmem_reserve in the zone */
for (j = i; j < MAX_NR_ZONES; j++) {
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1065,10 +1065,10 @@ static void zoneinfo_show_print(struct s
zone_page_state(zone, i));
- "\n protection: (%lu",
+ "\n protection: (%ld",
zone->lowmem_reserve[0]);
for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
- seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+ seq_printf(m, ", %ld", zone->lowmem_reserve[i]);