From 3484b2de9499df23c4604a513b36f96326ae81ad Mon Sep 17 00:00:00 2001
From: Mel Gorman <mgorman@suse.de>
Date: Wed, 6 Aug 2014 16:07:14 -0700
Subject: mm: rearrange zone fields into read-only, page alloc, statistics and page reclaim lines

From: Mel Gorman <mgorman@suse.de>

commit 3484b2de9499df23c4604a513b36f96326ae81ad upstream.

The arrangement of struct zone has changed over time and now it has
reached the point where there is some inappropriate sharing going on.
On x86-64, for example:

o The zone->node field is shared with the zone lock and zone->node is
  accessed frequently from the page allocator due to the fair zone
  allocation policy.

o span_seqlock is almost never used but shares a line with free_area.

o Some zone statistics share a cache line with the LRU lock, so
  reclaim-intensive and allocator-intensive workloads can bounce the
  cache line on a stat update (a small sketch of this kind of sharing
  follows the list).
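
As an illustration of the kind of sharing described above, here is a
small self-contained sketch (not kernel code; the struct, the field
names and the 64-byte line size are made up for the example). It uses
offsetof() to show that three unrelated hot fields land on the same
cache line when they are simply declared next to each other:

#include <stdio.h>
#include <stddef.h>

/* Hypothetical layout: a read-mostly field, a lock and some counters
 * are declared back to back, so they share one 64-byte line and every
 * write to the counters invalidates the line the other CPUs read. */
struct mixed {
	unsigned long node;	/* read on every allocation */
	int lock;		/* stands in for the zone spinlock */
	long stats[4];		/* frequently updated counters */
};

int main(void)
{
	/* Dividing each offset by the line size shows which 64-byte
	 * cache line a field occupies; here all three print line 0. */
	printf("node  -> line %zu\n", offsetof(struct mixed, node) / 64);
	printf("lock  -> line %zu\n", offsetof(struct mixed, lock) / 64);
	printf("stats -> line %zu\n", offsetof(struct mixed, stats) / 64);
	return 0;
}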

This patch rearranges struct zone to put the read-only and read-mostly
fields together and then splits the page-allocator-intensive fields, the
zone statistics and the page-reclaim-intensive fields into their own
cache lines. Note that the type of lowmem_reserve changes because the
watermark calculations are signed, which avoids a signed/unsigned
conversion there.
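
The effect of the rearrangement is easiest to see in a similar sketch
(illustrative only, not the real struct zone: the field names, the
64-byte line size and the GCC aligned attribute are assumptions). Each
write-intensive group is forced onto its own cache line, which is what
the ZONE_PADDING() separators in the diff below achieve, and
lowmem_reserve is declared signed as in the type change just described:

#include <stdio.h>
#include <stddef.h>

#define CACHELINE 64

struct zone_like {
	/* Read-mostly fields, looked at on every allocation */
	unsigned long watermark_min;
	long lowmem_reserve[4];		/* signed, as in the patch */

	/* Write-intensive allocator fields start a new cache line */
	int alloc_lock __attribute__((aligned(CACHELINE)));
	unsigned long nr_free;

	/* Write-intensive reclaim fields and statistics start another */
	int lru_lock __attribute__((aligned(CACHELINE)));
	long vm_stat[8];
};

int main(void)
{
	/* Prints line 0, line 1 and line 2: the groups no longer share */
	printf("watermark_min -> line %zu\n",
	       offsetof(struct zone_like, watermark_min) / CACHELINE);
	printf("alloc_lock    -> line %zu\n",
	       offsetof(struct zone_like, alloc_lock) / CACHELINE);
	printf("lru_lock      -> line %zu\n",
	       offsetof(struct zone_like, lru_lock) / CACHELINE);
	return 0;
}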

On the test configuration I used, the overall size of struct zone shrunk
by one cache line. On smaller machines, this is not likely to be
noticeable. However, on a 4-node NUMA machine running tiobench, the
system CPU overhead is reduced by this patch.

                  3.16.0-rc3      3.16.0-rc3
                     vanilla  rearrange-v5r9
User                  746.94          759.78
System              65336.22        58350.98
Elapsed             27553.52        27282.02

Signed-off-by: Mel Gorman <mgorman@suse.de>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 include/linux/mmzone.h |  205 +++++++++++++++++++++++++------------------------
 mm/page_alloc.c        |    7 -
 mm/vmstat.c            |    4
 3 files changed, 110 insertions(+), 106 deletions(-)

--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -321,19 +321,12 @@ enum zone_type {
 #ifndef __GENERATING_BOUNDS_H

 struct zone {
-	/* Fields commonly accessed by the page allocator */
+	/* Read-mostly fields */

 	/* zone watermarks, access with *_wmark_pages(zone) macros */
 	unsigned long watermark[NR_WMARK];

 	/*
-	 * When free pages are below this point, additional steps are taken
-	 * when reading the number of free pages to avoid per-cpu counter
-	 * drift allowing watermarks to be breached
-	 */
-	unsigned long percpu_drift_mark;
-
-	/*
 	 * We don't know if the memory that we're going to allocate will be freeable
 	 * or/and it will be released eventually, so to avoid totally wasting several
 	 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
@@ -341,41 +334,26 @@ struct zone {
 	 * on the higher zones). This array is recalculated at runtime if the
 	 * sysctl_lowmem_reserve_ratio sysctl changes.
 	 */
-	unsigned long lowmem_reserve[MAX_NR_ZONES];
-
-	/*
-	 * This is a per-zone reserve of pages that should not be
-	 * considered dirtyable memory.
-	 */
-	unsigned long dirty_balance_reserve;
+	long lowmem_reserve[MAX_NR_ZONES];

 #ifdef CONFIG_NUMA
 	int node;
+#endif
+
 	/*
-	 * zone reclaim becomes active if more unmapped pages exist.
+	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
+	 * this zone's LRU. Maintained by the pageout code.
 	 */
-	unsigned long min_unmapped_pages;
-	unsigned long min_slab_pages;
-#endif
+	unsigned int inactive_ratio;
+
+	struct pglist_data *zone_pgdat;
 	struct per_cpu_pageset __percpu *pageset;
+
 	/*
-	 * free areas of different sizes
+	 * This is a per-zone reserve of pages that should not be
+	 * considered dirtyable memory.
 	 */
-	spinlock_t lock;
-#if defined CONFIG_COMPACTION || defined CONFIG_CMA
-	/* Set to true when the PG_migrate_skip bits should be cleared */
-	bool compact_blockskip_flush;
-
-	/* pfn where compaction free scanner should start */
-	unsigned long compact_cached_free_pfn;
-	/* pfn where async and sync compaction migration scanner should start */
-	unsigned long compact_cached_migrate_pfn[2];
-#endif
-#ifdef CONFIG_MEMORY_HOTPLUG
-	/* see spanned/present_pages for more description */
-	seqlock_t span_seqlock;
-#endif
-	struct free_area free_area[MAX_ORDER];
+	unsigned long dirty_balance_reserve;

 #ifndef CONFIG_SPARSEMEM
 	/*
@@ -385,71 +363,14 @@ struct zone {
 	unsigned long *pageblock_flags;
 #endif /* CONFIG_SPARSEMEM */

-#ifdef CONFIG_COMPACTION
-	/*
-	 * On compaction failure, 1<<compact_defer_shift compactions
-	 * are skipped before trying again. The number attempted since
-	 * last failure is tracked with compact_considered.
-	 */
-	unsigned int compact_considered;
-	unsigned int compact_defer_shift;
-	int compact_order_failed;
-#endif
-
-	ZONE_PADDING(_pad1_)
-
-	/* Fields commonly accessed by the page reclaim scanner */
-	spinlock_t lru_lock;
-	struct lruvec lruvec;
-
-	unsigned long pages_scanned; /* since last reclaim */
-	unsigned long flags; /* zone flags, see below */
-
-	/* Zone statistics */
-	atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
-
-	/*
-	 * The target ratio of ACTIVE_ANON to INACTIVE_ANON pages on
-	 * this zone's LRU. Maintained by the pageout code.
-	 */
-	unsigned int inactive_ratio;
-
-
-	ZONE_PADDING(_pad2_)
-	/* Rarely used or read-mostly fields */
-
+#ifdef CONFIG_NUMA
 	/*
-	 * wait_table -- the array holding the hash table
-	 * wait_table_hash_nr_entries -- the size of the hash table array
-	 * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
-	 *
-	 * The purpose of all these is to keep track of the people
-	 * waiting for a page to become available and make them
-	 * runnable again when possible. The trouble is that this
-	 * consumes a lot of space, especially when so few things
-	 * wait on pages at a given time. So instead of using
-	 * per-page waitqueues, we use a waitqueue hash table.
-	 *
-	 * The bucket discipline is to sleep on the same queue when
-	 * colliding and wake all in that wait queue when removing.
-	 * When something wakes, it must check to be sure its page is
-	 * truly available, a la thundering herd. The cost of a
-	 * collision is great, but given the expected load of the
-	 * table, they should be so rare as to be outweighed by the
-	 * benefits from the saved space.
-	 *
-	 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
-	 * primary users of these fields, and in mm/page_alloc.c
-	 * free_area_init_core() performs the initialization of them.
+	 * zone reclaim becomes active if more unmapped pages exist.
 	 */
-	wait_queue_head_t * wait_table;
-	unsigned long wait_table_hash_nr_entries;
-	unsigned long wait_table_bits;
+	unsigned long min_unmapped_pages;
+	unsigned long min_slab_pages;
+#endif /* CONFIG_NUMA */

-	/*
-	 * Discontig memory support fields.
-	 */
-	struct pglist_data *zone_pgdat;
 	/* zone_start_pfn == zone_start_paddr >> PAGE_SHIFT */
 	unsigned long zone_start_pfn;

@@ -495,9 +416,11 @@ struct zone {
 	 * adjust_managed_page_count() should be used instead of directly
 	 * touching zone->managed_pages and totalram_pages.
 	 */
+	unsigned long managed_pages;
 	unsigned long spanned_pages;
 	unsigned long present_pages;
-	unsigned long managed_pages;
+
+	const char *name;

 	/*
 	 * Number of MIGRATE_RESEVE page block. To maintain for just
@@ -505,10 +428,92 @@ struct zone {
 	 */
 	int nr_migrate_reserve_block;

+#ifdef CONFIG_MEMORY_HOTPLUG
+	/* see spanned/present_pages for more description */
+	seqlock_t span_seqlock;
+#endif
+
 	/*
-	 * rarely used fields:
+	 * wait_table -- the array holding the hash table
+	 * wait_table_hash_nr_entries -- the size of the hash table array
+	 * wait_table_bits -- wait_table_size == (1 << wait_table_bits)
+	 *
+	 * The purpose of all these is to keep track of the people
+	 * waiting for a page to become available and make them
+	 * runnable again when possible. The trouble is that this
+	 * consumes a lot of space, especially when so few things
+	 * wait on pages at a given time. So instead of using
+	 * per-page waitqueues, we use a waitqueue hash table.
+	 *
+	 * The bucket discipline is to sleep on the same queue when
+	 * colliding and wake all in that wait queue when removing.
+	 * When something wakes, it must check to be sure its page is
+	 * truly available, a la thundering herd. The cost of a
+	 * collision is great, but given the expected load of the
+	 * table, they should be so rare as to be outweighed by the
+	 * benefits from the saved space.
+	 *
+	 * __wait_on_page_locked() and unlock_page() in mm/filemap.c, are the
+	 * primary users of these fields, and in mm/page_alloc.c
+	 * free_area_init_core() performs the initialization of them.
 	 */
-	const char *name;
+	wait_queue_head_t *wait_table;
+	unsigned long wait_table_hash_nr_entries;
+	unsigned long wait_table_bits;
+
+	ZONE_PADDING(_pad1_)
+
+	/* Write-intensive fields used from the page allocator */
+	spinlock_t lock;
+
+	/* free areas of different sizes */
+	struct free_area free_area[MAX_ORDER];
+
+	/* zone flags, see below */
+	unsigned long flags;
+
+	ZONE_PADDING(_pad2_)
+
+	/* Write-intensive fields used by page reclaim */
+
+	/* Fields commonly accessed by the page reclaim scanner */
+	spinlock_t lru_lock;
+	unsigned long pages_scanned; /* since last reclaim */
+	struct lruvec lruvec;
+
+	/*
+	 * When free pages are below this point, additional steps are taken
+	 * when reading the number of free pages to avoid per-cpu counter
+	 * drift allowing watermarks to be breached
+	 */
+	unsigned long percpu_drift_mark;
+
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+	/* pfn where compaction free scanner should start */
+	unsigned long compact_cached_free_pfn;
+	/* pfn where async and sync compaction migration scanner should start */
+	unsigned long compact_cached_migrate_pfn[2];
+#endif
+
+#ifdef CONFIG_COMPACTION
+	/*
+	 * On compaction failure, 1<<compact_defer_shift compactions
+	 * are skipped before trying again. The number attempted since
+	 * last failure is tracked with compact_considered.
+	 */
+	unsigned int compact_considered;
+	unsigned int compact_defer_shift;
+	int compact_order_failed;
+#endif
+
+#if defined CONFIG_COMPACTION || defined CONFIG_CMA
+	/* Set to true when the PG_migrate_skip bits should be cleared */
+	bool compact_blockskip_flush;
+#endif
+
+	ZONE_PADDING(_pad3_)
+	/* Zone statistics */
+	atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS];
 } ____cacheline_internodealigned_in_smp;

 typedef enum {
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1710,7 +1710,6 @@ static bool __zone_watermark_ok(struct z
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long lowmem_reserve = z->lowmem_reserve[classzone_idx];
 	int o;
 	long free_cma = 0;

@@ -1725,7 +1724,7 @@ static bool __zone_watermark_ok(struct z
 	free_cma = zone_page_state(z, NR_FREE_CMA_PAGES);
 #endif

-	if (free_pages - free_cma <= min + lowmem_reserve)
+	if (free_pages - free_cma <= min + z->lowmem_reserve[classzone_idx])
 		return false;
 	for (o = 0; o < order; o++) {
 		/* At the next order, this order's pages become unavailable */
@@ -3257,7 +3256,7 @@ void show_free_areas(unsigned int filter
 			);
 		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
-			printk(" %lu", zone->lowmem_reserve[i]);
+			printk(" %ld", zone->lowmem_reserve[i]);
 		printk("\n");
 	}

@@ -5585,7 +5584,7 @@ static void calculate_totalreserve_pages
 	for_each_online_pgdat(pgdat) {
 		for (i = 0; i < MAX_NR_ZONES; i++) {
 			struct zone *zone = pgdat->node_zones + i;
-			unsigned long max = 0;
+			long max = 0;

 			/* Find valid and maximum lowmem_reserve in the zone */
 			for (j = i; j < MAX_NR_ZONES; j++) {
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1065,10 +1065,10 @@ static void zoneinfo_show_print(struct s
 			zone_page_state(zone, i));

 	seq_printf(m,
-		   "\n protection: (%lu",
+		   "\n protection: (%ld",
 		   zone->lowmem_reserve[0]);
 	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
-		seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
+		seq_printf(m, ", %ld", zone->lowmem_reserve[i]);
 	seq_printf(m,
 		   ")"
 		   "\n pagesets");