From aa45484031ddee09b06350ab8528bfe5b2c76d1c Mon Sep 17 00:00:00 2001
From: Christoph Lameter <cl@linux.com>
Date: Thu, 9 Sep 2010 16:38:17 -0700
Subject: mm: page allocator: calculate a better estimate of NR_FREE_PAGES when memory is low and kswapd is awake

From: Christoph Lameter <cl@linux.com>

commit aa45484031ddee09b06350ab8528bfe5b2c76d1c upstream.

Ordinarily, watermark checks are based on the vmstat NR_FREE_PAGES
counter as it is cheaper than scanning a number of lists. To avoid
synchronization overhead, counter deltas are maintained on a per-cpu
basis and drained both periodically and when the delta is above a
threshold. On large CPU systems, the difference between the estimated
and real value of NR_FREE_PAGES can be very high. If NR_FREE_PAGES is
much higher than the number of pages actually free in the buddy lists,
the VM can allocate pages below the min watermark, at worst reducing
the real number of free pages to zero. Even if the OOM killer kills a
victim to free memory, it may not be able to free any memory if the
exit path itself requires a new page, resulting in livelock.
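
To get a feel for the scale of the drift, consider an illustrative
worst case (the figures below are examples, not taken from this
patch): with 64 online CPUs and a per-cpu stat_threshold of 125
pages, the refresh_zone_stat_thresholds() change below gives

    max_drift = num_online_cpus() * threshold
              = 64 * 125
              = 8000 pages, roughly 31 MiB with 4 KiB pages

of free memory that NR_FREE_PAGES may report but that the buddy
lists do not actually hold, which is ample room for allocations to
slip below the min watermark.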

This patch introduces a zone_page_state_snapshot() function (courtesy of
Christoph) that takes a slightly more accurate view of an arbitrary vmstat
counter. It is used to read NR_FREE_PAGES while kswapd is awake to avoid
the watermark being accidentally broken. The estimate is not perfect and
may result in cache line bounces but is expected to be lighter than the
IPI calls necessary to continually drain the per-cpu counters while kswapd
is awake.
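
As background, the following stand-alone userspace C sketch (not
kernel code; NCPUS, THRESHOLD and the counter_*() helpers are
invented for the example) mimics the per-cpu delta scheme and shows
how the cheap read and the snapshot read can disagree:

/* Illustrative only; names and constants are made up for this example. */
#include <stdio.h>

#define NCPUS		4
#define THRESHOLD	32	/* per-cpu delta drained above this */

static long global_count = 256;	/* like zone->vm_stat[NR_FREE_PAGES] */
static long cpu_delta[NCPUS];	/* like the per-cpu vm_stat_diff[] */

/* Cheap read: global counter only, may be stale by up to NCPUS*THRESHOLD */
static long counter_read(void)
{
	return global_count;
}

/* Snapshot read: fold the pending per-cpu deltas back in, clamp at zero */
static long counter_snapshot(void)
{
	long x = global_count;
	int cpu;

	for (cpu = 0; cpu < NCPUS; cpu++)
		x += cpu_delta[cpu];
	return x < 0 ? 0 : x;
}

/* Per-cpu update: the global counter only sees deltas above the threshold */
static void counter_add(int cpu, long v)
{
	cpu_delta[cpu] += v;
	if (cpu_delta[cpu] > THRESHOLD || cpu_delta[cpu] < -THRESHOLD) {
		global_count += cpu_delta[cpu];
		cpu_delta[cpu] = 0;
	}
}

int main(void)
{
	int cpu;

	/* each CPU "allocates" 20 pages, all below the drain threshold */
	for (cpu = 0; cpu < NCPUS; cpu++)
		counter_add(cpu, -20);

	/* prints: cheap read 256, snapshot 176 */
	printf("cheap read %ld, snapshot %ld\n",
	       counter_read(), counter_snapshot());
	return 0;
}

The patch applies the same snapshot idea only while kswapd is awake
and free pages are below percpu_drift_mark, so the extra cost of
walking the per-cpu deltas is paid only when the watermarks are
actually at risk.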

Signed-off-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>


---
 include/linux/mmzone.h |   13 +++++++++++++
 include/linux/vmstat.h |   22 ++++++++++++++++++++++
 mm/mmzone.c            |   21 +++++++++++++++++++++
 mm/page_alloc.c        |    4 ++--
 mm/vmstat.c            |   15 ++++++++++++++-
 5 files changed, 72 insertions(+), 3 deletions(-)

--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -290,6 +290,13 @@ struct zone {
 	unsigned long watermark[NR_WMARK];
 
 	/*
+	 * When free pages are below this point, additional steps are taken
+	 * when reading the number of free pages to avoid per-cpu counter
+	 * drift allowing watermarks to be breached
+	 */
+	unsigned long percpu_drift_mark;
+
+	/*
 	 * We don't know if the memory that we're going to allocate will be freeable
 	 * or/and it will be released eventually, so to avoid totally wasting several
 	 * GB of ram we must reserve some of the lower zone memory (otherwise we risk
@@ -460,6 +467,12 @@ static inline int zone_is_oom_locked(con
 	return test_bit(ZONE_OOM_LOCKED, &zone->flags);
 }
 
+#ifdef CONFIG_SMP
+unsigned long zone_nr_free_pages(struct zone *zone);
+#else
+#define zone_nr_free_pages(zone) zone_page_state(zone, NR_FREE_PAGES)
+#endif /* CONFIG_SMP */
+
 /*
  * The "priority" of VM scanning is how much of the queues we will scan in one
  * go. A value of 12 for DEF_PRIORITY implies that we will scan 1/4096th of the
--- a/include/linux/vmstat.h
+++ b/include/linux/vmstat.h
@@ -166,6 +166,28 @@ static inline unsigned long zone_page_st
 	return x;
 }
 
+/*
+ * More accurate version that also considers the currently pending
+ * deltas. For that we need to loop over all cpus to find the current
+ * deltas. There is no synchronization so the result cannot be
+ * exactly accurate either.
+ */
+static inline unsigned long zone_page_state_snapshot(struct zone *zone,
+					enum zone_stat_item item)
+{
+	long x = atomic_long_read(&zone->vm_stat[item]);
+
+#ifdef CONFIG_SMP
+	int cpu;
+	for_each_online_cpu(cpu)
+		x += zone_pcp(zone, cpu)->vm_stat_diff[item];
+
+	if (x < 0)
+		x = 0;
+#endif
+	return x;
+}
+
 extern unsigned long global_reclaimable_pages(void);
 extern unsigned long zone_reclaimable_pages(struct zone *zone);
 
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,3 +87,24 @@ int memmap_valid_within(unsigned long pf
 	return 1;
 }
 #endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
+
+#ifdef CONFIG_SMP
+/* Called when a more accurate view of NR_FREE_PAGES is needed */
+unsigned long zone_nr_free_pages(struct zone *zone)
+{
+	unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
+
+	/*
+	 * While kswapd is awake, it is considered the zone is under some
+	 * memory pressure. Under pressure, there is a risk that
+	 * per-cpu-counter-drift will allow the min watermark to be breached
+	 * potentially causing a live-lock. While kswapd is awake and
+	 * free pages are low, get a better estimate for free pages
+	 */
+	if (nr_free_pages < zone->percpu_drift_mark &&
+			!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
+		return zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+	return nr_free_pages;
+}
+#endif /* CONFIG_SMP */
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1365,7 +1365,7 @@ int zone_watermark_ok(struct zone *z, in
 {
 	/* free_pages my go negative - that's OK */
 	long min = mark;
-	long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
+	long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
 	int o;
 
 	if (alloc_flags & ALLOC_HIGH)
@@ -2250,7 +2250,7 @@ void show_free_areas(void)
 			" all_unreclaimable? %s"
 			"\n",
 			zone->name,
-			K(zone_page_state(zone, NR_FREE_PAGES)),
+			K(zone_nr_free_pages(zone)),
 			K(min_wmark_pages(zone)),
 			K(low_wmark_pages(zone)),
 			K(high_wmark_pages(zone)),
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -136,10 +136,23 @@ static void refresh_zone_stat_thresholds
 	int threshold;
 
 	for_each_populated_zone(zone) {
+		unsigned long max_drift, tolerate_drift;
+
 		threshold = calculate_threshold(zone);
 
 		for_each_online_cpu(cpu)
 			zone_pcp(zone, cpu)->stat_threshold = threshold;
+
+		/*
+		 * Only set percpu_drift_mark if there is a danger that
+		 * NR_FREE_PAGES reports the low watermark is ok when in fact
+		 * the min watermark could be breached by an allocation
+		 */
+		tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
+		max_drift = num_online_cpus() * threshold;
+		if (max_drift > tolerate_drift)
+			zone->percpu_drift_mark = high_wmark_pages(zone) +
+					max_drift;
 	}
 }
 
@@ -715,7 +728,7 @@ static void zoneinfo_show_print(struct s
 		   "\n scanned %lu"
 		   "\n spanned %lu"
 		   "\n present %lu",
-		   zone_page_state(zone, NR_FREE_PAGES),
+		   zone_nr_free_pages(zone),
 		   min_wmark_pages(zone),
 		   low_wmark_pages(zone),
 		   high_wmark_pages(zone),