Commit: 2cb7cef9
1 | From: Peter Zijlstra <a.p.zijlstra@chello.nl> |
2 | Subject: mm: emergency pool | |
3 | Patch-mainline: No | |
4 | References: FATE#303834 | |
5 | ||
6 | Provide means to reserve a specific amount of pages. | |
7 | ||
8 | The emergency pool is separated from the min watermark because ALLOC_HARDER | |
9 | and ALLOC_HIGH modify the watermark in a relative way and thus do not ensure | |
10 | a strict minimum. | |
11 | ||
12 | Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> | |
13 | Acked-by: Neil Brown <neilb@suse.de> | |
14 | Acked-by: Suresh Jayaraman <sjayaraman@suse.de> | |
15 | ||
16 | --- | |
17 | include/linux/mmzone.h | 6 ++- | |
18 | mm/page_alloc.c | 86 ++++++++++++++++++++++++++++++++++++++++++------- | |
19 | mm/vmstat.c | 6 +-- | |
20 | 3 files changed, 83 insertions(+), 15 deletions(-) | |
21 | ||
22 | --- a/include/linux/mmzone.h | |
23 | +++ b/include/linux/mmzone.h | |
24 | @@ -206,7 +206,10 @@ enum zone_type { | |
25 | ||
26 | struct zone { | |
27 | /* Fields commonly accessed by the page allocator */ | |
28 | - unsigned long pages_min, pages_low, pages_high; | |
29 | + unsigned long pages_high; /* we stop kswapd */ | |
30 | + unsigned long pages_low; /* we wake up kswapd */ | |
31 | + unsigned long pages_min; /* we enter direct reclaim */ | |
32 | + unsigned long pages_emerg; /* emergency pool */ | |
33 | /* | |
34 | * We don't know if the memory that we're going to allocate will be freeable | |
35 | * or/and it will be released eventually, so to avoid totally wasting several | |
36 | @@ -674,6 +677,7 @@ int sysctl_min_unmapped_ratio_sysctl_han | |
37 | struct file *, void __user *, size_t *, loff_t *); | |
38 | int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *, int, | |
39 | struct file *, void __user *, size_t *, loff_t *); | |
40 | +int adjust_memalloc_reserve(int pages); | |
41 | ||
42 | extern int numa_zonelist_order_handler(struct ctl_table *, int, | |
43 | struct file *, void __user *, size_t *, loff_t *); | |
44 | --- a/mm/page_alloc.c | |
45 | +++ b/mm/page_alloc.c | |
46 | @@ -122,6 +122,8 @@ static char * const zone_names[MAX_NR_ZO | |
47 | ||
48 | static DEFINE_SPINLOCK(min_free_lock); | |
49 | int min_free_kbytes = 1024; | |
50 | +static DEFINE_MUTEX(var_free_mutex); | |
51 | +int var_free_kbytes; | |
52 | ||
53 | unsigned long __meminitdata nr_kernel_pages; | |
54 | unsigned long __meminitdata nr_all_pages; | |
55 | @@ -1241,7 +1243,7 @@ int zone_watermark_ok(struct zone *z, in | |
56 | if (alloc_flags & ALLOC_HARDER) | |
57 | min -= min / 4; | |
58 | ||
59 | - if (free_pages <= min + z->lowmem_reserve[classzone_idx]) | |
60 | + if (free_pages <= min+z->lowmem_reserve[classzone_idx]+z->pages_emerg) | |
61 | return 0; | |
62 | for (o = 0; o < order; o++) { | |
63 | /* At the next order, this order's pages become unavailable */ | |
64 | @@ -1508,7 +1510,7 @@ __alloc_pages_internal(gfp_t gfp_mask, u | |
65 | struct reclaim_state reclaim_state; | |
66 | struct task_struct *p = current; | |
67 | int do_retry; | |
68 | - int alloc_flags; | |
69 | + int alloc_flags = 0; | |
70 | unsigned long did_some_progress; | |
71 | unsigned long pages_reclaimed = 0; | |
72 | ||
73 | @@ -1679,9 +1681,9 @@ nopage: | |
74 | printk(KERN_INFO "everything is working fine. Allocations from irqs cannot be\n"); | |
75 | printk(KERN_INFO "perfectly reliable and the kernel is designed to handle that.\n"); | |
76 | } | |
77 | - printk(KERN_INFO "%s: page allocation failure." | |
78 | - " order:%d, mode:0x%x\n", | |
79 | - p->comm, order, gfp_mask); | |
80 | + printk(KERN_WARNING "%s: page allocation failure." | |
81 | + " order:%d, mode:0x%x, alloc_flags:0x%x, pflags:0x%x\n", | |
82 | + p->comm, order, gfp_mask, alloc_flags, p->flags); | |
83 | dump_stack(); | |
84 | show_mem(); | |
85 | } | |
86 | @@ -1945,9 +1947,9 @@ void show_free_areas(void) | |
87 | "\n", | |
88 | zone->name, | |
89 | K(zone_page_state(zone, NR_FREE_PAGES)), | |
90 | - K(zone->pages_min), | |
91 | - K(zone->pages_low), | |
92 | - K(zone->pages_high), | |
93 | + K(zone->pages_emerg + zone->pages_min), | |
94 | + K(zone->pages_emerg + zone->pages_low), | |
95 | + K(zone->pages_emerg + zone->pages_high), | |
96 | K(zone_page_state(zone, NR_ACTIVE)), | |
97 | K(zone_page_state(zone, NR_INACTIVE)), | |
98 | K(zone->present_pages), | |
99 | @@ -4211,7 +4213,7 @@ static void calculate_totalreserve_pages | |
100 | } | |
101 | ||
102 | /* we treat pages_high as reserved pages. */ | |
103 | - max += zone->pages_high; | |
104 | + max += zone->pages_high + zone->pages_emerg; | |
105 | ||
106 | if (max > zone->present_pages) | |
107 | max = zone->present_pages; | |
108 | @@ -4268,7 +4270,8 @@ static void setup_per_zone_lowmem_reserv | |
109 | */ | |
110 | static void __setup_per_zone_pages_min(void) | |
111 | { | |
112 | - unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | |
113 | + unsigned pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); | |
114 | + unsigned pages_emerg = var_free_kbytes >> (PAGE_SHIFT - 10); | |
115 | unsigned long lowmem_pages = 0; | |
116 | struct zone *zone; | |
117 | unsigned long flags; | |
118 | @@ -4280,11 +4283,13 @@ static void __setup_per_zone_pages_min(v | |
119 | } | |
120 | ||
121 | for_each_zone(zone) { | |
122 | - u64 tmp; | |
123 | + u64 tmp, tmp_emerg; | |
124 | ||
125 | spin_lock_irqsave(&zone->lock, flags); | |
126 | tmp = (u64)pages_min * zone->present_pages; | |
127 | do_div(tmp, lowmem_pages); | |
128 | + tmp_emerg = (u64)pages_emerg * zone->present_pages; | |
129 | + do_div(tmp_emerg, lowmem_pages); | |
130 | if (is_highmem(zone)) { | |
131 | /* | |
132 | * __GFP_HIGH and PF_MEMALLOC allocations usually don't | |
133 | @@ -4303,12 +4308,14 @@ static void __setup_per_zone_pages_min(v | |
134 | if (min_pages > 128) | |
135 | min_pages = 128; | |
136 | zone->pages_min = min_pages; | |
137 | + zone->pages_emerg = 0; | |
138 | } else { | |
139 | /* | |
140 | * If it's a lowmem zone, reserve a number of pages | |
141 | * proportionate to the zone's size. | |
142 | */ | |
143 | zone->pages_min = tmp; | |
144 | + zone->pages_emerg = tmp_emerg; | |
145 | } | |
146 | ||
147 | zone->pages_low = zone->pages_min + (tmp >> 2); | |
148 | @@ -4330,6 +4337,63 @@ void setup_per_zone_pages_min(void) | |
149 | spin_unlock_irqrestore(&min_free_lock, flags); | |
150 | } | |
151 | ||
152 | +static void __adjust_memalloc_reserve(int pages) | |
153 | +{ | |
154 | + var_free_kbytes += pages << (PAGE_SHIFT - 10); | |
155 | + BUG_ON(var_free_kbytes < 0); | |
156 | + setup_per_zone_pages_min(); | |
157 | +} | |
158 | + | |
159 | +static int test_reserve_limits(void) | |
160 | +{ | |
161 | + struct zone *zone; | |
162 | + int node; | |
163 | + | |
164 | + for_each_zone(zone) | |
165 | + wakeup_kswapd(zone, 0); | |
166 | + | |
167 | + for_each_online_node(node) { | |
168 | + struct page *page = alloc_pages_node(node, GFP_KERNEL, 0); | |
169 | + if (!page) | |
170 | + return -ENOMEM; | |
171 | + | |
172 | + __free_page(page); | |
173 | + } | |
174 | + | |
175 | + return 0; | |
176 | +} | |
177 | + | |
178 | +/** | |
179 | + * adjust_memalloc_reserve - adjust the memalloc reserve | |
180 | + * @pages: number of pages to add | |
181 | + * | |
182 | + * It adds a number of pages to the memalloc reserve; if | |
183 | + * the number was positive it kicks reclaim into action to | |
184 | + * satisfy the higher watermarks. | |
185 | + * | |
186 | + * returns -ENOMEM when it failed to satisfy the watermarks. | |
187 | + */ | |
188 | +int adjust_memalloc_reserve(int pages) | |
189 | +{ | |
190 | + int err = 0; | |
191 | + | |
192 | + mutex_lock(&var_free_mutex); | |
193 | + __adjust_memalloc_reserve(pages); | |
194 | + if (pages > 0) { | |
195 | + err = test_reserve_limits(); | |
196 | + if (err) { | |
197 | + __adjust_memalloc_reserve(-pages); | |
198 | + goto unlock; | |
199 | + } | |
200 | + } | |
201 | + printk(KERN_DEBUG "Emergency reserve: %d\n", var_free_kbytes); | |
202 | + | |
203 | +unlock: | |
204 | + mutex_unlock(&var_free_mutex); | |
205 | + return err; | |
206 | +} | |
207 | +EXPORT_SYMBOL_GPL(adjust_memalloc_reserve); | |
208 | + | |
209 | /* | |
210 | * Initialise min_free_kbytes. | |
211 | * | |
212 | --- a/mm/vmstat.c | |
213 | +++ b/mm/vmstat.c | |
214 | @@ -692,9 +692,9 @@ static void zoneinfo_show_print(struct s | |
215 | "\n spanned %lu" | |
216 | "\n present %lu", | |
217 | zone_page_state(zone, NR_FREE_PAGES), | |
218 | - zone->pages_min, | |
219 | - zone->pages_low, | |
220 | - zone->pages_high, | |
221 | + zone->pages_emerg + zone->pages_min, | |
222 | + zone->pages_emerg + zone->pages_low, | |
223 | + zone->pages_emerg + zone->pages_high, | |
224 | zone->pages_scanned, | |
225 | zone->nr_scan_active, zone->nr_scan_inactive, | |
226 | zone->spanned_pages, |