From 47504ee04b9241548ae2c28be7d0b01cff3b7aa6 Mon Sep 17 00:00:00 2001
From: Dennis Zhou <dennisszhou@gmail.com>
Date: Fri, 16 Feb 2018 12:07:19 -0600
Subject: percpu: add __GFP_NORETRY semantics to the percpu balancing path

From: Dennis Zhou <dennisszhou@gmail.com>

commit 47504ee04b9241548ae2c28be7d0b01cff3b7aa6 upstream.

Percpu memory using the vmalloc-area-based chunk allocator lazily
populates chunks by first requesting the full virtual address space
required for the chunk and subsequently adding pages as allocations
come through. To ensure atomic allocations can succeed, a workqueue
item is used to maintain a minimum number of empty pages. In certain
scenarios, such as the one reported in [1], it is possible that
physical memory becomes quite scarce, which can result in either a
rather long time spent trying to find free pages or, worse, a kernel
panic.

This patch adds support for __GFP_NORETRY and __GFP_NOWARN, passing
them through to the underlying allocators. This should prevent any
unnecessary panics potentially caused by the workqueue item. The gfp
flags are passed around as additional flags rather than as a full set
of flags. The next patch will change these to caller-passed semantics.

V2:
Added const modifier to gfp flags in the balance path.
Removed an extra whitespace.

[1] https://lkml.org/lkml/2018/2/12/551

Signed-off-by: Dennis Zhou <dennisszhou@gmail.com>
Suggested-by: Daniel Borkmann <daniel@iogearbox.net>
Reported-by: syzbot+adb03f3f0bb57ce3acda@syzkaller.appspotmail.com
Acked-by: Christoph Lameter <cl@linux.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 mm/percpu-km.c |    8 ++++----
 mm/percpu-vm.c |   18 +++++++++++-------
 mm/percpu.c    |   45 ++++++++++++++++++++++++++++-----------------
 3 files changed, 43 insertions(+), 28 deletions(-)

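For quick orientation before the hunks: the whole change reduces to the
gfp-composition pattern sketched below. This is a condensed illustration
assembled from the diff that follows (function bodies abbreviated with
"..."), not code added on top of the patch. The background balance path
passes __GFP_NORETRY | __GFP_NOWARN as extra flags, and each callee ORs
them into the base mask it already used.

	/*
	 * Balance/background path: population work should fail fast and
	 * quietly instead of triggering the OOM killer.
	 */
	static void pcpu_balance_workfn(struct work_struct *work)
	{
		const gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
		...
		ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
		...
		chunk = pcpu_create_chunk(gfp);
	}

	/*
	 * Callees treat @gfp as additional flags and OR in their own base
	 * mask; the regular allocation path passes 0 for now.
	 */
	static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
	{
		if (size <= PAGE_SIZE)
			return kzalloc(size, gfp | GFP_KERNEL);
		return __vmalloc(size, gfp | GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL);
	}
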
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -34,7 +34,7 @@
 #include <linux/log2.h>

 static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
-			       int page_start, int page_end)
+			       int page_start, int page_end, gfp_t gfp)
 {
 	return 0;
 }
@@ -45,18 +45,18 @@ static void pcpu_depopulate_chunk(struct
 	/* nada */
 }

-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
 {
 	const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
 	struct pcpu_chunk *chunk;
 	struct page *pages;
 	int i;

-	chunk = pcpu_alloc_chunk();
+	chunk = pcpu_alloc_chunk(gfp);
 	if (!chunk)
 		return NULL;

-	pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
+	pages = alloc_pages(gfp | GFP_KERNEL, order_base_2(nr_pages));
 	if (!pages) {
 		pcpu_free_chunk(chunk);
 		return NULL;
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -37,7 +37,7 @@ static struct page **pcpu_get_pages(void
 	lockdep_assert_held(&pcpu_alloc_mutex);

 	if (!pages)
-		pages = pcpu_mem_zalloc(pages_size);
+		pages = pcpu_mem_zalloc(pages_size, 0);
 	return pages;
 }

@@ -73,18 +73,21 @@ static void pcpu_free_pages(struct pcpu_
  * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
  * @page_start: page index of the first page to be allocated
  * @page_end: page index of the last page to be allocated + 1
+ * @gfp: allocation flags passed to the underlying allocator
  *
  * Allocate pages [@page_start,@page_end) into @pages for all units.
  * The allocation is for @chunk. Percpu core doesn't care about the
  * content of @pages and will pass it verbatim to pcpu_map_pages().
  */
 static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
-			    struct page **pages, int page_start, int page_end)
+			    struct page **pages, int page_start, int page_end,
+			    gfp_t gfp)
 {
-	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
 	unsigned int cpu, tcpu;
 	int i;

+	gfp |= GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+
 	for_each_possible_cpu(cpu) {
 		for (i = page_start; i < page_end; i++) {
 			struct page **pagep = &pages[pcpu_page_idx(cpu, i)];
@@ -262,6 +265,7 @@ static void pcpu_post_map_flush(struct p
  * @chunk: chunk of interest
  * @page_start: the start page
  * @page_end: the end page
+ * @gfp: allocation flags passed to the underlying memory allocator
  *
  * For each cpu, populate and map pages [@page_start,@page_end) into
  * @chunk.
@@ -270,7 +274,7 @@ static void pcpu_post_map_flush(struct p
  * pcpu_alloc_mutex, does GFP_KERNEL allocation.
  */
 static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
-			       int page_start, int page_end)
+			       int page_start, int page_end, gfp_t gfp)
 {
 	struct page **pages;

@@ -278,7 +282,7 @@ static int pcpu_populate_chunk(struct pc
 	if (!pages)
 		return -ENOMEM;

-	if (pcpu_alloc_pages(chunk, pages, page_start, page_end))
+	if (pcpu_alloc_pages(chunk, pages, page_start, page_end, gfp))
 		return -ENOMEM;

 	if (pcpu_map_pages(chunk, pages, page_start, page_end)) {
@@ -325,12 +329,12 @@ static void pcpu_depopulate_chunk(struct
 	pcpu_free_pages(chunk, pages, page_start, page_end);
 }

-static struct pcpu_chunk *pcpu_create_chunk(void)
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
 {
 	struct pcpu_chunk *chunk;
 	struct vm_struct **vms;

-	chunk = pcpu_alloc_chunk();
+	chunk = pcpu_alloc_chunk(gfp);
 	if (!chunk)
 		return NULL;

--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -447,10 +447,12 @@ static void pcpu_next_fit_region(struct
 /**
  * pcpu_mem_zalloc - allocate memory
  * @size: bytes to allocate
+ * @gfp: allocation flags
  *
  * Allocate @size bytes. If @size is smaller than PAGE_SIZE,
- * kzalloc() is used; otherwise, vzalloc() is used. The returned
- * memory is always zeroed.
+ * kzalloc() is used; otherwise, the equivalent of vzalloc() is used.
+ * This is to facilitate passing through whitelisted flags. The
+ * returned memory is always zeroed.
  *
  * CONTEXT:
  * Does GFP_KERNEL allocation.
@@ -458,15 +460,16 @@ static void pcpu_next_fit_region(struct
  * RETURNS:
  * Pointer to the allocated area on success, NULL on failure.
  */
-static void *pcpu_mem_zalloc(size_t size)
+static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
 {
 	if (WARN_ON_ONCE(!slab_is_available()))
 		return NULL;

 	if (size <= PAGE_SIZE)
-		return kzalloc(size, GFP_KERNEL);
+		return kzalloc(size, gfp | GFP_KERNEL);
 	else
-		return vzalloc(size);
+		return __vmalloc(size, gfp | GFP_KERNEL | __GFP_ZERO,
+				 PAGE_KERNEL);
 }

 /**
@@ -1154,12 +1157,12 @@ static struct pcpu_chunk * __init pcpu_a
 	return chunk;
 }

-static struct pcpu_chunk *pcpu_alloc_chunk(void)
+static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
 {
 	struct pcpu_chunk *chunk;
 	int region_bits;

-	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size);
+	chunk = pcpu_mem_zalloc(pcpu_chunk_struct_size, gfp);
 	if (!chunk)
 		return NULL;

@@ -1168,17 +1171,17 @@ static struct pcpu_chunk *pcpu_alloc_chu
 	region_bits = pcpu_chunk_map_bits(chunk);

 	chunk->alloc_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits) *
-					   sizeof(chunk->alloc_map[0]));
+					   sizeof(chunk->alloc_map[0]), gfp);
 	if (!chunk->alloc_map)
 		goto alloc_map_fail;

 	chunk->bound_map = pcpu_mem_zalloc(BITS_TO_LONGS(region_bits + 1) *
-					   sizeof(chunk->bound_map[0]));
+					   sizeof(chunk->bound_map[0]), gfp);
 	if (!chunk->bound_map)
 		goto bound_map_fail;

 	chunk->md_blocks = pcpu_mem_zalloc(pcpu_chunk_nr_blocks(chunk) *
-					   sizeof(chunk->md_blocks[0]));
+					   sizeof(chunk->md_blocks[0]), gfp);
 	if (!chunk->md_blocks)
 		goto md_blocks_fail;

@@ -1277,9 +1280,10 @@ static void pcpu_chunk_depopulated(struc
  * pcpu_addr_to_page - translate address to physical address
  * pcpu_verify_alloc_info - check alloc_info is acceptable during init
  */
-static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size);
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size,
+			       gfp_t gfp);
 static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size);
-static struct pcpu_chunk *pcpu_create_chunk(void);
+static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
 static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
 static struct page *pcpu_addr_to_page(void *addr);
 static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -1421,7 +1425,7 @@ restart:
 	}

 	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
-		chunk = pcpu_create_chunk();
+		chunk = pcpu_create_chunk(0);
 		if (!chunk) {
 			err = "failed to allocate new chunk";
 			goto fail;
@@ -1450,7 +1454,7 @@ area_found:
 					   page_start, page_end) {
 			WARN_ON(chunk->immutable);

-			ret = pcpu_populate_chunk(chunk, rs, re);
+			ret = pcpu_populate_chunk(chunk, rs, re, 0);

 			spin_lock_irqsave(&pcpu_lock, flags);
 			if (ret) {
@@ -1561,10 +1565,17 @@ void __percpu *__alloc_reserved_percpu(s
  * pcpu_balance_workfn - manage the amount of free chunks and populated pages
  * @work: unused
  *
- * Reclaim all fully free chunks except for the first one.
+ * Reclaim all fully free chunks except for the first one. This is also
+ * responsible for maintaining the pool of empty populated pages. However,
+ * it is possible that this is called when physical memory is scarce causing
+ * OOM killer to be triggered. We should avoid doing so until an actual
+ * allocation causes the failure as it is possible that requests can be
+ * serviced from already backed regions.
  */
 static void pcpu_balance_workfn(struct work_struct *work)
 {
+	/* gfp flags passed to underlying allocators */
+	const gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;
 	LIST_HEAD(to_free);
 	struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
 	struct pcpu_chunk *chunk, *next;
@@ -1645,7 +1656,7 @@ retry_pop:
 					   chunk->nr_pages) {
 			int nr = min(re - rs, nr_to_pop);

-			ret = pcpu_populate_chunk(chunk, rs, rs + nr);
+			ret = pcpu_populate_chunk(chunk, rs, rs + nr, gfp);
 			if (!ret) {
 				nr_to_pop -= nr;
 				spin_lock_irq(&pcpu_lock);
@@ -1662,7 +1673,7 @@ retry_pop:

 	if (nr_to_pop) {
 		/* ran out of chunks to populate, create a new one and retry */
-		chunk = pcpu_create_chunk();
+		chunk = pcpu_create_chunk(gfp);
 		if (chunk) {
 			spin_lock_irq(&pcpu_lock);
 			pcpu_chunk_relocate(chunk, -1);