// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1993  Linus Torvalds
 *  Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
 *  SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
 *  Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
 *  Numa awareness, Christoph Lameter, SGI, June 2005
 *  Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
 */

#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/highmem.h>
#include <linux/sched/signal.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/interrupt.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/set_memory.h>
#include <linux/debugobjects.h>
#include <linux/kallsyms.h>
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/llist.h>
#include <linux/bitops.h>
#include <linux/rbtree_augmented.h>
#include <linux/overflow.h>
#include <linux/pgtable.h>
#include <linux/uaccess.h>
#include <linux/hugetlb.h>
#include <asm/tlbflush.h>
#include <asm/shmparam.h>

#include "internal.h"
#include "pgalloc-track.h"

#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
static bool __ro_after_init vmap_allow_huge = true;

static int __init set_nohugevmalloc(char *str)
{
	vmap_allow_huge = false;
	return 0;
}
early_param("nohugevmalloc", set_nohugevmalloc);
#else	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */
static const bool vmap_allow_huge = false;
#endif	/* CONFIG_HAVE_ARCH_HUGE_VMALLOC */

bool is_vmalloc_addr(const void *x)
{
	unsigned long addr = (unsigned long)x;

	return addr >= VMALLOC_START && addr < VMALLOC_END;
}
EXPORT_SYMBOL(is_vmalloc_addr);

struct vfree_deferred {
	struct llist_head list;
	struct work_struct wq;
};
static DEFINE_PER_CPU(struct vfree_deferred, vfree_deferred);

static void __vunmap(const void *, int);

static void free_work(struct work_struct *w)
{
	struct vfree_deferred *p = container_of(w, struct vfree_deferred, wq);
	struct llist_node *t, *llnode;

	llist_for_each_safe(llnode, t, llist_del_all(&p->list))
		__vunmap((void *)llnode, 1);
}

/*** Page table manipulation functions ***/
static int vmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pte_t *pte;
	u64 pfn;
	unsigned long size = PAGE_SIZE;

	pfn = phys_addr >> PAGE_SHIFT;
	pte = pte_alloc_kernel_track(pmd, addr, mask);
	if (!pte)
		return -ENOMEM;
	do {
		BUG_ON(!pte_none(*pte));

#ifdef CONFIG_HUGETLB_PAGE
		size = arch_vmap_pte_range_map_size(addr, end, pfn, max_page_shift);
		if (size != PAGE_SIZE) {
			pte_t entry = pfn_pte(pfn, prot);

			entry = pte_mkhuge(entry);
			entry = arch_make_huge_pte(entry, ilog2(size), 0);
			set_huge_pte_at(&init_mm, addr, pte, entry);
			pfn += PFN_DOWN(size);
			continue;
		}
#endif
		set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
		pfn++;
	} while (pte += PFN_DOWN(size), addr += size, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
	return 0;
}

static int vmap_try_huge_pmd(pmd_t *pmd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < PMD_SHIFT)
		return 0;

	if (!arch_vmap_pmd_supported(prot))
		return 0;

	if ((end - addr) != PMD_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, PMD_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, PMD_SIZE))
		return 0;

	if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
		return 0;

	return pmd_set_huge(pmd, phys_addr, prot);
}

static int vmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);

		if (vmap_try_huge_pmd(pmd, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_PMD_MODIFIED;
			continue;
		}

		if (vmap_pte_range(pmd, addr, next, phys_addr, prot, max_page_shift, mask))
			return -ENOMEM;
	} while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
	return 0;
}

static int vmap_try_huge_pud(pud_t *pud, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < PUD_SHIFT)
		return 0;

	if (!arch_vmap_pud_supported(prot))
		return 0;

	if ((end - addr) != PUD_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, PUD_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, PUD_SIZE))
		return 0;

	if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
		return 0;

	return pud_set_huge(pud, phys_addr, prot);
}

static int vmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);

		if (vmap_try_huge_pud(pud, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_PUD_MODIFIED;
			continue;
		}

		if (vmap_pmd_range(pud, addr, next, phys_addr, prot,
					max_page_shift, mask))
			return -ENOMEM;
	} while (pud++, phys_addr += (next - addr), addr = next, addr != end);
	return 0;
}

static int vmap_try_huge_p4d(p4d_t *p4d, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	if (max_page_shift < P4D_SHIFT)
		return 0;

	if (!arch_vmap_p4d_supported(prot))
		return 0;

	if ((end - addr) != P4D_SIZE)
		return 0;

	if (!IS_ALIGNED(addr, P4D_SIZE))
		return 0;

	if (!IS_ALIGNED(phys_addr, P4D_SIZE))
		return 0;

	if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
		return 0;

	return p4d_set_huge(p4d, phys_addr, prot);
}

static int vmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift, pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);

		if (vmap_try_huge_p4d(p4d, addr, next, phys_addr, prot,
					max_page_shift)) {
			*mask |= PGTBL_P4D_MODIFIED;
			continue;
		}

		if (vmap_pud_range(p4d, addr, next, phys_addr, prot,
					max_page_shift, mask))
			return -ENOMEM;
	} while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
	return 0;
}

static int vmap_range_noflush(unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	pgd_t *pgd;
	unsigned long start;
	unsigned long next;
	int err;
	pgtbl_mod_mask mask = 0;

	might_sleep();
	BUG_ON(addr >= end);

	start = addr;
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		err = vmap_p4d_range(pgd, addr, next, phys_addr, prot,
					max_page_shift, &mask);
		if (err)
			break;
	} while (pgd++, phys_addr += (next - addr), addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

	return err;
}

int vmap_range(unsigned long addr, unsigned long end,
			phys_addr_t phys_addr, pgprot_t prot,
			unsigned int max_page_shift)
{
	int err;

	err = vmap_range_noflush(addr, end, phys_addr, prot, max_page_shift);
	flush_cache_vmap(addr, end);

	return err;
}

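/*
 * Usage sketch for vmap_range() (illustrative): an ioremap-style caller
 * that has reserved [addr, end) can map a physical range with, e.g.:
 *
 *	vmap_range(addr, end, phys_addr, PAGE_KERNEL, PMD_SHIFT);
 *
 * PMD-sized leaf entries are installed wherever addr, phys_addr and the
 * remaining length are PMD_SIZE-aligned, and PTE mappings are used
 * everywhere else.
 */
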
static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pte_t *pte;

	pte = pte_offset_kernel(pmd, addr);
	do {
		pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
		WARN_ON(!pte_none(ptent) && !pte_present(ptent));
	} while (pte++, addr += PAGE_SIZE, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
}

static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;
	int cleared;

	pmd = pmd_offset(pud, addr);
	do {
		next = pmd_addr_end(addr, end);

		cleared = pmd_clear_huge(pmd);
		if (cleared || pmd_bad(*pmd))
			*mask |= PGTBL_PMD_MODIFIED;

		if (cleared)
			continue;
		if (pmd_none_or_clear_bad(pmd))
			continue;
		vunmap_pte_range(pmd, addr, next, mask);

		cond_resched();
	} while (pmd++, addr = next, addr != end);
}

static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;
	int cleared;

	pud = pud_offset(p4d, addr);
	do {
		next = pud_addr_end(addr, end);

		cleared = pud_clear_huge(pud);
		if (cleared || pud_bad(*pud))
			*mask |= PGTBL_PUD_MODIFIED;

		if (cleared)
			continue;
		if (pud_none_or_clear_bad(pud))
			continue;
		vunmap_pmd_range(pud, addr, next, mask);
	} while (pud++, addr = next, addr != end);
}

static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
			     pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;
	int cleared;

	p4d = p4d_offset(pgd, addr);
	do {
		next = p4d_addr_end(addr, end);

		cleared = p4d_clear_huge(p4d);
		if (cleared || p4d_bad(*p4d))
			*mask |= PGTBL_P4D_MODIFIED;

		if (cleared)
			continue;
		if (p4d_none_or_clear_bad(p4d))
			continue;
		vunmap_pud_range(p4d, addr, next, mask);
	} while (p4d++, addr = next, addr != end);
}

/*
 * vunmap_range_noflush is similar to vunmap_range, but does not
 * flush caches or TLBs.
 *
 * The caller is responsible for calling flush_cache_vunmap() before calling
 * this function, and flush_tlb_kernel_range() after it has returned
 * successfully (and before the addresses are expected to cause a page fault
 * or be re-mapped for something else, if TLB flushes are being delayed or
 * coalesced).
 *
 * This is an internal function only. Do not use outside mm/.
 */
void vunmap_range_noflush(unsigned long start, unsigned long end)
{
	unsigned long next;
	pgd_t *pgd;
	unsigned long addr = start;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		if (pgd_none_or_clear_bad(pgd))
			continue;
		vunmap_p4d_range(pgd, addr, next, &mask);
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);
}

/**
 * vunmap_range - unmap kernel virtual addresses
 * @addr: start of the VM area to unmap
 * @end: end of the VM area to unmap (non-inclusive)
 *
 * Clears any present PTEs in the virtual address range, flushes TLBs and
 * caches. Any subsequent access to the address before it has been re-mapped
 * is a kernel bug.
 */
void vunmap_range(unsigned long addr, unsigned long end)
{
	flush_cache_vunmap(addr, end);
	vunmap_range_noflush(addr, end);
	flush_tlb_kernel_range(addr, end);
}

static int vmap_pages_pte_range(pmd_t *pmd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pte_t *pte;

	/*
	 * nr is a running index into the array which helps higher level
	 * callers keep track of where we're up to.
	 */

	pte = pte_alloc_kernel_track(pmd, addr, mask);
	if (!pte)
		return -ENOMEM;
	do {
		struct page *page = pages[*nr];

		if (WARN_ON(!pte_none(*pte)))
			return -EBUSY;
		if (WARN_ON(!page))
			return -ENOMEM;
		set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
		(*nr)++;
	} while (pte++, addr += PAGE_SIZE, addr != end);
	*mask |= PGTBL_PTE_MODIFIED;
	return 0;
}

static int vmap_pages_pmd_range(pud_t *pud, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pmd_t *pmd;
	unsigned long next;

	pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
	if (!pmd)
		return -ENOMEM;
	do {
		next = pmd_addr_end(addr, end);
		if (vmap_pages_pte_range(pmd, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pmd++, addr = next, addr != end);
	return 0;
}

static int vmap_pages_pud_range(p4d_t *p4d, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	pud_t *pud;
	unsigned long next;

	pud = pud_alloc_track(&init_mm, p4d, addr, mask);
	if (!pud)
		return -ENOMEM;
	do {
		next = pud_addr_end(addr, end);
		if (vmap_pages_pmd_range(pud, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (pud++, addr = next, addr != end);
	return 0;
}

static int vmap_pages_p4d_range(pgd_t *pgd, unsigned long addr,
		unsigned long end, pgprot_t prot, struct page **pages, int *nr,
		pgtbl_mod_mask *mask)
{
	p4d_t *p4d;
	unsigned long next;

	p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
	if (!p4d)
		return -ENOMEM;
	do {
		next = p4d_addr_end(addr, end);
		if (vmap_pages_pud_range(p4d, addr, next, prot, pages, nr, mask))
			return -ENOMEM;
	} while (p4d++, addr = next, addr != end);
	return 0;
}

static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages)
{
	unsigned long start = addr;
	pgd_t *pgd;
	unsigned long next;
	int err = 0;
	int nr = 0;
	pgtbl_mod_mask mask = 0;

	BUG_ON(addr >= end);
	pgd = pgd_offset_k(addr);
	do {
		next = pgd_addr_end(addr, end);
		if (pgd_bad(*pgd))
			mask |= PGTBL_PGD_MODIFIED;
		err = vmap_pages_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
		if (err)
			return err;
	} while (pgd++, addr = next, addr != end);

	if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
		arch_sync_kernel_mappings(start, end);

	return 0;
}

/*
 * vmap_pages_range_noflush is similar to vmap_pages_range, but does not
 * flush caches.
 *
 * The caller is responsible for calling flush_cache_vmap() after this
 * function returns successfully and before the addresses are accessed.
 *
 * This is an internal function only. Do not use outside mm/.
 */
int vmap_pages_range_noflush(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	unsigned int i, nr = (end - addr) >> PAGE_SHIFT;

	WARN_ON(page_shift < PAGE_SHIFT);

	if (!IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMALLOC) ||
			page_shift == PAGE_SHIFT)
		return vmap_small_pages_range_noflush(addr, end, prot, pages);

	for (i = 0; i < nr; i += 1U << (page_shift - PAGE_SHIFT)) {
		int err;

		err = vmap_range_noflush(addr, addr + (1UL << page_shift),
					__pa(page_address(pages[i])), prot,
					page_shift);
		if (err)
			return err;

		addr += 1UL << page_shift;
	}

	return 0;
}

/**
 * vmap_pages_range - map pages to a kernel virtual address
 * @addr: start of the VM area to map
 * @end: end of the VM area to map (non-inclusive)
 * @prot: page protection flags to use
 * @pages: pages to map (always PAGE_SIZE pages)
 * @page_shift: maximum shift that the pages may be mapped with, @pages must
 * be aligned and contiguous up to at least this shift.
 *
 * RETURNS:
 * 0 on success, -errno on failure.
 */
static int vmap_pages_range(unsigned long addr, unsigned long end,
		pgprot_t prot, struct page **pages, unsigned int page_shift)
{
	int err;

	err = vmap_pages_range_noflush(addr, end, prot, pages, page_shift);
	flush_cache_vmap(addr, end);
	return err;
}

int is_vmalloc_or_module_addr(const void *x)
{
	/*
	 * ARM, x86-64 and sparc64 put modules in a special place,
	 * and fall back on vmalloc() if that fails. Others
	 * just put it in the vmalloc space.
	 */
#if defined(CONFIG_MODULES) && defined(MODULES_VADDR)
	unsigned long addr = (unsigned long)x;
	if (addr >= MODULES_VADDR && addr < MODULES_END)
		return 1;
#endif
	return is_vmalloc_addr(x);
}

/*
 * Walk a vmap address to the struct page it maps. Huge vmap mappings will
 * return the tail page that corresponds to the base page address, which
 * matches small vmap mappings.
 */
struct page *vmalloc_to_page(const void *vmalloc_addr)
{
	unsigned long addr = (unsigned long) vmalloc_addr;
	struct page *page = NULL;
	pgd_t *pgd = pgd_offset_k(addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *ptep, pte;

	/*
	 * XXX we might need to change this if we add VIRTUAL_BUG_ON for
	 * architectures that do not vmalloc module space
	 */
	VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr));

	if (pgd_none(*pgd))
		return NULL;
	if (WARN_ON_ONCE(pgd_leaf(*pgd)))
		return NULL; /* XXX: no allowance for huge pgd */
	if (WARN_ON_ONCE(pgd_bad(*pgd)))
		return NULL;

	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d))
		return NULL;
	if (p4d_leaf(*p4d))
		return p4d_page(*p4d) + ((addr & ~P4D_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(p4d_bad(*p4d)))
		return NULL;

	pud = pud_offset(p4d, addr);
	if (pud_none(*pud))
		return NULL;
	if (pud_leaf(*pud))
		return pud_page(*pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(pud_bad(*pud)))
		return NULL;

	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd))
		return NULL;
	if (pmd_leaf(*pmd))
		return pmd_page(*pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
	if (WARN_ON_ONCE(pmd_bad(*pmd)))
		return NULL;

	ptep = pte_offset_map(pmd, addr);
	pte = *ptep;
	if (pte_present(pte))
		page = pte_page(pte);
	pte_unmap(ptep);

	return page;
}
EXPORT_SYMBOL(vmalloc_to_page);

/*
 * Map a vmalloc()-space virtual address to the physical page frame number.
 */
unsigned long vmalloc_to_pfn(const void *vmalloc_addr)
{
	return page_to_pfn(vmalloc_to_page(vmalloc_addr));
}
EXPORT_SYMBOL(vmalloc_to_pfn);


/*** Global kva allocator ***/

#define DEBUG_AUGMENT_PROPAGATE_CHECK 0
#define DEBUG_AUGMENT_LOWEST_MATCH_CHECK 0

static DEFINE_SPINLOCK(vmap_area_lock);
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
static struct rb_root vmap_area_root = RB_ROOT;
static bool vmap_initialized __read_mostly;

static struct rb_root purge_vmap_area_root = RB_ROOT;
static LIST_HEAD(purge_vmap_area_list);
static DEFINE_SPINLOCK(purge_vmap_area_lock);

/*
 * This kmem_cache is used for vmap_area objects. Instead of
 * allocating from slab we reuse an object from this cache to
 * make things faster. Especially in "no edge" splitting of
 * free block.
 */
static struct kmem_cache *vmap_area_cachep;

/*
 * This linked list is used together with free_vmap_area_root.
 * It gives O(1) access to prev/next to perform fast coalescing.
 */
static LIST_HEAD(free_vmap_area_list);

/*
 * This augmented red-black tree represents the free vmap space.
 * All vmap_area objects in this tree are sorted by va->va_start
 * address. It is used for allocation and merging when a vmap
 * object is released.
 *
 * Each vmap_area node contains the maximum available free block
 * of its sub-tree, right or left. Therefore it is possible to
 * find the lowest match of a free area.
 */
static struct rb_root free_vmap_area_root = RB_ROOT;

/*
 * Preload a CPU with one object for the "no edge" split case. The
 * aim is to get rid of allocations from the atomic context, thus
 * to use more permissive allocation masks.
 */
static DEFINE_PER_CPU(struct vmap_area *, ne_fit_preload_node);

static __always_inline unsigned long
va_size(struct vmap_area *va)
{
	return (va->va_end - va->va_start);
}

static __always_inline unsigned long
get_subtree_max_size(struct rb_node *node)
{
	struct vmap_area *va;

	va = rb_entry_safe(node, struct vmap_area, rb_node);
	return va ? va->subtree_max_size : 0;
}

/*
 * Gets called when the node is removed and the tree is rotated.
 */
static __always_inline unsigned long
compute_subtree_max_size(struct vmap_area *va)
{
	return max3(va_size(va),
		get_subtree_max_size(va->rb_node.rb_left),
		get_subtree_max_size(va->rb_node.rb_right));
}

RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
	struct vmap_area, rb_node, unsigned long, subtree_max_size, va_size)

static void purge_vmap_area_lazy(void);
static BLOCKING_NOTIFIER_HEAD(vmap_notify_list);
static unsigned long lazy_max_pages(void);

static atomic_long_t nr_vmalloc_pages;

unsigned long vmalloc_nr_pages(void)
{
	return atomic_long_read(&nr_vmalloc_pages);
}

static struct vmap_area *__find_vmap_area(unsigned long addr)
{
	struct rb_node *n = vmap_area_root.rb_node;

	while (n) {
		struct vmap_area *va;

		va = rb_entry(n, struct vmap_area, rb_node);
		if (addr < va->va_start)
			n = n->rb_left;
		else if (addr >= va->va_end)
			n = n->rb_right;
		else
			return va;
	}

	return NULL;
}

/*
 * This function returns the address of the parent node and its
 * left or right link for further processing.
 *
 * Otherwise NULL is returned. In that case all further steps
 * regarding inserting of a conflicting (overlapping) range have
 * to be declined and it is actually considered a bug.
 */
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
	struct rb_root *root, struct rb_node *from,
	struct rb_node **parent)
{
	struct vmap_area *tmp_va;
	struct rb_node **link;

	if (root) {
		link = &root->rb_node;
		if (unlikely(!*link)) {
			*parent = NULL;
			return link;
		}
	} else {
		link = &from;
	}

	/*
	 * Go to the bottom of the tree. When we hit the last point
	 * we end up with the parent rb_node and the correct direction,
	 * named "link" here, where the new va->rb_node will be attached.
	 */
	do {
		tmp_va = rb_entry(*link, struct vmap_area, rb_node);

		/*
		 * During the traversal we also do some sanity checks.
		 * Trigger a warning if a partial (left/right) or full
		 * overlap is detected.
		 */
		if (va->va_start < tmp_va->va_end &&
				va->va_end <= tmp_va->va_start)
			link = &(*link)->rb_left;
		else if (va->va_end > tmp_va->va_start &&
				va->va_start >= tmp_va->va_end)
			link = &(*link)->rb_right;
		else {
			WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
				va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);

			return NULL;
		}
	} while (*link);

	*parent = &tmp_va->rb_node;
	return link;
}

static __always_inline struct list_head *
get_va_next_sibling(struct rb_node *parent, struct rb_node **link)
{
	struct list_head *list;

	if (unlikely(!parent))
		/*
		 * The red-black tree where we try to find VA neighbors
		 * before merging or inserting is empty, i.e. it means
		 * there is no free vmap space. Normally it does not
		 * happen but we handle this case anyway.
		 */
		return NULL;

	list = &rb_entry(parent, struct vmap_area, rb_node)->list;
	return (&parent->rb_right == link ? list->next : list);
}

static __always_inline void
link_va(struct vmap_area *va, struct rb_root *root,
	struct rb_node *parent, struct rb_node **link, struct list_head *head)
{
	/*
	 * VA is still not in the list, but we can
	 * identify its future previous list_head node.
	 */
	if (likely(parent)) {
		head = &rb_entry(parent, struct vmap_area, rb_node)->list;
		if (&parent->rb_right != link)
			head = head->prev;
	}

	/* Insert to the rb-tree */
	rb_link_node(&va->rb_node, parent, link);
	if (root == &free_vmap_area_root) {
		/*
		 * Some explanation here. Just perform simple insertion
		 * to the tree. We do not set va->subtree_max_size to
		 * its current size before calling rb_insert_augmented().
		 * It is because we populate the tree from the bottom
		 * towards the parent levels once the node _is_ in the tree.
		 *
		 * Therefore we set subtree_max_size to zero after insertion,
		 * to let __augment_tree_propagate_from() put everything into
		 * the correct order later on.
		 */
		rb_insert_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
		va->subtree_max_size = 0;
	} else {
		rb_insert_color(&va->rb_node, root);
	}

	/* Address-sort this list */
	list_add(&va->list, head);
}

static __always_inline void
unlink_va(struct vmap_area *va, struct rb_root *root)
{
	if (WARN_ON(RB_EMPTY_NODE(&va->rb_node)))
		return;

	if (root == &free_vmap_area_root)
		rb_erase_augmented(&va->rb_node,
			root, &free_vmap_area_rb_augment_cb);
	else
		rb_erase(&va->rb_node, root);

	list_del(&va->list);
	RB_CLEAR_NODE(&va->rb_node);
}

#if DEBUG_AUGMENT_PROPAGATE_CHECK
static void
augment_tree_propagate_check(void)
{
	struct vmap_area *va;
	unsigned long computed_size;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		computed_size = compute_subtree_max_size(va);
		if (computed_size != va->subtree_max_size)
			pr_emerg("tree is corrupted: %lu, %lu\n",
				va_size(va), va->subtree_max_size);
	}
}
#endif

/*
 * This function populates subtree_max_size from the bottom towards
 * upper levels, starting from the VA node. The propagation must be
 * done when the VA size is modified by changing its va_start/va_end,
 * or when a new VA is inserted into the tree.
 *
 * It means that __augment_tree_propagate_from() must be called:
 * - After a VA has been inserted into the tree (free path);
 * - After a VA has been shrunk (allocation path);
 * - After a VA has been increased (merging path).
 *
 * Please note that this does not mean that upper parent nodes
 * and their subtree_max_size are recalculated all the way up
 * to the root node.
 *
 *       4--8
 *        /\
 *       /  \
 *      /    \
 *    2--2  8--8
 *
 * For example, if we modify the node 4, shrinking it to 2, then
 * no modification is required. If we shrink the node 2 to 1, only
 * its subtree_max_size is updated and set to 1. If we shrink the
 * node 8 to 6, then its subtree_max_size is set to 6 and the parent
 * node becomes 4--6.
 */
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
	/*
	 * Populate the tree from the bottom towards the root until
	 * the calculated maximum available size of the checked node
	 * is equal to its current one.
	 */
	free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);

#if DEBUG_AUGMENT_PROPAGATE_CHECK
	augment_tree_propagate_check();
#endif
}

static void
insert_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	link = find_va_links(va, root, NULL, &parent);
	if (link)
		link_va(va, root, parent, link, head);
}

static void
insert_vmap_area_augment(struct vmap_area *va,
	struct rb_node *from, struct rb_root *root,
	struct list_head *head)
{
	struct rb_node **link;
	struct rb_node *parent;

	if (from)
		link = find_va_links(va, NULL, from, &parent);
	else
		link = find_va_links(va, root, NULL, &parent);

	if (link) {
		link_va(va, root, parent, link, head);
		augment_tree_propagate_from(va);
	}
}

/*
 * Merge a de-allocated chunk of VA memory with the previous
 * and next free blocks. If coalescing is not done, a new
 * free area is inserted. If the VA has been merged, it is
 * freed.
 *
 * Please note that it can return NULL in case of overlapping
 * ranges, after issuing a WARN() report. Although that is buggy
 * behaviour, the system can stay alive and keep going.
 */
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	struct vmap_area *sibling;
	struct list_head *next;
	struct rb_node **link;
	struct rb_node *parent;
	bool merged = false;

	/*
	 * Find a place in the tree where VA potentially will be
	 * inserted, unless it is merged with its sibling/siblings.
	 */
	link = find_va_links(va, root, NULL, &parent);
	if (!link)
		return NULL;

	/*
	 * Get next node of VA to check if merging can be done.
	 */
	next = get_va_next_sibling(parent, link);
	if (unlikely(next == NULL))
		goto insert;

	/*
	 * start            end
	 * |                |
	 * |<------VA------>|<-----Next----->|
	 *                  |                |
	 *                  start            end
	 */
	if (next != head) {
		sibling = list_entry(next, struct vmap_area, list);
		if (sibling->va_start == va->va_end) {
			sibling->va_start = va->va_start;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

	/*
	 * start            end
	 * |                |
	 * |<-----Prev----->|<------VA------>|
	 *                  |                |
	 *                  start            end
	 */
	if (next->prev != head) {
		sibling = list_entry(next->prev, struct vmap_area, list);
		if (sibling->va_end == va->va_start) {
			/*
			 * If both neighbors are coalesced, it is important
			 * to unlink the "next" node first, followed by merging
			 * with "previous" one. Otherwise the tree might not be
			 * fully populated if a sibling's augmented value is
			 * "normalized" because of rotation operations.
			 */
			if (merged)
				unlink_va(va, root);

			sibling->va_end = va->va_end;

			/* Free vmap_area object. */
			kmem_cache_free(vmap_area_cachep, va);

			/* Point to the new merged area. */
			va = sibling;
			merged = true;
		}
	}

insert:
	if (!merged)
		link_va(va, root, parent, link, head);

	return va;
}

static __always_inline struct vmap_area *
merge_or_add_vmap_area_augment(struct vmap_area *va,
	struct rb_root *root, struct list_head *head)
{
	va = merge_or_add_vmap_area(va, root, head);
	if (va)
		augment_tree_propagate_from(va);

	return va;
}

static __always_inline bool
is_within_this_va(struct vmap_area *va, unsigned long size,
	unsigned long align, unsigned long vstart)
{
	unsigned long nva_start_addr;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Can be overflowed due to big size or alignment. */
	if (nva_start_addr + size < nva_start_addr ||
			nva_start_addr < vstart)
		return false;

	return (nva_start_addr + size <= va->va_end);
}

/*
 * Find the first free block (lowest start address) in the tree
 * that will accomplish the request corresponding to the passed
 * parameters.
 */
static __always_inline struct vmap_area *
find_vmap_lowest_match(unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;
	struct rb_node *node;
	unsigned long length;

	/* Start from the root. */
	node = free_vmap_area_root.rb_node;

	/* Adjust the search size for alignment overhead. */
	length = size + align - 1;

	while (node) {
		va = rb_entry(node, struct vmap_area, rb_node);

		if (get_subtree_max_size(node->rb_left) >= length &&
				vstart < va->va_start) {
			node = node->rb_left;
		} else {
			if (is_within_this_va(va, size, align, vstart))
				return va;

			/*
			 * It does not make sense to go deeper into the right
			 * sub-tree if it does not have a free block that is
			 * equal to or bigger than the requested search length.
			 */
			if (get_subtree_max_size(node->rb_right) >= length) {
				node = node->rb_right;
				continue;
			}

			/*
			 * OK. We roll back and find the first right sub-tree
			 * that will satisfy the search criteria. It can happen
			 * only once due to the "vstart" restriction.
			 */
			while ((node = rb_parent(node))) {
				va = rb_entry(node, struct vmap_area, rb_node);
				if (is_within_this_va(va, size, align, vstart))
					return va;

				if (get_subtree_max_size(node->rb_right) >= length &&
						vstart <= va->va_start) {
					node = node->rb_right;
					break;
				}
			}
		}
	}

	return NULL;
}

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
#include <linux/random.h>

static struct vmap_area *
find_vmap_lowest_linear_match(unsigned long size,
	unsigned long align, unsigned long vstart)
{
	struct vmap_area *va;

	list_for_each_entry(va, &free_vmap_area_list, list) {
		if (!is_within_this_va(va, size, align, vstart))
			continue;

		return va;
	}

	return NULL;
}

static void
find_vmap_lowest_match_check(unsigned long size)
{
	struct vmap_area *va_1, *va_2;
	unsigned long vstart;
	unsigned int rnd;

	get_random_bytes(&rnd, sizeof(rnd));
	vstart = VMALLOC_START + rnd;

	va_1 = find_vmap_lowest_match(size, 1, vstart);
	va_2 = find_vmap_lowest_linear_match(size, 1, vstart);

	if (va_1 != va_2)
		pr_emerg("not lowest: t: 0x%p, l: 0x%p, v: 0x%lx\n",
			va_1, va_2, vstart);
}
#endif

enum fit_type {
	NOTHING_FIT = 0,
	FL_FIT_TYPE = 1,	/* full fit */
	LE_FIT_TYPE = 2,	/* left edge fit */
	RE_FIT_TYPE = 3,	/* right edge fit */
	NE_FIT_TYPE = 4		/* no edge fit */
};

static __always_inline enum fit_type
classify_va_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size)
{
	enum fit_type type;

	/* Check if it is within VA. */
	if (nva_start_addr < va->va_start ||
			nva_start_addr + size > va->va_end)
		return NOTHING_FIT;

	/* Now classify. */
	if (va->va_start == nva_start_addr) {
		if (va->va_end == nva_start_addr + size)
			type = FL_FIT_TYPE;
		else
			type = LE_FIT_TYPE;
	} else if (va->va_end == nva_start_addr + size) {
		type = RE_FIT_TYPE;
	} else {
		type = NE_FIT_TYPE;
	}

	return type;
}

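/*
 * Example for classify_va_fit_type(): given a free VA spanning
 * [0x1000, 0x9000), a request placed at [0x1000, 0x9000) is FL_FIT_TYPE,
 * [0x1000, 0x5000) is LE_FIT_TYPE, [0x5000, 0x9000) is RE_FIT_TYPE and
 * [0x3000, 0x5000) is NE_FIT_TYPE.
 */
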
static __always_inline int
adjust_va_to_fit_type(struct vmap_area *va,
	unsigned long nva_start_addr, unsigned long size,
	enum fit_type type)
{
	struct vmap_area *lva = NULL;

	if (type == FL_FIT_TYPE) {
		/*
		 * No need to split VA, it fully fits.
		 *
		 * |               |
		 * V      NVA      V
		 * |---------------|
		 */
		unlink_va(va, &free_vmap_area_root);
		kmem_cache_free(vmap_area_cachep, va);
	} else if (type == LE_FIT_TYPE) {
		/*
		 * Split left edge of fit VA.
		 *
		 * |       |
		 * V  NVA  V   R
		 * |-------|-------|
		 */
		va->va_start += size;
	} else if (type == RE_FIT_TYPE) {
		/*
		 * Split right edge of fit VA.
		 *
		 *         |       |
		 *     L   V  NVA  V
		 * |-------|-------|
		 */
		va->va_end = nva_start_addr;
	} else if (type == NE_FIT_TYPE) {
		/*
		 * Split no edge of fit VA.
		 *
		 *     |       |
		 *   L V  NVA  V R
		 * |---|-------|---|
		 */
		lva = __this_cpu_xchg(ne_fit_preload_node, NULL);
		if (unlikely(!lva)) {
			/*
			 * For the percpu allocator we do not do any pre-allocation
			 * and leave it as it is. The reason is that it most likely
			 * never ends up with NE_FIT_TYPE splitting. In case of
			 * percpu allocations, offsets and sizes are aligned to a
			 * fixed align request, i.e. RE_FIT_TYPE and FL_FIT_TYPE
			 * are its main fitting cases.
			 *
			 * There are a few exceptions though; one example is the
			 * first allocation (early boot up) when we have "one"
			 * big free space that has to be split.
			 *
			 * Also we can hit this path in case of regular "vmap"
			 * allocations, if "this" current CPU was not preloaded.
			 * See the comment in alloc_vmap_area() for why. If so,
			 * GFP_NOWAIT is used instead to get an extra object for
			 * split purposes. That is rare and most of the time does
			 * not occur.
			 *
			 * What happens if the allocation fails? Basically, the
			 * "overflow" path is triggered to purge lazily freed
			 * areas to free some memory, then the "retry" path is
			 * triggered to repeat one more time. See more details
			 * in the alloc_vmap_area() function.
			 */
			lva = kmem_cache_alloc(vmap_area_cachep, GFP_NOWAIT);
			if (!lva)
				return -1;
		}

		/*
		 * Build the remainder.
		 */
		lva->va_start = va->va_start;
		lva->va_end = nva_start_addr;

		/*
		 * Shrink this VA to remaining size.
		 */
		va->va_start = nva_start_addr + size;
	} else {
		return -1;
	}

	if (type != FL_FIT_TYPE) {
		augment_tree_propagate_from(va);

		if (lva)	/* type == NE_FIT_TYPE */
			insert_vmap_area_augment(lva, &va->rb_node,
				&free_vmap_area_root, &free_vmap_area_list);
	}

	return 0;
}

/*
 * Returns the start address of the newly allocated area on success.
 * Otherwise "vend" is returned to indicate failure.
 */
static __always_inline unsigned long
__alloc_vmap_area(unsigned long size, unsigned long align,
	unsigned long vstart, unsigned long vend)
{
	unsigned long nva_start_addr;
	struct vmap_area *va;
	enum fit_type type;
	int ret;

	va = find_vmap_lowest_match(size, align, vstart);
	if (unlikely(!va))
		return vend;

	if (va->va_start > vstart)
		nva_start_addr = ALIGN(va->va_start, align);
	else
		nva_start_addr = ALIGN(vstart, align);

	/* Check the "vend" restriction. */
	if (nva_start_addr + size > vend)
		return vend;

	/* Classify what we have found. */
	type = classify_va_fit_type(va, nva_start_addr, size);
	if (WARN_ON_ONCE(type == NOTHING_FIT))
		return vend;

	/* Update the free vmap_area. */
	ret = adjust_va_to_fit_type(va, nva_start_addr, size, type);
	if (ret)
		return vend;

#if DEBUG_AUGMENT_LOWEST_MATCH_CHECK
	find_vmap_lowest_match_check(size);
#endif

	return nva_start_addr;
}

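/*
 * Example walk-through for __alloc_vmap_area(): a request of size
 * 2 * PAGE_SIZE with PAGE_SIZE alignment that lands inside a free VA
 * [A, A + 16 * PAGE_SIZE), where A >= vstart and A is suitably aligned,
 * picks nva_start_addr == A, classifies the fit as LE_FIT_TYPE and
 * shrinks the free VA to [A + 2 * PAGE_SIZE, A + 16 * PAGE_SIZE).
 */
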
/*
 * Free a region of KVA allocated by alloc_vmap_area
 */
static void free_vmap_area(struct vmap_area *va)
{
	/*
	 * Remove from the busy tree/list.
	 */
	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	/*
	 * Insert/Merge it back to the free tree/list.
	 */
	spin_lock(&free_vmap_area_lock);
	merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
	spin_unlock(&free_vmap_area_lock);
}

static inline void
preload_this_cpu_lock(spinlock_t *lock, gfp_t gfp_mask, int node)
{
	struct vmap_area *va = NULL;

	/*
	 * Preload this CPU with one extra vmap_area object. It is used
	 * when the fit type of a free area is NE_FIT_TYPE. It guarantees
	 * that a CPU that does an allocation is preloaded.
	 *
	 * We do it in a non-atomic context, thus it allows us to use more
	 * permissive allocation masks and to be more stable under low
	 * memory conditions and high memory pressure.
	 */
	if (!this_cpu_read(ne_fit_preload_node))
		va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);

	spin_lock(lock);

	if (va && __this_cpu_cmpxchg(ne_fit_preload_node, NULL, va))
		kmem_cache_free(vmap_area_cachep, va);
}

/*
 * Allocate a region of KVA of the specified size and alignment, within
 * [vstart, vend).
 */
static struct vmap_area *alloc_vmap_area(unsigned long size,
				unsigned long align,
				unsigned long vstart, unsigned long vend,
				int node, gfp_t gfp_mask)
{
	struct vmap_area *va;
	unsigned long addr;
	int purged = 0;
	int ret;

	BUG_ON(!size);
	BUG_ON(offset_in_page(size));
	BUG_ON(!is_power_of_2(align));

	if (unlikely(!vmap_initialized))
		return ERR_PTR(-EBUSY);

	might_sleep();
	gfp_mask = gfp_mask & GFP_RECLAIM_MASK;

	va = kmem_cache_alloc_node(vmap_area_cachep, gfp_mask, node);
	if (unlikely(!va))
		return ERR_PTR(-ENOMEM);

	/*
	 * Only scan the relevant parts containing pointers to other objects
	 * to avoid false negatives.
	 */
	kmemleak_scan_area(&va->rb_node, SIZE_MAX, gfp_mask);

retry:
	preload_this_cpu_lock(&free_vmap_area_lock, gfp_mask, node);
	addr = __alloc_vmap_area(size, align, vstart, vend);
	spin_unlock(&free_vmap_area_lock);

	/*
	 * If the allocation fails, the "vend" address is
	 * returned. Therefore trigger the overflow path.
	 */
	if (unlikely(addr == vend))
		goto overflow;

	va->va_start = addr;
	va->va_end = addr + size;
	va->vm = NULL;

	spin_lock(&vmap_area_lock);
	insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
	spin_unlock(&vmap_area_lock);

	BUG_ON(!IS_ALIGNED(va->va_start, align));
	BUG_ON(va->va_start < vstart);
	BUG_ON(va->va_end > vend);

	ret = kasan_populate_vmalloc(addr, size);
	if (ret) {
		free_vmap_area(va);
		return ERR_PTR(ret);
	}

	return va;

overflow:
	if (!purged) {
		purge_vmap_area_lazy();
		purged = 1;
		goto retry;
	}

	if (gfpflags_allow_blocking(gfp_mask)) {
		unsigned long freed = 0;
		blocking_notifier_call_chain(&vmap_notify_list, 0, &freed);
		if (freed > 0) {
			purged = 0;
			goto retry;
		}
	}

	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit())
		pr_warn("vmap allocation for size %lu failed: use vmalloc=<size> to increase size\n",
			size);

	kmem_cache_free(vmap_area_cachep, va);
	return ERR_PTR(-EBUSY);
}

int register_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_register(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(register_vmap_purge_notifier);

int unregister_vmap_purge_notifier(struct notifier_block *nb)
{
	return blocking_notifier_chain_unregister(&vmap_notify_list, nb);
}
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);

/*
 * lazy_max_pages is the maximum amount of virtual address space we gather up
 * before attempting to purge with a TLB flush.
 *
 * There is a tradeoff here: a larger number will cover more kernel page tables
 * and take slightly longer to purge, but it will linearly reduce the number of
 * global TLB flushes that must be performed. It would seem natural to scale
 * this number up linearly with the number of CPUs (because vmapping activity
 * could also scale linearly with the number of CPUs), however it is likely
 * that in practice, workloads might be constrained in other ways that mean
 * vmap activity will not scale linearly with CPUs. Also, I want to be
 * conservative and not introduce a big latency on huge systems, so go with
 * a less aggressive log scale. It will still be an improvement over the old
 * code, and it will be simple to change the scale factor if we find that it
 * becomes a problem on bigger systems.
 */
static unsigned long lazy_max_pages(void)
{
	unsigned int log;

	log = fls(num_online_cpus());

	return log * (32UL * 1024 * 1024 / PAGE_SIZE);
}

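/*
 * Worked example for lazy_max_pages(): on a machine with 16 online CPUs,
 * fls(16) == 5, so with 4KB pages up to 5 * 32MB == 160MB of lazily freed
 * virtual address space (40960 pages) may accumulate before a purge is
 * attempted.
 */
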
static atomic_long_t vmap_lazy_nr = ATOMIC_LONG_INIT(0);

/*
 * Serialize vmap purging. There is no actual critical section protected
 * by this lock, but we want to avoid concurrent calls for performance
 * reasons and to make pcpu_get_vm_areas more deterministic.
 */
static DEFINE_MUTEX(vmap_purge_lock);

/* for per-CPU blocks */
static void purge_fragmented_blocks_allcpus(void);

/*
 * called before a call to iounmap() if the caller wants vm_area_struct's
 * immediately freed.
 */
void set_iounmap_nonlazy(void)
{
	atomic_long_set(&vmap_lazy_nr, lazy_max_pages()+1);
}

/*
 * Purges all lazily-freed vmap areas.
 */
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
	unsigned long resched_threshold;
	struct list_head local_pure_list;
	struct vmap_area *va, *n_va;

	lockdep_assert_held(&vmap_purge_lock);

	spin_lock(&purge_vmap_area_lock);
	purge_vmap_area_root = RB_ROOT;
	list_replace_init(&purge_vmap_area_list, &local_pure_list);
	spin_unlock(&purge_vmap_area_lock);

	if (unlikely(list_empty(&local_pure_list)))
		return false;

	start = min(start,
		list_first_entry(&local_pure_list,
			struct vmap_area, list)->va_start);

	end = max(end,
		list_last_entry(&local_pure_list,
			struct vmap_area, list)->va_end);

	flush_tlb_kernel_range(start, end);
	resched_threshold = lazy_max_pages() << 1;

	spin_lock(&free_vmap_area_lock);
	list_for_each_entry_safe(va, n_va, &local_pure_list, list) {
		unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
		unsigned long orig_start = va->va_start;
		unsigned long orig_end = va->va_end;

		/*
		 * Finally insert or merge lazily-freed area. It is
		 * detached and there is no need to "unlink" it from
		 * anything.
		 */
		va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
				&free_vmap_area_list);

		if (!va)
			continue;

		if (is_vmalloc_or_module_addr((void *)orig_start))
			kasan_release_vmalloc(orig_start, orig_end,
					      va->va_start, va->va_end);

		atomic_long_sub(nr, &vmap_lazy_nr);

		if (atomic_long_read(&vmap_lazy_nr) < resched_threshold)
			cond_resched_lock(&free_vmap_area_lock);
	}
	spin_unlock(&free_vmap_area_lock);
	return true;
}

/*
 * Kick off a purge of the outstanding lazy areas. Don't bother if somebody
 * is already purging.
 */
static void try_purge_vmap_area_lazy(void)
{
	if (mutex_trylock(&vmap_purge_lock)) {
		__purge_vmap_area_lazy(ULONG_MAX, 0);
		mutex_unlock(&vmap_purge_lock);
	}
}

/*
 * Kick off a purge of the outstanding lazy areas.
 */
static void purge_vmap_area_lazy(void)
{
	mutex_lock(&vmap_purge_lock);
	purge_fragmented_blocks_allcpus();
	__purge_vmap_area_lazy(ULONG_MAX, 0);
	mutex_unlock(&vmap_purge_lock);
}

/*
 * Free a vmap area; the caller must ensure that the area has been unmapped
 * and that flush_cache_vunmap() has been called for the correct range
 * beforehand.
 */
static void free_vmap_area_noflush(struct vmap_area *va)
{
	unsigned long nr_lazy;

	spin_lock(&vmap_area_lock);
	unlink_va(va, &vmap_area_root);
	spin_unlock(&vmap_area_lock);

	nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
				PAGE_SHIFT, &vmap_lazy_nr);

	/*
	 * Merge it with, or place it on, the purge tree/list.
	 */
	spin_lock(&purge_vmap_area_lock);
	merge_or_add_vmap_area(va,
		&purge_vmap_area_root, &purge_vmap_area_list);
	spin_unlock(&purge_vmap_area_lock);

	/* After this point, we may free va at any time */
	if (unlikely(nr_lazy > lazy_max_pages()))
		try_purge_vmap_area_lazy();
}

/*
 * Free and unmap a vmap area
 */
static void free_unmap_vmap_area(struct vmap_area *va)
{
	flush_cache_vunmap(va->va_start, va->va_end);
	vunmap_range_noflush(va->va_start, va->va_end);
	if (debug_pagealloc_enabled_static())
		flush_tlb_kernel_range(va->va_start, va->va_end);

	free_vmap_area_noflush(va);
}

static struct vmap_area *find_vmap_area(unsigned long addr)
{
	struct vmap_area *va;

	spin_lock(&vmap_area_lock);
	va = __find_vmap_area(addr);
	spin_unlock(&vmap_area_lock);

	return va;
}

/*** Per cpu kva allocator ***/

/*
 * vmap space is limited especially on 32 bit architectures. Ensure there is
 * room for at least 16 percpu vmap blocks per CPU.
 */
/*
 * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able
 * to #define VMALLOC_SPACE		(VMALLOC_END-VMALLOC_START). Guess
 * instead (we just need a rough idea)
 */
#if BITS_PER_LONG == 32
#define VMALLOC_SPACE		(128UL*1024*1024)
#else
#define VMALLOC_SPACE		(128UL*1024*1024*1024)
#endif

#define VMALLOC_PAGES		(VMALLOC_SPACE / PAGE_SIZE)
#define VMAP_MAX_ALLOC		BITS_PER_LONG	/* 256K with 4K pages */
#define VMAP_BBMAP_BITS_MAX	1024	/* 4MB with 4K pages */
#define VMAP_BBMAP_BITS_MIN	(VMAP_MAX_ALLOC*2)
#define VMAP_MIN(x, y)		((x) < (y) ? (x) : (y)) /* can't use min() */
#define VMAP_MAX(x, y)		((x) > (y) ? (x) : (y)) /* can't use max() */
#define VMAP_BBMAP_BITS		\
		VMAP_MIN(VMAP_BBMAP_BITS_MAX,	\
		VMAP_MAX(VMAP_BBMAP_BITS_MIN,	\
			VMALLOC_PAGES / roundup_pow_of_two(NR_CPUS) / 16))

#define VMAP_BLOCK_SIZE		(VMAP_BBMAP_BITS * PAGE_SIZE)

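/*
 * Worked example (illustrative, assuming 4KB pages and NR_CPUS == 64):
 * on 64-bit, VMALLOC_PAGES / 64 / 16 == 32768, so VMAP_BBMAP_BITS clamps
 * to its maximum of 1024 and VMAP_BLOCK_SIZE is 4MB. On 32-bit the same
 * calculation yields 32, which clamps to VMAP_BBMAP_BITS_MIN (64), i.e.
 * 256KB blocks.
 */
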
struct vmap_block_queue {
	spinlock_t lock;
	struct list_head free;
};

struct vmap_block {
	spinlock_t lock;
	struct vmap_area *va;
	unsigned long free, dirty;
	unsigned long dirty_min, dirty_max; /*< dirty range */
	struct list_head free_list;
	struct rcu_head rcu_head;
	struct list_head purge;
};

/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

/*
 * XArray of vmap blocks, indexed by address, to quickly find a vmap block
 * in the free path. Could get rid of this if we change the API to return a
 * "cookie" from alloc, to be passed to free. But no big deal yet.
 */
static DEFINE_XARRAY(vmap_blocks);

1810/*
1811 * We should probably have a fallback mechanism to allocate virtual memory
1812 * out of partially filled vmap blocks. However vmap block sizing should be
1813 * fairly reasonable according to the vmalloc size, so it shouldn't be a
1814 * big problem.
1815 */
1816
1817static unsigned long addr_to_vb_idx(unsigned long addr)
1818{
1819 addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1);
1820 addr /= VMAP_BLOCK_SIZE;
1821 return addr;
1822}
1823
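/*
 * Note on addr_to_vb_idx(): the index is computed relative to VMALLOC_START
 * rounded down to a VMAP_BLOCK_SIZE boundary, so every address inside a
 * given vmap block (including its start address) maps to the same XArray
 * index. That is what lets the free path look a block up from any address
 * within it.
 */
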
cf725ce2
RP
1824static void *vmap_block_vaddr(unsigned long va_start, unsigned long pages_off)
1825{
1826 unsigned long addr;
1827
1828 addr = va_start + (pages_off << PAGE_SHIFT);
1829 BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(va_start));
1830 return (void *)addr;
1831}
1832
1833/**
 1834 * new_vmap_block - allocate a new vmap_block and occupy 2^order pages in it
 1835 * (the number of pages cannot exceed VMAP_BBMAP_BITS)
 1836 * @order: how many 2^order pages should be occupied in the newly allocated block
1837 * @gfp_mask: flags for the page level allocator
1838 *
a862f68a 1839 * Return: virtual address in a newly allocated block or ERR_PTR(-errno)
cf725ce2
RP
1840 */
1841static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
db64fe02
NP
1842{
1843 struct vmap_block_queue *vbq;
1844 struct vmap_block *vb;
1845 struct vmap_area *va;
1846 unsigned long vb_idx;
1847 int node, err;
cf725ce2 1848 void *vaddr;
db64fe02
NP
1849
1850 node = numa_node_id();
1851
1852 vb = kmalloc_node(sizeof(struct vmap_block),
1853 gfp_mask & GFP_RECLAIM_MASK, node);
1854 if (unlikely(!vb))
1855 return ERR_PTR(-ENOMEM);
1856
1857 va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE,
1858 VMALLOC_START, VMALLOC_END,
1859 node, gfp_mask);
ddf9c6d4 1860 if (IS_ERR(va)) {
db64fe02 1861 kfree(vb);
e7d86340 1862 return ERR_CAST(va);
db64fe02
NP
1863 }
1864
cf725ce2 1865 vaddr = vmap_block_vaddr(va->va_start, 0);
db64fe02
NP
1866 spin_lock_init(&vb->lock);
1867 vb->va = va;
cf725ce2
RP
1868 /* At least something should be left free */
1869 BUG_ON(VMAP_BBMAP_BITS <= (1UL << order));
1870 vb->free = VMAP_BBMAP_BITS - (1UL << order);
db64fe02 1871 vb->dirty = 0;
7d61bfe8
RP
1872 vb->dirty_min = VMAP_BBMAP_BITS;
1873 vb->dirty_max = 0;
db64fe02 1874 INIT_LIST_HEAD(&vb->free_list);
db64fe02
NP
1875
1876 vb_idx = addr_to_vb_idx(va->va_start);
0f14599c
MWO
1877 err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
1878 if (err) {
1879 kfree(vb);
1880 free_vmap_area(va);
1881 return ERR_PTR(err);
1882 }
db64fe02
NP
1883
1884 vbq = &get_cpu_var(vmap_block_queue);
db64fe02 1885 spin_lock(&vbq->lock);
68ac546f 1886 list_add_tail_rcu(&vb->free_list, &vbq->free);
db64fe02 1887 spin_unlock(&vbq->lock);
3f04ba85 1888 put_cpu_var(vmap_block_queue);
db64fe02 1889
cf725ce2 1890 return vaddr;
db64fe02
NP
1891}
1892
db64fe02
NP
1893static void free_vmap_block(struct vmap_block *vb)
1894{
1895 struct vmap_block *tmp;
db64fe02 1896
0f14599c 1897 tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
db64fe02
NP
1898 BUG_ON(tmp != vb);
1899
64141da5 1900 free_vmap_area_noflush(vb->va);
22a3c7d1 1901 kfree_rcu(vb, rcu_head);
db64fe02
NP
1902}
1903
02b709df
NP
1904static void purge_fragmented_blocks(int cpu)
1905{
1906 LIST_HEAD(purge);
1907 struct vmap_block *vb;
1908 struct vmap_block *n_vb;
1909 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
1910
1911 rcu_read_lock();
1912 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
1913
1914 if (!(vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS))
1915 continue;
1916
1917 spin_lock(&vb->lock);
1918 if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
1919 vb->free = 0; /* prevent further allocs after releasing lock */
1920 vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
7d61bfe8
RP
1921 vb->dirty_min = 0;
1922 vb->dirty_max = VMAP_BBMAP_BITS;
02b709df
NP
1923 spin_lock(&vbq->lock);
1924 list_del_rcu(&vb->free_list);
1925 spin_unlock(&vbq->lock);
1926 spin_unlock(&vb->lock);
1927 list_add_tail(&vb->purge, &purge);
1928 } else
1929 spin_unlock(&vb->lock);
1930 }
1931 rcu_read_unlock();
1932
1933 list_for_each_entry_safe(vb, n_vb, &purge, purge) {
1934 list_del(&vb->purge);
1935 free_vmap_block(vb);
1936 }
1937}
1938
02b709df
NP
1939static void purge_fragmented_blocks_allcpus(void)
1940{
1941 int cpu;
1942
1943 for_each_possible_cpu(cpu)
1944 purge_fragmented_blocks(cpu);
1945}
1946
db64fe02
NP
1947static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
1948{
1949 struct vmap_block_queue *vbq;
1950 struct vmap_block *vb;
cf725ce2 1951 void *vaddr = NULL;
db64fe02
NP
1952 unsigned int order;
1953
891c49ab 1954 BUG_ON(offset_in_page(size));
db64fe02 1955 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
aa91c4d8
JK
1956 if (WARN_ON(size == 0)) {
1957 /*
 1958 * Allocating 0 bytes isn't what the caller wants, since
 1959 * get_order(0) returns a nonsensical result. Just warn and
 1960 * bail out early.
1961 */
1962 return NULL;
1963 }
db64fe02
NP
1964 order = get_order(size);
1965
db64fe02
NP
1966 rcu_read_lock();
1967 vbq = &get_cpu_var(vmap_block_queue);
1968 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
cf725ce2 1969 unsigned long pages_off;
db64fe02
NP
1970
1971 spin_lock(&vb->lock);
cf725ce2
RP
1972 if (vb->free < (1UL << order)) {
1973 spin_unlock(&vb->lock);
1974 continue;
1975 }
02b709df 1976
cf725ce2
RP
1977 pages_off = VMAP_BBMAP_BITS - vb->free;
1978 vaddr = vmap_block_vaddr(vb->va->va_start, pages_off);
02b709df
NP
1979 vb->free -= 1UL << order;
1980 if (vb->free == 0) {
1981 spin_lock(&vbq->lock);
1982 list_del_rcu(&vb->free_list);
1983 spin_unlock(&vbq->lock);
1984 }
cf725ce2 1985
02b709df
NP
1986 spin_unlock(&vb->lock);
1987 break;
db64fe02 1988 }
02b709df 1989
3f04ba85 1990 put_cpu_var(vmap_block_queue);
db64fe02
NP
1991 rcu_read_unlock();
1992
cf725ce2
RP
1993 /* Allocate new block if nothing was found */
1994 if (!vaddr)
1995 vaddr = new_vmap_block(order, gfp_mask);
db64fe02 1996
cf725ce2 1997 return vaddr;
db64fe02
NP
1998}
1999
78a0e8c4 2000static void vb_free(unsigned long addr, unsigned long size)
db64fe02
NP
2001{
2002 unsigned long offset;
db64fe02
NP
2003 unsigned int order;
2004 struct vmap_block *vb;
2005
891c49ab 2006 BUG_ON(offset_in_page(size));
db64fe02 2007 BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
b29acbdc 2008
78a0e8c4 2009 flush_cache_vunmap(addr, addr + size);
b29acbdc 2010
db64fe02 2011 order = get_order(size);
78a0e8c4 2012 offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
0f14599c 2013 vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
db64fe02 2014
4ad0ae8c 2015 vunmap_range_noflush(addr, addr + size);
64141da5 2016
8e57f8ac 2017 if (debug_pagealloc_enabled_static())
78a0e8c4 2018 flush_tlb_kernel_range(addr, addr + size);
82a2e924 2019
db64fe02 2020 spin_lock(&vb->lock);
7d61bfe8
RP
2021
2022 /* Expand dirty range */
2023 vb->dirty_min = min(vb->dirty_min, offset);
2024 vb->dirty_max = max(vb->dirty_max, offset + (1UL << order));
d086817d 2025
db64fe02
NP
2026 vb->dirty += 1UL << order;
2027 if (vb->dirty == VMAP_BBMAP_BITS) {
de560423 2028 BUG_ON(vb->free);
db64fe02
NP
2029 spin_unlock(&vb->lock);
2030 free_vmap_block(vb);
2031 } else
2032 spin_unlock(&vb->lock);
2033}
2034
868b104d 2035static void _vm_unmap_aliases(unsigned long start, unsigned long end, int flush)
db64fe02 2036{
db64fe02 2037 int cpu;
db64fe02 2038
9b463334
JF
2039 if (unlikely(!vmap_initialized))
2040 return;
2041
5803ed29
CH
2042 might_sleep();
2043
db64fe02
NP
2044 for_each_possible_cpu(cpu) {
2045 struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu);
2046 struct vmap_block *vb;
2047
2048 rcu_read_lock();
2049 list_for_each_entry_rcu(vb, &vbq->free, free_list) {
db64fe02 2050 spin_lock(&vb->lock);
ad216c03 2051 if (vb->dirty && vb->dirty != VMAP_BBMAP_BITS) {
7d61bfe8 2052 unsigned long va_start = vb->va->va_start;
db64fe02 2053 unsigned long s, e;
b136be5e 2054
7d61bfe8
RP
2055 s = va_start + (vb->dirty_min << PAGE_SHIFT);
2056 e = va_start + (vb->dirty_max << PAGE_SHIFT);
db64fe02 2057
7d61bfe8
RP
2058 start = min(s, start);
2059 end = max(e, end);
db64fe02 2060
7d61bfe8 2061 flush = 1;
db64fe02
NP
2062 }
2063 spin_unlock(&vb->lock);
2064 }
2065 rcu_read_unlock();
2066 }
2067
f9e09977 2068 mutex_lock(&vmap_purge_lock);
0574ecd1
CH
2069 purge_fragmented_blocks_allcpus();
2070 if (!__purge_vmap_area_lazy(start, end) && flush)
2071 flush_tlb_kernel_range(start, end);
f9e09977 2072 mutex_unlock(&vmap_purge_lock);
db64fe02 2073}
868b104d
RE
2074
2075/**
2076 * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer
2077 *
2078 * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily
 2079 * to amortize TLB flushing overheads. This means that any page you
 2080 * have now may, in a former life, have been mapped into a kernel virtual
 2081 * address by the vmap layer, so some CPUs might still hold TLB entries
 2082 * referencing that page (in addition to the regular 1:1 kernel mapping).
2083 *
2084 * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can
2085 * be sure that none of the pages we have control over will have any aliases
2086 * from the vmap layer.
2087 */
2088void vm_unmap_aliases(void)
2089{
2090 unsigned long start = ULONG_MAX, end = 0;
2091 int flush = 0;
2092
2093 _vm_unmap_aliases(start, end, flush);
2094}
db64fe02
NP
2095EXPORT_SYMBOL_GPL(vm_unmap_aliases);
2096
2097/**
2098 * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram
2099 * @mem: the pointer returned by vm_map_ram
2100 * @count: the count passed to that vm_map_ram call (cannot unmap partial)
2101 */
2102void vm_unmap_ram(const void *mem, unsigned int count)
2103{
65ee03c4 2104 unsigned long size = (unsigned long)count << PAGE_SHIFT;
db64fe02 2105 unsigned long addr = (unsigned long)mem;
9c3acf60 2106 struct vmap_area *va;
db64fe02 2107
5803ed29 2108 might_sleep();
db64fe02
NP
2109 BUG_ON(!addr);
2110 BUG_ON(addr < VMALLOC_START);
2111 BUG_ON(addr > VMALLOC_END);
a1c0b1a0 2112 BUG_ON(!PAGE_ALIGNED(addr));
db64fe02 2113
d98c9e83
AR
2114 kasan_poison_vmalloc(mem, size);
2115
9c3acf60 2116 if (likely(count <= VMAP_MAX_ALLOC)) {
05e3ff95 2117 debug_check_no_locks_freed(mem, size);
78a0e8c4 2118 vb_free(addr, size);
9c3acf60
CH
2119 return;
2120 }
2121
2122 va = find_vmap_area(addr);
2123 BUG_ON(!va);
05e3ff95
CP
2124 debug_check_no_locks_freed((void *)va->va_start,
2125 (va->va_end - va->va_start));
9c3acf60 2126 free_unmap_vmap_area(va);
db64fe02
NP
2127}
2128EXPORT_SYMBOL(vm_unmap_ram);
2129
2130/**
2131 * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space)
2132 * @pages: an array of pointers to the pages to be mapped
2133 * @count: number of pages
2134 * @node: prefer to allocate data structures on this node
e99c97ad 2135 *
36437638
GK
 2136 * If you use this function for fewer than VMAP_MAX_ALLOC pages, it can be
 2137 * faster than vmap(), so it is a good choice there. But if you mix long-lived
 2138 * and short-lived objects with vm_map_ram(), it can consume a lot of address
 2139 * space through fragmentation (especially on a 32bit machine), and you may
 2140 * eventually see allocation failures. Use this function for short-lived objects.
2141 *
e99c97ad 2142 * Returns: a pointer to the address that has been mapped, or %NULL on failure
db64fe02 2143 */
d4efd79a 2144void *vm_map_ram(struct page **pages, unsigned int count, int node)
db64fe02 2145{
65ee03c4 2146 unsigned long size = (unsigned long)count << PAGE_SHIFT;
db64fe02
NP
2147 unsigned long addr;
2148 void *mem;
2149
2150 if (likely(count <= VMAP_MAX_ALLOC)) {
2151 mem = vb_alloc(size, GFP_KERNEL);
2152 if (IS_ERR(mem))
2153 return NULL;
2154 addr = (unsigned long)mem;
2155 } else {
2156 struct vmap_area *va;
2157 va = alloc_vmap_area(size, PAGE_SIZE,
2158 VMALLOC_START, VMALLOC_END, node, GFP_KERNEL);
2159 if (IS_ERR(va))
2160 return NULL;
2161
2162 addr = va->va_start;
2163 mem = (void *)addr;
2164 }
d98c9e83
AR
2165
2166 kasan_unpoison_vmalloc(mem, size);
2167
b67177ec
NP
2168 if (vmap_pages_range(addr, addr + size, PAGE_KERNEL,
2169 pages, PAGE_SHIFT) < 0) {
db64fe02
NP
2170 vm_unmap_ram(mem, count);
2171 return NULL;
2172 }
b67177ec 2173
db64fe02
NP
2174 return mem;
2175}
2176EXPORT_SYMBOL(vm_map_ram);
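/*
 * Usage sketch (illustrative only, not part of this file): map a small,
 * short-lived array of pages and tear the mapping down again. The helper
 * names and the origin of @pages are hypothetical; vm_map_ram() and
 * vm_unmap_ram() are the APIs documented above.
 */
static void *example_map_short_lived(struct page **pages, unsigned int nr_pages)
{
	/* The fast per-cpu block path is used when nr_pages <= VMAP_MAX_ALLOC. */
	return vm_map_ram(pages, nr_pages, NUMA_NO_NODE);
}

static void example_unmap_short_lived(void *vaddr, unsigned int nr_pages)
{
	/* Must pass the same page count that was given to vm_map_ram(). */
	vm_unmap_ram(vaddr, nr_pages);
}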
2177
4341fa45 2178static struct vm_struct *vmlist __initdata;
92eac168 2179
121e6f32
NP
2180static inline unsigned int vm_area_page_order(struct vm_struct *vm)
2181{
2182#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
2183 return vm->page_order;
2184#else
2185 return 0;
2186#endif
2187}
2188
2189static inline void set_vm_area_page_order(struct vm_struct *vm, unsigned int order)
2190{
2191#ifdef CONFIG_HAVE_ARCH_HUGE_VMALLOC
2192 vm->page_order = order;
2193#else
2194 BUG_ON(order != 0);
2195#endif
2196}
2197
be9b7335
NP
2198/**
2199 * vm_area_add_early - add vmap area early during boot
2200 * @vm: vm_struct to add
2201 *
 2202 * This function is used to add a fixed kernel vm area to vmlist before
2203 * vmalloc_init() is called. @vm->addr, @vm->size, and @vm->flags
2204 * should contain proper values and the other fields should be zero.
2205 *
2206 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
2207 */
2208void __init vm_area_add_early(struct vm_struct *vm)
2209{
2210 struct vm_struct *tmp, **p;
2211
2212 BUG_ON(vmap_initialized);
2213 for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) {
2214 if (tmp->addr >= vm->addr) {
2215 BUG_ON(tmp->addr < vm->addr + vm->size);
2216 break;
2217 } else
2218 BUG_ON(tmp->addr + tmp->size > vm->addr);
2219 }
2220 vm->next = *p;
2221 *p = vm;
2222}
2223
f0aa6617
TH
2224/**
2225 * vm_area_register_early - register vmap area early during boot
2226 * @vm: vm_struct to register
c0c0a293 2227 * @align: requested alignment
f0aa6617
TH
2228 *
 2229 * This function is used to register a kernel vm area before
2230 * vmalloc_init() is called. @vm->size and @vm->flags should contain
2231 * proper values on entry and other fields should be zero. On return,
2232 * vm->addr contains the allocated address.
2233 *
2234 * DO NOT USE THIS FUNCTION UNLESS YOU KNOW WHAT YOU'RE DOING.
2235 */
c0c0a293 2236void __init vm_area_register_early(struct vm_struct *vm, size_t align)
f0aa6617
TH
2237{
2238 static size_t vm_init_off __initdata;
c0c0a293
TH
2239 unsigned long addr;
2240
2241 addr = ALIGN(VMALLOC_START + vm_init_off, align);
2242 vm_init_off = PFN_ALIGN(addr + vm->size) - VMALLOC_START;
f0aa6617 2243
c0c0a293 2244 vm->addr = (void *)addr;
f0aa6617 2245
be9b7335 2246 vm_area_add_early(vm);
f0aa6617
TH
2247}
2248
68ad4a33
URS
2249static void vmap_init_free_space(void)
2250{
2251 unsigned long vmap_start = 1;
2252 const unsigned long vmap_end = ULONG_MAX;
2253 struct vmap_area *busy, *free;
2254
2255 /*
2256 * B F B B B F
2257 * -|-----|.....|-----|-----|-----|.....|-
2258 * | The KVA space |
2259 * |<--------------------------------->|
2260 */
2261 list_for_each_entry(busy, &vmap_area_list, list) {
2262 if (busy->va_start - vmap_start > 0) {
2263 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
2264 if (!WARN_ON_ONCE(!free)) {
2265 free->va_start = vmap_start;
2266 free->va_end = busy->va_start;
2267
2268 insert_vmap_area_augment(free, NULL,
2269 &free_vmap_area_root,
2270 &free_vmap_area_list);
2271 }
2272 }
2273
2274 vmap_start = busy->va_end;
2275 }
2276
2277 if (vmap_end - vmap_start > 0) {
2278 free = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
2279 if (!WARN_ON_ONCE(!free)) {
2280 free->va_start = vmap_start;
2281 free->va_end = vmap_end;
2282
2283 insert_vmap_area_augment(free, NULL,
2284 &free_vmap_area_root,
2285 &free_vmap_area_list);
2286 }
2287 }
2288}
2289
db64fe02
NP
2290void __init vmalloc_init(void)
2291{
822c18f2
IK
2292 struct vmap_area *va;
2293 struct vm_struct *tmp;
db64fe02
NP
2294 int i;
2295
68ad4a33
URS
2296 /*
2297 * Create the cache for vmap_area objects.
2298 */
2299 vmap_area_cachep = KMEM_CACHE(vmap_area, SLAB_PANIC);
2300
db64fe02
NP
2301 for_each_possible_cpu(i) {
2302 struct vmap_block_queue *vbq;
32fcfd40 2303 struct vfree_deferred *p;
db64fe02
NP
2304
2305 vbq = &per_cpu(vmap_block_queue, i);
2306 spin_lock_init(&vbq->lock);
2307 INIT_LIST_HEAD(&vbq->free);
32fcfd40
AV
2308 p = &per_cpu(vfree_deferred, i);
2309 init_llist_head(&p->list);
2310 INIT_WORK(&p->wq, free_work);
db64fe02 2311 }
9b463334 2312
822c18f2
IK
2313 /* Import existing vmlist entries. */
2314 for (tmp = vmlist; tmp; tmp = tmp->next) {
68ad4a33
URS
2315 va = kmem_cache_zalloc(vmap_area_cachep, GFP_NOWAIT);
2316 if (WARN_ON_ONCE(!va))
2317 continue;
2318
822c18f2
IK
2319 va->va_start = (unsigned long)tmp->addr;
2320 va->va_end = va->va_start + tmp->size;
dbda591d 2321 va->vm = tmp;
68ad4a33 2322 insert_vmap_area(va, &vmap_area_root, &vmap_area_list);
822c18f2 2323 }
ca23e405 2324
68ad4a33
URS
2325 /*
2326 * Now we can initialize a free vmap space.
2327 */
2328 vmap_init_free_space();
9b463334 2329 vmap_initialized = true;
db64fe02
NP
2330}
2331
e36176be
URS
2332static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
2333 struct vmap_area *va, unsigned long flags, const void *caller)
cf88c790 2334{
cf88c790
TH
2335 vm->flags = flags;
2336 vm->addr = (void *)va->va_start;
2337 vm->size = va->va_end - va->va_start;
2338 vm->caller = caller;
db1aecaf 2339 va->vm = vm;
e36176be
URS
2340}
2341
2342static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
2343 unsigned long flags, const void *caller)
2344{
2345 spin_lock(&vmap_area_lock);
2346 setup_vmalloc_vm_locked(vm, va, flags, caller);
c69480ad 2347 spin_unlock(&vmap_area_lock);
f5252e00 2348}
cf88c790 2349
20fc02b4 2350static void clear_vm_uninitialized_flag(struct vm_struct *vm)
f5252e00 2351{
d4033afd 2352 /*
20fc02b4 2353 * Before removing VM_UNINITIALIZED,
d4033afd
JK
2354 * we should make sure that vm has proper values.
2355 * Pair with smp_rmb() in show_numa_info().
2356 */
2357 smp_wmb();
20fc02b4 2358 vm->flags &= ~VM_UNINITIALIZED;
cf88c790
TH
2359}
2360
db64fe02 2361static struct vm_struct *__get_vm_area_node(unsigned long size,
7ca3027b
DA
2362 unsigned long align, unsigned long shift, unsigned long flags,
2363 unsigned long start, unsigned long end, int node,
2364 gfp_t gfp_mask, const void *caller)
db64fe02 2365{
0006526d 2366 struct vmap_area *va;
db64fe02 2367 struct vm_struct *area;
d98c9e83 2368 unsigned long requested_size = size;
1da177e4 2369
52fd24ca 2370 BUG_ON(in_interrupt());
7ca3027b 2371 size = ALIGN(size, 1ul << shift);
31be8309
OH
2372 if (unlikely(!size))
2373 return NULL;
1da177e4 2374
252e5c6e 2375 if (flags & VM_IOREMAP)
2376 align = 1ul << clamp_t(int, get_count_order_long(size),
2377 PAGE_SHIFT, IOREMAP_MAX_ORDER);
2378
cf88c790 2379 area = kzalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node);
1da177e4
LT
2380 if (unlikely(!area))
2381 return NULL;
2382
71394fe5
AR
2383 if (!(flags & VM_NO_GUARD))
2384 size += PAGE_SIZE;
1da177e4 2385
db64fe02
NP
2386 va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
2387 if (IS_ERR(va)) {
2388 kfree(area);
2389 return NULL;
1da177e4 2390 }
1da177e4 2391
d98c9e83 2392 kasan_unpoison_vmalloc((void *)va->va_start, requested_size);
f5252e00 2393
d98c9e83 2394 setup_vmalloc_vm(area, va, flags, caller);
3c5c3cfb 2395
1da177e4 2396 return area;
1da177e4
LT
2397}
2398
c2968612
BH
2399struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
2400 unsigned long start, unsigned long end,
5e6cafc8 2401 const void *caller)
c2968612 2402{
7ca3027b
DA
2403 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end,
2404 NUMA_NO_NODE, GFP_KERNEL, caller);
c2968612
BH
2405}
2406
1da177e4 2407/**
92eac168
MR
2408 * get_vm_area - reserve a contiguous kernel virtual area
2409 * @size: size of the area
2410 * @flags: %VM_IOREMAP for I/O mappings or VM_ALLOC
1da177e4 2411 *
92eac168
MR
 2412 * Search for an area of @size in the kernel virtual mapping area
 2413 * and reserve it for our purposes.
a862f68a
MR
2415 *
2416 * Return: the area descriptor on success or %NULL on failure.
1da177e4
LT
2417 */
2418struct vm_struct *get_vm_area(unsigned long size, unsigned long flags)
2419{
7ca3027b
DA
2420 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
2421 VMALLOC_START, VMALLOC_END,
00ef2d2f
DR
2422 NUMA_NO_NODE, GFP_KERNEL,
2423 __builtin_return_address(0));
23016969
CL
2424}
2425
2426struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags,
5e6cafc8 2427 const void *caller)
23016969 2428{
7ca3027b
DA
2429 return __get_vm_area_node(size, 1, PAGE_SHIFT, flags,
2430 VMALLOC_START, VMALLOC_END,
00ef2d2f 2431 NUMA_NO_NODE, GFP_KERNEL, caller);
1da177e4
LT
2432}
2433
e9da6e99 2434/**
92eac168
MR
 2435 * find_vm_area - find a contiguous kernel virtual area
2436 * @addr: base address
e9da6e99 2437 *
92eac168
MR
2438 * Search for the kernel VM area starting at @addr, and return it.
2439 * It is up to the caller to do all required locking to keep the returned
2440 * pointer valid.
a862f68a 2441 *
74640617 2442 * Return: the area descriptor on success or %NULL on failure.
e9da6e99
MS
2443 */
2444struct vm_struct *find_vm_area(const void *addr)
83342314 2445{
db64fe02 2446 struct vmap_area *va;
83342314 2447
db64fe02 2448 va = find_vmap_area((unsigned long)addr);
688fcbfc
PL
2449 if (!va)
2450 return NULL;
1da177e4 2451
688fcbfc 2452 return va->vm;
1da177e4
LT
2453}
2454
7856dfeb 2455/**
92eac168
MR
 2456 * remove_vm_area - find and remove a contiguous kernel virtual area
2457 * @addr: base address
7856dfeb 2458 *
92eac168
MR
2459 * Search for the kernel VM area starting at @addr, and remove it.
2460 * This function returns the found VM area, but using it is NOT safe
2461 * on SMP machines, except for its size or flags.
a862f68a 2462 *
74640617 2463 * Return: the area descriptor on success or %NULL on failure.
7856dfeb 2464 */
b3bdda02 2465struct vm_struct *remove_vm_area(const void *addr)
7856dfeb 2466{
db64fe02
NP
2467 struct vmap_area *va;
2468
5803ed29
CH
2469 might_sleep();
2470
dd3b8353
URS
2471 spin_lock(&vmap_area_lock);
2472 va = __find_vmap_area((unsigned long)addr);
688fcbfc 2473 if (va && va->vm) {
db1aecaf 2474 struct vm_struct *vm = va->vm;
f5252e00 2475
c69480ad 2476 va->vm = NULL;
c69480ad
JK
2477 spin_unlock(&vmap_area_lock);
2478
a5af5aa8 2479 kasan_free_shadow(vm);
dd32c279 2480 free_unmap_vmap_area(va);
dd32c279 2481
db64fe02
NP
2482 return vm;
2483 }
dd3b8353
URS
2484
2485 spin_unlock(&vmap_area_lock);
db64fe02 2486 return NULL;
7856dfeb
AK
2487}
2488
868b104d
RE
2489static inline void set_area_direct_map(const struct vm_struct *area,
2490 int (*set_direct_map)(struct page *page))
2491{
2492 int i;
2493
121e6f32 2494 /* HUGE_VMALLOC passes small pages to set_direct_map */
868b104d
RE
2495 for (i = 0; i < area->nr_pages; i++)
2496 if (page_address(area->pages[i]))
2497 set_direct_map(area->pages[i]);
2498}
2499
2500/* Handle removing and resetting vm mappings related to the vm_struct. */
2501static void vm_remove_mappings(struct vm_struct *area, int deallocate_pages)
2502{
868b104d 2503 unsigned long start = ULONG_MAX, end = 0;
121e6f32 2504 unsigned int page_order = vm_area_page_order(area);
868b104d 2505 int flush_reset = area->flags & VM_FLUSH_RESET_PERMS;
31e67340 2506 int flush_dmap = 0;
868b104d
RE
2507 int i;
2508
868b104d
RE
2509 remove_vm_area(area->addr);
2510
2511 /* If this is not VM_FLUSH_RESET_PERMS memory, no need for the below. */
2512 if (!flush_reset)
2513 return;
2514
2515 /*
2516 * If not deallocating pages, just do the flush of the VM area and
2517 * return.
2518 */
2519 if (!deallocate_pages) {
2520 vm_unmap_aliases();
2521 return;
2522 }
2523
2524 /*
2525 * If execution gets here, flush the vm mapping and reset the direct
2526 * map. Find the start and end range of the direct mappings to make sure
2527 * the vm_unmap_aliases() flush includes the direct map.
2528 */
121e6f32 2529 for (i = 0; i < area->nr_pages; i += 1U << page_order) {
8e41f872
RE
2530 unsigned long addr = (unsigned long)page_address(area->pages[i]);
2531 if (addr) {
121e6f32
NP
2532 unsigned long page_size;
2533
2534 page_size = PAGE_SIZE << page_order;
868b104d 2535 start = min(addr, start);
121e6f32 2536 end = max(addr + page_size, end);
31e67340 2537 flush_dmap = 1;
868b104d
RE
2538 }
2539 }
2540
2541 /*
2542 * Set direct map to something invalid so that it won't be cached if
2543 * there are any accesses after the TLB flush, then flush the TLB and
2544 * reset the direct map permissions to the default.
2545 */
2546 set_area_direct_map(area, set_direct_map_invalid_noflush);
31e67340 2547 _vm_unmap_aliases(start, end, flush_dmap);
868b104d
RE
2548 set_area_direct_map(area, set_direct_map_default_noflush);
2549}
2550
b3bdda02 2551static void __vunmap(const void *addr, int deallocate_pages)
1da177e4
LT
2552{
2553 struct vm_struct *area;
2554
2555 if (!addr)
2556 return;
2557
e69e9d4a 2558 if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
ab15d9b4 2559 addr))
1da177e4 2560 return;
1da177e4 2561
6ade2032 2562 area = find_vm_area(addr);
1da177e4 2563 if (unlikely(!area)) {
4c8573e2 2564 WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n",
1da177e4 2565 addr);
1da177e4
LT
2566 return;
2567 }
2568
05e3ff95
CP
2569 debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
2570 debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
9a11b49a 2571
c041098c 2572 kasan_poison_vmalloc(area->addr, get_vm_area_size(area));
3c5c3cfb 2573
868b104d
RE
2574 vm_remove_mappings(area, deallocate_pages);
2575
1da177e4 2576 if (deallocate_pages) {
121e6f32 2577 unsigned int page_order = vm_area_page_order(area);
1da177e4
LT
2578 int i;
2579
121e6f32 2580 for (i = 0; i < area->nr_pages; i += 1U << page_order) {
bf53d6f8
CL
2581 struct page *page = area->pages[i];
2582
2583 BUG_ON(!page);
121e6f32 2584 __free_pages(page, page_order);
a850e932 2585 cond_resched();
1da177e4 2586 }
97105f0a 2587 atomic_long_sub(area->nr_pages, &nr_vmalloc_pages);
1da177e4 2588
244d63ee 2589 kvfree(area->pages);
1da177e4
LT
2590 }
2591
2592 kfree(area);
1da177e4 2593}
bf22e37a
AR
2594
2595static inline void __vfree_deferred(const void *addr)
2596{
2597 /*
2598 * Use raw_cpu_ptr() because this can be called from preemptible
2599 * context. Preemption is absolutely fine here, because the llist_add()
2600 * implementation is lockless, so it works even if we are adding to
73221d88 2601 * another cpu's list. schedule_work() should be fine with this too.
bf22e37a
AR
2602 */
2603 struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
2604
2605 if (llist_add((struct llist_node *)addr, &p->list))
2606 schedule_work(&p->wq);
2607}
2608
2609/**
92eac168
MR
2610 * vfree_atomic - release memory allocated by vmalloc()
2611 * @addr: memory base address
bf22e37a 2612 *
92eac168
MR
2613 * This one is just like vfree() but can be called in any atomic context
2614 * except NMIs.
bf22e37a
AR
2615 */
2616void vfree_atomic(const void *addr)
2617{
2618 BUG_ON(in_nmi());
2619
2620 kmemleak_free(addr);
2621
2622 if (!addr)
2623 return;
2624 __vfree_deferred(addr);
2625}
2626
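/*
 * Usage sketch (illustrative only): releasing a vmalloc()'ed buffer from a
 * context that must not sleep, e.g. a timer or softirq callback. The caller
 * and buffer are hypothetical; vfree_atomic() is the API documented above.
 */
static void example_release_in_atomic_context(void *vbuf)
{
	/* vfree() might sleep here; vfree_atomic() defers the actual free. */
	vfree_atomic(vbuf);
}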
c67dc624
RP
2627static void __vfree(const void *addr)
2628{
2629 if (unlikely(in_interrupt()))
2630 __vfree_deferred(addr);
2631 else
2632 __vunmap(addr, 1);
2633}
2634
1da177e4 2635/**
fa307474
MWO
2636 * vfree - Release memory allocated by vmalloc()
2637 * @addr: Memory base address
1da177e4 2638 *
fa307474
MWO
 2639 * Free the virtually contiguous memory area starting at @addr, as obtained
2640 * from one of the vmalloc() family of APIs. This will usually also free the
2641 * physical memory underlying the virtual allocation, but that memory is
2642 * reference counted, so it will not be freed until the last user goes away.
1da177e4 2643 *
fa307474 2644 * If @addr is NULL, no operation is performed.
c9fcee51 2645 *
fa307474 2646 * Context:
92eac168 2647 * May sleep if called *not* from interrupt context.
fa307474
MWO
2648 * Must not be called in NMI context (strictly speaking, it could be
2649 * if we have CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG, but making the calling
f0953a1b 2650 * conventions for vfree() arch-dependent would be a really bad idea).
1da177e4 2651 */
b3bdda02 2652void vfree(const void *addr)
1da177e4 2653{
32fcfd40 2654 BUG_ON(in_nmi());
89219d37
CM
2655
2656 kmemleak_free(addr);
2657
a8dda165
AR
2658 might_sleep_if(!in_interrupt());
2659
32fcfd40
AV
2660 if (!addr)
2661 return;
c67dc624
RP
2662
2663 __vfree(addr);
1da177e4 2664}
1da177e4
LT
2665EXPORT_SYMBOL(vfree);
2666
2667/**
92eac168
MR
2668 * vunmap - release virtual mapping obtained by vmap()
2669 * @addr: memory base address
1da177e4 2670 *
92eac168
MR
2671 * Free the virtually contiguous memory area starting at @addr,
2672 * which was created from the page array passed to vmap().
1da177e4 2673 *
92eac168 2674 * Must not be called in interrupt context.
1da177e4 2675 */
b3bdda02 2676void vunmap(const void *addr)
1da177e4
LT
2677{
2678 BUG_ON(in_interrupt());
34754b69 2679 might_sleep();
32fcfd40
AV
2680 if (addr)
2681 __vunmap(addr, 0);
1da177e4 2682}
1da177e4
LT
2683EXPORT_SYMBOL(vunmap);
2684
2685/**
92eac168
MR
2686 * vmap - map an array of pages into virtually contiguous space
2687 * @pages: array of page pointers
2688 * @count: number of pages to map
2689 * @flags: vm_area->flags
2690 * @prot: page protection for the mapping
2691 *
b944afc9
CH
2692 * Maps @count pages from @pages into contiguous kernel virtual space.
2693 * If @flags contains %VM_MAP_PUT_PAGES the ownership of the pages array itself
2694 * (which must be kmalloc or vmalloc memory) and one reference per pages in it
2695 * are transferred from the caller to vmap(), and will be freed / dropped when
2696 * vfree() is called on the return value.
a862f68a
MR
2697 *
2698 * Return: the address of the area or %NULL on failure
1da177e4
LT
2699 */
2700void *vmap(struct page **pages, unsigned int count,
92eac168 2701 unsigned long flags, pgprot_t prot)
1da177e4
LT
2702{
2703 struct vm_struct *area;
b67177ec 2704 unsigned long addr;
65ee03c4 2705 unsigned long size; /* In bytes */
1da177e4 2706
34754b69
PZ
2707 might_sleep();
2708
ca79b0c2 2709 if (count > totalram_pages())
1da177e4
LT
2710 return NULL;
2711
65ee03c4
GJM
2712 size = (unsigned long)count << PAGE_SHIFT;
2713 area = get_vm_area_caller(size, flags, __builtin_return_address(0));
1da177e4
LT
2714 if (!area)
2715 return NULL;
23016969 2716
b67177ec
NP
2717 addr = (unsigned long)area->addr;
2718 if (vmap_pages_range(addr, addr + size, pgprot_nx(prot),
2719 pages, PAGE_SHIFT) < 0) {
1da177e4
LT
2720 vunmap(area->addr);
2721 return NULL;
2722 }
2723
c22ee528 2724 if (flags & VM_MAP_PUT_PAGES) {
b944afc9 2725 area->pages = pages;
c22ee528
ML
2726 area->nr_pages = count;
2727 }
1da177e4
LT
2728 return area->addr;
2729}
1da177e4
LT
2730EXPORT_SYMBOL(vmap);
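/*
 * Usage sketch (illustrative only): gather individually allocated pages and
 * make them virtually contiguous with vmap(). The helper name and the @pages
 * array are hypothetical; vmap()/vunmap() are the APIs documented above.
 * Note that vunmap() alone does not free the pages unless VM_MAP_PUT_PAGES
 * was passed.
 */
static void *example_vmap_pages(struct page **pages, unsigned int nr)
{
	unsigned int i;
	void *vaddr;

	for (i = 0; i < nr; i++) {
		pages[i] = alloc_page(GFP_KERNEL);
		if (!pages[i])
			goto fail;
	}

	vaddr = vmap(pages, nr, VM_MAP, PAGE_KERNEL);
	if (vaddr)
		return vaddr;
fail:
	while (i--)
		__free_page(pages[i]);
	return NULL;
}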
2731
3e9a9e25
CH
2732#ifdef CONFIG_VMAP_PFN
2733struct vmap_pfn_data {
2734 unsigned long *pfns;
2735 pgprot_t prot;
2736 unsigned int idx;
2737};
2738
2739static int vmap_pfn_apply(pte_t *pte, unsigned long addr, void *private)
2740{
2741 struct vmap_pfn_data *data = private;
2742
2743 if (WARN_ON_ONCE(pfn_valid(data->pfns[data->idx])))
2744 return -EINVAL;
2745 *pte = pte_mkspecial(pfn_pte(data->pfns[data->idx++], data->prot));
2746 return 0;
2747}
2748
2749/**
2750 * vmap_pfn - map an array of PFNs into virtually contiguous space
2751 * @pfns: array of PFNs
2752 * @count: number of pages to map
2753 * @prot: page protection for the mapping
2754 *
2755 * Maps @count PFNs from @pfns into contiguous kernel virtual space and returns
2756 * the start address of the mapping.
2757 */
2758void *vmap_pfn(unsigned long *pfns, unsigned int count, pgprot_t prot)
2759{
2760 struct vmap_pfn_data data = { .pfns = pfns, .prot = pgprot_nx(prot) };
2761 struct vm_struct *area;
2762
2763 area = get_vm_area_caller(count * PAGE_SIZE, VM_IOREMAP,
2764 __builtin_return_address(0));
2765 if (!area)
2766 return NULL;
2767 if (apply_to_page_range(&init_mm, (unsigned long)area->addr,
2768 count * PAGE_SIZE, vmap_pfn_apply, &data)) {
2769 free_vm_area(area);
2770 return NULL;
2771 }
2772 return area->addr;
2773}
2774EXPORT_SYMBOL_GPL(vmap_pfn);
2775#endif /* CONFIG_VMAP_PFN */
2776
12b9f873
UR
2777static inline unsigned int
2778vm_area_alloc_pages(gfp_t gfp, int nid,
2779 unsigned int order, unsigned long nr_pages, struct page **pages)
2780{
2781 unsigned int nr_allocated = 0;
2782
2783 /*
 2784 * For order-0 pages we make use of the bulk allocator; if
 2785 * the page array is partly or not at all populated due
 2786 * to failures, fall back to a single page allocator that is
 2787 * more permissive.
2788 */
2789 if (!order)
2790 nr_allocated = alloc_pages_bulk_array_node(
2791 gfp, nid, nr_pages, pages);
2792 else
2793 /*
 2794 * Compound pages are required for remap_vmalloc_page when
 2795 * using high-order pages.
2796 */
2797 gfp |= __GFP_COMP;
2798
2799 /* High-order pages or fallback path if "bulk" fails. */
2800 while (nr_allocated < nr_pages) {
2801 struct page *page;
2802 int i;
2803
2804 page = alloc_pages_node(nid, gfp, order);
2805 if (unlikely(!page))
2806 break;
2807
2808 /*
2809 * Careful, we allocate and map page-order pages, but
2810 * tracking is done per PAGE_SIZE page so as to keep the
2811 * vm_struct APIs independent of the physical/mapped size.
2812 */
2813 for (i = 0; i < (1U << order); i++)
2814 pages[nr_allocated + i] = page + i;
2815
2816 if (gfpflags_allow_blocking(gfp))
2817 cond_resched();
2818
2819 nr_allocated += 1U << order;
2820 }
2821
2822 return nr_allocated;
2823}
2824
e31d9eb5 2825static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
121e6f32
NP
2826 pgprot_t prot, unsigned int page_shift,
2827 int node)
1da177e4 2828{
930f036b 2829 const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
121e6f32
NP
2830 unsigned long addr = (unsigned long)area->addr;
2831 unsigned long size = get_vm_area_size(area);
34fe6537 2832 unsigned long array_size;
121e6f32
NP
2833 unsigned int nr_small_pages = size >> PAGE_SHIFT;
2834 unsigned int page_order;
1da177e4 2835
121e6f32 2836 array_size = (unsigned long)nr_small_pages * sizeof(struct page *);
f255935b
CH
2837 gfp_mask |= __GFP_NOWARN;
2838 if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
2839 gfp_mask |= __GFP_HIGHMEM;
1da177e4 2840
1da177e4 2841 /* Please note that the recursion is strictly bounded. */
8757d5fa 2842 if (array_size > PAGE_SIZE) {
5c1f4e69 2843 area->pages = __vmalloc_node(array_size, 1, nested_gfp, node,
f255935b 2844 area->caller);
286e1ea3 2845 } else {
5c1f4e69 2846 area->pages = kmalloc_node(array_size, nested_gfp, node);
286e1ea3 2847 }
7ea36242 2848
5c1f4e69 2849 if (!area->pages) {
d70bec8c 2850 warn_alloc(gfp_mask, NULL,
f4bdfeaf
URS
2851 "vmalloc error: size %lu, failed to allocated page array size %lu",
2852 nr_small_pages * PAGE_SIZE, array_size);
cd61413b 2853 free_vm_area(area);
1da177e4
LT
2854 return NULL;
2855 }
1da177e4 2856
121e6f32 2857 set_vm_area_page_order(area, page_shift - PAGE_SHIFT);
121e6f32 2858 page_order = vm_area_page_order(area);
bf53d6f8 2859
12b9f873
UR
2860 area->nr_pages = vm_area_alloc_pages(gfp_mask, node,
2861 page_order, nr_small_pages, area->pages);
5c1f4e69 2862
97105f0a 2863 atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
1da177e4 2864
5c1f4e69
URS
2865 /*
 2866 * If not enough pages were obtained to satisfy the
 2867 * allocation request, free whatever was allocated via __vfree().
2868 */
2869 if (area->nr_pages != nr_small_pages) {
2870 warn_alloc(gfp_mask, NULL,
f4bdfeaf 2871 "vmalloc error: size %lu, page order %u, failed to allocate pages",
5c1f4e69
URS
2872 area->nr_pages * PAGE_SIZE, page_order);
2873 goto fail;
2874 }
2875
12b9f873
UR
2876 if (vmap_pages_range(addr, addr + size, prot, area->pages,
2877 page_shift) < 0) {
d70bec8c 2878 warn_alloc(gfp_mask, NULL,
f4bdfeaf
URS
2879 "vmalloc error: size %lu, failed to map pages",
2880 area->nr_pages * PAGE_SIZE);
1da177e4 2881 goto fail;
d70bec8c 2882 }
ed1f324c 2883
1da177e4
LT
2884 return area->addr;
2885
2886fail:
c67dc624 2887 __vfree(area->addr);
1da177e4
LT
2888 return NULL;
2889}
2890
2891/**
92eac168
MR
2892 * __vmalloc_node_range - allocate virtually contiguous memory
2893 * @size: allocation size
2894 * @align: desired alignment
2895 * @start: vm area range start
2896 * @end: vm area range end
2897 * @gfp_mask: flags for the page level allocator
2898 * @prot: protection mask for the allocated pages
2899 * @vm_flags: additional vm area flags (e.g. %VM_NO_GUARD)
2900 * @node: node to use for allocation or NUMA_NO_NODE
2901 * @caller: caller's return address
2902 *
2903 * Allocate enough pages to cover @size from the page level
2904 * allocator with @gfp_mask flags. Map them into contiguous
2905 * kernel virtual space, using a pagetable protection of @prot.
a862f68a
MR
2906 *
2907 * Return: the address of the area or %NULL on failure
1da177e4 2908 */
d0a21265
DR
2909void *__vmalloc_node_range(unsigned long size, unsigned long align,
2910 unsigned long start, unsigned long end, gfp_t gfp_mask,
cb9e3c29
AR
2911 pgprot_t prot, unsigned long vm_flags, int node,
2912 const void *caller)
1da177e4
LT
2913{
2914 struct vm_struct *area;
89219d37
CM
2915 void *addr;
2916 unsigned long real_size = size;
121e6f32
NP
2917 unsigned long real_align = align;
2918 unsigned int shift = PAGE_SHIFT;
1da177e4 2919
d70bec8c
NP
2920 if (WARN_ON_ONCE(!size))
2921 return NULL;
2922
2923 if ((size >> PAGE_SHIFT) > totalram_pages()) {
2924 warn_alloc(gfp_mask, NULL,
f4bdfeaf
URS
2925 "vmalloc error: size %lu, exceeds total pages",
2926 real_size);
d70bec8c 2927 return NULL;
121e6f32
NP
2928 }
2929
3382bbee 2930 if (vmap_allow_huge && !(vm_flags & VM_NO_HUGE_VMAP)) {
121e6f32 2931 unsigned long size_per_node;
1da177e4 2932
121e6f32
NP
2933 /*
2934 * Try huge pages. Only try for PAGE_KERNEL allocations,
2935 * others like modules don't yet expect huge pages in
2936 * their allocations due to apply_to_page_range not
2937 * supporting them.
2938 */
2939
2940 size_per_node = size;
2941 if (node == NUMA_NO_NODE)
2942 size_per_node /= num_online_nodes();
3382bbee 2943 if (arch_vmap_pmd_supported(prot) && size_per_node >= PMD_SIZE)
121e6f32 2944 shift = PMD_SHIFT;
3382bbee
CL
2945 else
2946 shift = arch_vmap_pte_supported_shift(size_per_node);
2947
2948 align = max(real_align, 1UL << shift);
2949 size = ALIGN(real_size, 1UL << shift);
121e6f32
NP
2950 }
2951
2952again:
7ca3027b
DA
2953 area = __get_vm_area_node(real_size, align, shift, VM_ALLOC |
2954 VM_UNINITIALIZED | vm_flags, start, end, node,
2955 gfp_mask, caller);
d70bec8c
NP
2956 if (!area) {
2957 warn_alloc(gfp_mask, NULL,
f4bdfeaf
URS
2958 "vmalloc error: size %lu, vm_struct allocation failed",
2959 real_size);
de7d2b56 2960 goto fail;
d70bec8c 2961 }
1da177e4 2962
121e6f32 2963 addr = __vmalloc_area_node(area, gfp_mask, prot, shift, node);
1368edf0 2964 if (!addr)
121e6f32 2965 goto fail;
89219d37 2966
f5252e00 2967 /*
20fc02b4
ZY
2968 * In this function, newly allocated vm_struct has VM_UNINITIALIZED
2969 * flag. It means that vm_struct is not fully initialized.
4341fa45 2970 * Now, it is fully initialized, so remove this flag here.
f5252e00 2971 */
20fc02b4 2972 clear_vm_uninitialized_flag(area);
f5252e00 2973
7ca3027b 2974 size = PAGE_ALIGN(size);
94f4a161 2975 kmemleak_vmalloc(area, size, gfp_mask);
89219d37
CM
2976
2977 return addr;
de7d2b56
JP
2978
2979fail:
121e6f32
NP
2980 if (shift > PAGE_SHIFT) {
2981 shift = PAGE_SHIFT;
2982 align = real_align;
2983 size = real_size;
2984 goto again;
2985 }
2986
de7d2b56 2987 return NULL;
1da177e4
LT
2988}
2989
d0a21265 2990/**
92eac168
MR
2991 * __vmalloc_node - allocate virtually contiguous memory
2992 * @size: allocation size
2993 * @align: desired alignment
2994 * @gfp_mask: flags for the page level allocator
92eac168
MR
2995 * @node: node to use for allocation or NUMA_NO_NODE
2996 * @caller: caller's return address
a7c3e901 2997 *
f38fcb9c
CH
2998 * Allocate enough pages to cover @size from the page level allocator with
2999 * @gfp_mask flags. Map them into contiguous kernel virtual space.
a7c3e901 3000 *
92eac168
MR
3001 * Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
3002 * and __GFP_NOFAIL are not supported
a7c3e901 3003 *
92eac168
MR
3004 * Any use of gfp flags outside of GFP_KERNEL should be consulted
3005 * with mm people.
a862f68a
MR
3006 *
3007 * Return: pointer to the allocated memory or %NULL on error
d0a21265 3008 */
2b905948 3009void *__vmalloc_node(unsigned long size, unsigned long align,
f38fcb9c 3010 gfp_t gfp_mask, int node, const void *caller)
d0a21265
DR
3011{
3012 return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
f38fcb9c 3013 gfp_mask, PAGE_KERNEL, 0, node, caller);
d0a21265 3014}
c3f896dc
CH
3015/*
 3016 * This is only for performance analysis of vmalloc and for stress testing.
 3017 * It is required by the vmalloc test module; do not use it for anything
 3018 * else.
3019 */
3020#ifdef CONFIG_TEST_VMALLOC_MODULE
3021EXPORT_SYMBOL_GPL(__vmalloc_node);
3022#endif
d0a21265 3023
88dca4ca 3024void *__vmalloc(unsigned long size, gfp_t gfp_mask)
930fc45a 3025{
f38fcb9c 3026 return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
23016969 3027 __builtin_return_address(0));
930fc45a 3028}
1da177e4
LT
3029EXPORT_SYMBOL(__vmalloc);
3030
3031/**
92eac168
MR
3032 * vmalloc - allocate virtually contiguous memory
3033 * @size: allocation size
3034 *
3035 * Allocate enough pages to cover @size from the page level
3036 * allocator and map them into contiguous kernel virtual space.
1da177e4 3037 *
92eac168
MR
3038 * For tight control over page level allocator and protection flags
3039 * use __vmalloc() instead.
a862f68a
MR
3040 *
3041 * Return: pointer to the allocated memory or %NULL on error
1da177e4
LT
3042 */
3043void *vmalloc(unsigned long size)
3044{
4d39d728
CH
3045 return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
3046 __builtin_return_address(0));
1da177e4 3047}
1da177e4
LT
3048EXPORT_SYMBOL(vmalloc);
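/*
 * Usage sketch (illustrative only): a large table that only needs to be
 * virtually contiguous, so vmalloc() is preferred over kmalloc(). The table
 * type and size are hypothetical; vmalloc()/vfree() are documented above.
 */
static int example_build_big_table(unsigned long nr_entries)
{
	u64 *table;

	table = vmalloc(array_size(nr_entries, sizeof(*table)));
	if (!table)
		return -ENOMEM;

	/* ... populate and use the table from process context ... */

	vfree(table);	/* may sleep; must not be called from NMI context */
	return 0;
}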
3049
15a64f5a
CI
3050/**
3051 * vmalloc_no_huge - allocate virtually contiguous memory using small pages
3052 * @size: allocation size
3053 *
3054 * Allocate enough non-huge pages to cover @size from the page level
3055 * allocator and map them into contiguous kernel virtual space.
3056 *
3057 * Return: pointer to the allocated memory or %NULL on error
3058 */
3059void *vmalloc_no_huge(unsigned long size)
3060{
3061 return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
3062 GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP,
3063 NUMA_NO_NODE, __builtin_return_address(0));
3064}
3065EXPORT_SYMBOL(vmalloc_no_huge);
3066
e1ca7788 3067/**
92eac168
MR
3068 * vzalloc - allocate virtually contiguous memory with zero fill
3069 * @size: allocation size
3070 *
3071 * Allocate enough pages to cover @size from the page level
3072 * allocator and map them into contiguous kernel virtual space.
3073 * The memory allocated is set to zero.
3074 *
3075 * For tight control over page level allocator and protection flags
3076 * use __vmalloc() instead.
a862f68a
MR
3077 *
3078 * Return: pointer to the allocated memory or %NULL on error
e1ca7788
DY
3079 */
3080void *vzalloc(unsigned long size)
3081{
4d39d728
CH
3082 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
3083 __builtin_return_address(0));
e1ca7788
DY
3084}
3085EXPORT_SYMBOL(vzalloc);
3086
83342314 3087/**
ead04089
REB
3088 * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
3089 * @size: allocation size
83342314 3090 *
ead04089
REB
3091 * The resulting memory area is zeroed so it can be mapped to userspace
3092 * without leaking data.
a862f68a
MR
3093 *
3094 * Return: pointer to the allocated memory or %NULL on error
83342314
NP
3095 */
3096void *vmalloc_user(unsigned long size)
3097{
bc84c535
RP
3098 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
3099 GFP_KERNEL | __GFP_ZERO, PAGE_KERNEL,
3100 VM_USERMAP, NUMA_NO_NODE,
3101 __builtin_return_address(0));
83342314
NP
3102}
3103EXPORT_SYMBOL(vmalloc_user);
3104
930fc45a 3105/**
92eac168
MR
3106 * vmalloc_node - allocate memory on a specific node
3107 * @size: allocation size
3108 * @node: numa node
930fc45a 3109 *
92eac168
MR
3110 * Allocate enough pages to cover @size from the page level
3111 * allocator and map them into contiguous kernel virtual space.
930fc45a 3112 *
92eac168
MR
3113 * For tight control over page level allocator and protection flags
3114 * use __vmalloc() instead.
a862f68a
MR
3115 *
3116 * Return: pointer to the allocated memory or %NULL on error
930fc45a
CL
3117 */
3118void *vmalloc_node(unsigned long size, int node)
3119{
f38fcb9c
CH
3120 return __vmalloc_node(size, 1, GFP_KERNEL, node,
3121 __builtin_return_address(0));
930fc45a
CL
3122}
3123EXPORT_SYMBOL(vmalloc_node);
3124
e1ca7788
DY
3125/**
3126 * vzalloc_node - allocate memory on a specific node with zero fill
3127 * @size: allocation size
3128 * @node: numa node
3129 *
3130 * Allocate enough pages to cover @size from the page level
3131 * allocator and map them into contiguous kernel virtual space.
3132 * The memory allocated is set to zero.
3133 *
a862f68a 3134 * Return: pointer to the allocated memory or %NULL on error
e1ca7788
DY
3135 */
3136void *vzalloc_node(unsigned long size, int node)
3137{
4d39d728
CH
3138 return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
3139 __builtin_return_address(0));
e1ca7788
DY
3140}
3141EXPORT_SYMBOL(vzalloc_node);
3142
0d08e0d3 3143#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
698d0831 3144#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
0d08e0d3 3145#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
698d0831 3146#define GFP_VMALLOC32 (GFP_DMA | GFP_KERNEL)
0d08e0d3 3147#else
698d0831
MH
3148/*
3149 * 64b systems should always have either DMA or DMA32 zones. For others
3150 * GFP_DMA32 should do the right thing and use the normal zone.
3151 */
68d68ff6 3152#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
0d08e0d3
AK
3153#endif
3154
1da177e4 3155/**
92eac168
MR
3156 * vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
3157 * @size: allocation size
1da177e4 3158 *
92eac168
MR
3159 * Allocate enough 32bit PA addressable pages to cover @size from the
3160 * page level allocator and map them into contiguous kernel virtual space.
a862f68a
MR
3161 *
3162 * Return: pointer to the allocated memory or %NULL on error
1da177e4
LT
3163 */
3164void *vmalloc_32(unsigned long size)
3165{
f38fcb9c
CH
3166 return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
3167 __builtin_return_address(0));
1da177e4 3168}
1da177e4
LT
3169EXPORT_SYMBOL(vmalloc_32);
3170
83342314 3171/**
ead04089 3172 * vmalloc_32_user - allocate zeroed virtually contiguous 32bit memory
92eac168 3173 * @size: allocation size
ead04089
REB
3174 *
3175 * The resulting memory area is 32bit addressable and zeroed so it can be
3176 * mapped to userspace without leaking data.
a862f68a
MR
3177 *
3178 * Return: pointer to the allocated memory or %NULL on error
83342314
NP
3179 */
3180void *vmalloc_32_user(unsigned long size)
3181{
bc84c535
RP
3182 return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
3183 GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL,
3184 VM_USERMAP, NUMA_NO_NODE,
3185 __builtin_return_address(0));
83342314
NP
3186}
3187EXPORT_SYMBOL(vmalloc_32_user);
3188
d0107eb0
KH
3189/*
 3190 * Small helper routine: copy contents from addr to buf.
 3191 * If a page is not present, fill with zeroes.
3192 */
3193
3194static int aligned_vread(char *buf, char *addr, unsigned long count)
3195{
3196 struct page *p;
3197 int copied = 0;
3198
3199 while (count) {
3200 unsigned long offset, length;
3201
891c49ab 3202 offset = offset_in_page(addr);
d0107eb0
KH
3203 length = PAGE_SIZE - offset;
3204 if (length > count)
3205 length = count;
3206 p = vmalloc_to_page(addr);
3207 /*
 3208 * To do safe access to this _mapped_ area, we need a
 3209 * lock. But adding a lock here would add vmalloc()/vfree()
f0953a1b 3210 * overhead to this rarely used _debug_ interface.
d0107eb0
KH
 3211 * Instead of that, we use kmap() and accept a small
 3212 * overhead in this access function.
3213 */
3214 if (p) {
f7c8ce44 3215 /* We can expect USER0 is not used -- see vread() */
9b04c5fe 3216 void *map = kmap_atomic(p);
d0107eb0 3217 memcpy(buf, map + offset, length);
9b04c5fe 3218 kunmap_atomic(map);
d0107eb0
KH
3219 } else
3220 memset(buf, 0, length);
3221
3222 addr += length;
3223 buf += length;
3224 copied += length;
3225 count -= length;
3226 }
3227 return copied;
3228}
3229
d0107eb0 3230/**
92eac168
MR
3231 * vread() - read vmalloc area in a safe way.
3232 * @buf: buffer for reading data
3233 * @addr: vm address.
3234 * @count: number of bytes to be read.
3235 *
92eac168
MR
 3236 * This function checks that addr is a valid vmalloc'ed area, and
 3237 * copies data from that area to the given buffer. If the given memory range
 3238 * of [addr...addr+count) includes some valid address, data is copied to
 3239 * the proper area of @buf. If there are memory holes, they'll be zero-filled.
 3240 * An IOREMAP area is treated as a memory hole and no copy is done.
 3241 *
 3242 * If [addr...addr+count) doesn't include any intersection with a live
 3243 * vm_struct area, 0 is returned. @buf should be a kernel buffer.
 3244 *
 3245 * Note: In usual operations, vread() is never necessary because the caller
 3246 * should know that the vmalloc() area is valid and can use memcpy().
 3247 * This is for routines which have to access the vmalloc area without
bbcd53c9 3248 * any other information, such as /proc/kcore.
a862f68a
MR
3249 *
3250 * Return: number of bytes for which addr and buf should be increased
3251 * (same number as @count) or %0 if [addr...addr+count) doesn't
 3252 * include any intersection with a valid vmalloc area
d0107eb0 3253 */
1da177e4
LT
3254long vread(char *buf, char *addr, unsigned long count)
3255{
e81ce85f
JK
3256 struct vmap_area *va;
3257 struct vm_struct *vm;
1da177e4 3258 char *vaddr, *buf_start = buf;
d0107eb0 3259 unsigned long buflen = count;
1da177e4
LT
3260 unsigned long n;
3261
3262 /* Don't allow overflow */
3263 if ((unsigned long) addr + count < count)
3264 count = -(unsigned long) addr;
3265
e81ce85f 3266 spin_lock(&vmap_area_lock);
f608788c
SD
3267 va = __find_vmap_area((unsigned long)addr);
3268 if (!va)
3269 goto finished;
3270 list_for_each_entry_from(va, &vmap_area_list, list) {
e81ce85f
JK
3271 if (!count)
3272 break;
3273
688fcbfc 3274 if (!va->vm)
e81ce85f
JK
3275 continue;
3276
3277 vm = va->vm;
3278 vaddr = (char *) vm->addr;
762216ab 3279 if (addr >= vaddr + get_vm_area_size(vm))
1da177e4
LT
3280 continue;
3281 while (addr < vaddr) {
3282 if (count == 0)
3283 goto finished;
3284 *buf = '\0';
3285 buf++;
3286 addr++;
3287 count--;
3288 }
762216ab 3289 n = vaddr + get_vm_area_size(vm) - addr;
d0107eb0
KH
3290 if (n > count)
3291 n = count;
e81ce85f 3292 if (!(vm->flags & VM_IOREMAP))
d0107eb0
KH
3293 aligned_vread(buf, addr, n);
3294 else /* IOREMAP area is treated as memory hole */
3295 memset(buf, 0, n);
3296 buf += n;
3297 addr += n;
3298 count -= n;
1da177e4
LT
3299 }
3300finished:
e81ce85f 3301 spin_unlock(&vmap_area_lock);
d0107eb0
KH
3302
3303 if (buf == buf_start)
3304 return 0;
3305 /* zero-fill memory holes */
3306 if (buf != buf_start + buflen)
3307 memset(buf, 0, buflen - (buf - buf_start));
3308
3309 return buflen;
1da177e4
LT
3310}
3311
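/*
 * Usage sketch (illustrative only): copy a possibly sparse vmalloc range into
 * a kernel buffer, the way a kcore-style reader would. The caller and buffer
 * are hypothetical; vread() is the helper implemented above.
 */
static long example_dump_vmalloc_range(char *kbuf, char *vaddr, unsigned long len)
{
	/* Holes and IOREMAP areas come back zero-filled; 0 means no overlap. */
	return vread(kbuf, vaddr, len);
}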
83342314 3312/**
92eac168
MR
3313 * remap_vmalloc_range_partial - map vmalloc pages to userspace
3314 * @vma: vma to cover
3315 * @uaddr: target user address to start at
3316 * @kaddr: virtual address of vmalloc kernel memory
bdebd6a2 3317 * @pgoff: offset from @kaddr to start at
92eac168 3318 * @size: size of map area
7682486b 3319 *
92eac168 3320 * Returns: 0 for success, -Exxx on failure
83342314 3321 *
92eac168
MR
3322 * This function checks that @kaddr is a valid vmalloc'ed area,
3323 * and that it is big enough to cover the range starting at
 3324 * @uaddr in @vma. Will return failure if those criteria aren't
 3325 * met.
83342314 3326 *
92eac168 3327 * Similar to remap_pfn_range() (see mm/memory.c)
83342314 3328 */
e69e9d4a 3329int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr,
bdebd6a2
JH
3330 void *kaddr, unsigned long pgoff,
3331 unsigned long size)
83342314
NP
3332{
3333 struct vm_struct *area;
bdebd6a2
JH
3334 unsigned long off;
3335 unsigned long end_index;
3336
3337 if (check_shl_overflow(pgoff, PAGE_SHIFT, &off))
3338 return -EINVAL;
83342314 3339
e69e9d4a
HD
3340 size = PAGE_ALIGN(size);
3341
3342 if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr))
83342314
NP
3343 return -EINVAL;
3344
e69e9d4a 3345 area = find_vm_area(kaddr);
83342314 3346 if (!area)
db64fe02 3347 return -EINVAL;
83342314 3348
fe9041c2 3349 if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT)))
db64fe02 3350 return -EINVAL;
83342314 3351
bdebd6a2
JH
3352 if (check_add_overflow(size, off, &end_index) ||
3353 end_index > get_vm_area_size(area))
db64fe02 3354 return -EINVAL;
bdebd6a2 3355 kaddr += off;
83342314 3356
83342314 3357 do {
e69e9d4a 3358 struct page *page = vmalloc_to_page(kaddr);
db64fe02
NP
3359 int ret;
3360
83342314
NP
3361 ret = vm_insert_page(vma, uaddr, page);
3362 if (ret)
3363 return ret;
3364
3365 uaddr += PAGE_SIZE;
e69e9d4a
HD
3366 kaddr += PAGE_SIZE;
3367 size -= PAGE_SIZE;
3368 } while (size > 0);
83342314 3369
314e51b9 3370 vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
83342314 3371
db64fe02 3372 return 0;
83342314 3373}
e69e9d4a
HD
3374
3375/**
92eac168
MR
3376 * remap_vmalloc_range - map vmalloc pages to userspace
3377 * @vma: vma to cover (map full range of vma)
3378 * @addr: vmalloc memory
3379 * @pgoff: number of pages into addr before first page to map
e69e9d4a 3380 *
92eac168 3381 * Returns: 0 for success, -Exxx on failure
e69e9d4a 3382 *
92eac168
MR
3383 * This function checks that addr is a valid vmalloc'ed area, and
3384 * that it is big enough to cover the vma. Will return failure if
 3385 * those criteria aren't met.
e69e9d4a 3386 *
92eac168 3387 * Similar to remap_pfn_range() (see mm/memory.c)
e69e9d4a
HD
3388 */
3389int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
3390 unsigned long pgoff)
3391{
3392 return remap_vmalloc_range_partial(vma, vma->vm_start,
bdebd6a2 3393 addr, pgoff,
e69e9d4a
HD
3394 vma->vm_end - vma->vm_start);
3395}
83342314
NP
3396EXPORT_SYMBOL(remap_vmalloc_range);
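/*
 * Usage sketch (illustrative only): an mmap() handler exporting a buffer that
 * was allocated with vmalloc_user() (which sets VM_USERMAP). The helper name
 * and the origin of @vbuf are hypothetical; remap_vmalloc_range() is the API
 * documented above.
 */
static int example_mmap_vmalloc_buf(struct vm_area_struct *vma, void *vbuf)
{
	/* Fails with -EINVAL if @vbuf is too small to cover the whole vma. */
	return remap_vmalloc_range(vma, vbuf, 0);
}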
3397
5f4352fb
JF
3398void free_vm_area(struct vm_struct *area)
3399{
3400 struct vm_struct *ret;
3401 ret = remove_vm_area(area->addr);
3402 BUG_ON(ret != area);
3403 kfree(area);
3404}
3405EXPORT_SYMBOL_GPL(free_vm_area);
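/*
 * Usage sketch (illustrative only): reserve a chunk of kernel virtual address
 * space without backing pages, let the caller install its own mappings, then
 * release it again. The helper name is hypothetical; get_vm_area() and
 * free_vm_area() are the APIs used here.
 */
static void example_reserve_and_release_kva(void)
{
	struct vm_struct *area;

	area = get_vm_area(PAGE_SIZE, VM_IOREMAP);
	if (!area)
		return;

	/* ... map something into [area->addr, area->addr + area->size) ... */

	free_vm_area(area);	/* unmaps the area and frees the descriptor */
}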
a10aa579 3406
4f8b02b4 3407#ifdef CONFIG_SMP
ca23e405
TH
3408static struct vmap_area *node_to_va(struct rb_node *n)
3409{
4583e773 3410 return rb_entry_safe(n, struct vmap_area, rb_node);
ca23e405
TH
3411}
3412
3413/**
68ad4a33
URS
3414 * pvm_find_va_enclose_addr - find the vmap_area @addr belongs to
3415 * @addr: target address
ca23e405 3416 *
68ad4a33
URS
 3417 * Returns: the vmap_area if it is found. If there is no such area,
 3418 * the highest vmap_area below @addr (in reverse order) is returned,
 3419 * i.e. one with va->va_start < addr && va->va_end < addr, or NULL
 3420 * if there are no areas before @addr.
ca23e405 3421 */
68ad4a33
URS
3422static struct vmap_area *
3423pvm_find_va_enclose_addr(unsigned long addr)
ca23e405 3424{
68ad4a33
URS
3425 struct vmap_area *va, *tmp;
3426 struct rb_node *n;
3427
3428 n = free_vmap_area_root.rb_node;
3429 va = NULL;
ca23e405
TH
3430
3431 while (n) {
68ad4a33
URS
3432 tmp = rb_entry(n, struct vmap_area, rb_node);
3433 if (tmp->va_start <= addr) {
3434 va = tmp;
3435 if (tmp->va_end >= addr)
3436 break;
3437
ca23e405 3438 n = n->rb_right;
68ad4a33
URS
3439 } else {
3440 n = n->rb_left;
3441 }
ca23e405
TH
3442 }
3443
68ad4a33 3444 return va;
ca23e405
TH
3445}
3446
3447/**
68ad4a33
URS
3448 * pvm_determine_end_from_reverse - find the highest aligned address
3449 * of free block below VMALLOC_END
3450 * @va:
 3451 * in - the VA we start the search from (in reverse order);
3452 * out - the VA with the highest aligned end address.
799fa85d 3453 * @align: alignment for required highest address
ca23e405 3454 *
68ad4a33 3455 * Returns: determined end address within vmap_area
ca23e405 3456 */
68ad4a33
URS
3457static unsigned long
3458pvm_determine_end_from_reverse(struct vmap_area **va, unsigned long align)
ca23e405 3459{
68ad4a33 3460 unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
ca23e405
TH
3461 unsigned long addr;
3462
68ad4a33
URS
3463 if (likely(*va)) {
3464 list_for_each_entry_from_reverse((*va),
3465 &free_vmap_area_list, list) {
3466 addr = min((*va)->va_end & ~(align - 1), vmalloc_end);
3467 if ((*va)->va_start < addr)
3468 return addr;
3469 }
ca23e405
TH
3470 }
3471
68ad4a33 3472 return 0;
ca23e405
TH
3473}
3474
3475/**
3476 * pcpu_get_vm_areas - allocate vmalloc areas for percpu allocator
3477 * @offsets: array containing offset of each area
3478 * @sizes: array containing size of each area
3479 * @nr_vms: the number of areas to allocate
3480 * @align: alignment, all entries in @offsets and @sizes must be aligned to this
ca23e405
TH
3481 *
3482 * Returns: kmalloc'd vm_struct pointer array pointing to allocated
3483 * vm_structs on success, %NULL on failure
3484 *
3485 * Percpu allocator wants to use congruent vm areas so that it can
3486 * maintain the offsets among percpu areas. This function allocates
ec3f64fc
DR
3487 * congruent vmalloc areas for it with GFP_KERNEL. These areas tend to
 3488 * be scattered pretty far apart, with the distance between two areas easily
 3489 * reaching gigabytes. To avoid interacting with regular vmallocs, these
 3490 * areas are allocated from the top.
ca23e405 3491 *
68ad4a33
URS
3492 * Despite its complicated look, this allocator is rather simple. It
3493 * does everything top-down and scans free blocks from the end looking
3494 * for matching base. While scanning, if any of the areas do not fit the
3495 * base address is pulled down to fit the area. Scanning is repeated till
3496 * all the areas fit and then all necessary data structures are inserted
3497 * and the result is returned.
ca23e405
TH
3498 */
struct vm_struct **pcpu_get_vm_areas(const unsigned long *offsets,
				     const size_t *sizes, int nr_vms,
				     size_t align)
{
	const unsigned long vmalloc_start = ALIGN(VMALLOC_START, align);
	const unsigned long vmalloc_end = VMALLOC_END & ~(align - 1);
	struct vmap_area **vas, *va;
	struct vm_struct **vms;
	int area, area2, last_area, term_area;
	unsigned long base, start, size, end, last_end, orig_start, orig_end;
	bool purged = false;
	enum fit_type type;

	/* verify parameters and allocate data structures */
	BUG_ON(offset_in_page(align) || !is_power_of_2(align));
	for (last_area = 0, area = 0; area < nr_vms; area++) {
		start = offsets[area];
		end = start + sizes[area];

		/* is everything aligned properly? */
		BUG_ON(!IS_ALIGNED(offsets[area], align));
		BUG_ON(!IS_ALIGNED(sizes[area], align));

		/* detect the area with the highest address */
		if (start > offsets[last_area])
			last_area = area;

		for (area2 = area + 1; area2 < nr_vms; area2++) {
			unsigned long start2 = offsets[area2];
			unsigned long end2 = start2 + sizes[area2];

			BUG_ON(start2 < end && start < end2);
		}
	}
	last_end = offsets[last_area] + sizes[last_area];

	if (vmalloc_end - vmalloc_start < last_end) {
		WARN_ON(true);
		return NULL;
	}

	vms = kcalloc(nr_vms, sizeof(vms[0]), GFP_KERNEL);
	vas = kcalloc(nr_vms, sizeof(vas[0]), GFP_KERNEL);
	if (!vas || !vms)
		goto err_free2;

	for (area = 0; area < nr_vms; area++) {
		vas[area] = kmem_cache_zalloc(vmap_area_cachep, GFP_KERNEL);
		vms[area] = kzalloc(sizeof(struct vm_struct), GFP_KERNEL);
		if (!vas[area] || !vms[area])
			goto err_free;
	}
retry:
	spin_lock(&free_vmap_area_lock);

	/* start scanning - we scan from the top, begin with the last area */
	area = term_area = last_area;
	start = offsets[area];
	end = start + sizes[area];

	va = pvm_find_va_enclose_addr(vmalloc_end);
	base = pvm_determine_end_from_reverse(&va, align) - end;

	while (true) {
		/*
		 * base might have underflowed, add last_end before
		 * comparing.
		 */
		if (base + last_end < vmalloc_start + last_end)
			goto overflow;

		/*
		 * Fitting base has not been found.
		 */
		if (va == NULL)
			goto overflow;

		/*
		 * If required width exceeds current VA block, move
		 * base downwards and then recheck.
		 */
		if (base + end > va->va_end) {
			base = pvm_determine_end_from_reverse(&va, align) - end;
			term_area = area;
			continue;
		}

		/*
		 * If this VA does not fit, move base downwards and recheck.
		 */
		if (base + start < va->va_start) {
			va = node_to_va(rb_prev(&va->rb_node));
			base = pvm_determine_end_from_reverse(&va, align) - end;
			term_area = area;
			continue;
		}

		/*
		 * This area fits, move on to the previous one.  If
		 * the previous one is the terminal one, we're done.
		 */
		area = (area + nr_vms - 1) % nr_vms;
		if (area == term_area)
			break;

		start = offsets[area];
		end = start + sizes[area];
		va = pvm_find_va_enclose_addr(base + end);
	}

	/* we've found a fitting base, insert all va's */
	for (area = 0; area < nr_vms; area++) {
		int ret;

		start = base + offsets[area];
		size = sizes[area];

		va = pvm_find_va_enclose_addr(start);
		if (WARN_ON_ONCE(va == NULL))
			/* It is a BUG(), but trigger recovery instead. */
			goto recovery;

		type = classify_va_fit_type(va, start, size);
		if (WARN_ON_ONCE(type == NOTHING_FIT))
			/* It is a BUG(), but trigger recovery instead. */
			goto recovery;

		ret = adjust_va_to_fit_type(va, start, size, type);
		if (unlikely(ret))
			goto recovery;

		/* Allocated area. */
		va = vas[area];
		va->va_start = start;
		va->va_end = start + size;
	}

	spin_unlock(&free_vmap_area_lock);

	/* populate the kasan shadow space */
	for (area = 0; area < nr_vms; area++) {
		if (kasan_populate_vmalloc(vas[area]->va_start, sizes[area]))
			goto err_free_shadow;

		kasan_unpoison_vmalloc((void *)vas[area]->va_start,
				       sizes[area]);
	}

	/* insert all vm's */
	spin_lock(&vmap_area_lock);
	for (area = 0; area < nr_vms; area++) {
		insert_vmap_area(vas[area], &vmap_area_root, &vmap_area_list);

		setup_vmalloc_vm_locked(vms[area], vas[area], VM_ALLOC,
					pcpu_get_vm_areas);
	}
	spin_unlock(&vmap_area_lock);

	kfree(vas);
	return vms;

recovery:
	/*
	 * Remove previously allocated areas.  There is no need to remove
	 * them from the busy tree, because they are inserted there only
	 * on the final step and only when pcpu_get_vm_areas() succeeds.
	 */
	while (area--) {
		orig_start = vas[area]->va_start;
		orig_end = vas[area]->va_end;
		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
						    &free_vmap_area_list);
		if (va)
			kasan_release_vmalloc(orig_start, orig_end,
					      va->va_start, va->va_end);
		vas[area] = NULL;
	}

overflow:
	spin_unlock(&free_vmap_area_lock);
	if (!purged) {
		purge_vmap_area_lazy();
		purged = true;

		/* Before "retry", check if we recover. */
		for (area = 0; area < nr_vms; area++) {
			if (vas[area])
				continue;

			vas[area] = kmem_cache_zalloc(
				vmap_area_cachep, GFP_KERNEL);
			if (!vas[area])
				goto err_free;
		}

		goto retry;
	}

err_free:
	for (area = 0; area < nr_vms; area++) {
		if (vas[area])
			kmem_cache_free(vmap_area_cachep, vas[area]);

		kfree(vms[area]);
	}
err_free2:
	kfree(vas);
	kfree(vms);
	return NULL;

err_free_shadow:
	spin_lock(&free_vmap_area_lock);
	/*
	 * We release all the vmalloc shadows, even the ones for regions
	 * that hadn't been successfully added.  This relies on
	 * kasan_release_vmalloc being able to tolerate this case.
	 */
	for (area = 0; area < nr_vms; area++) {
		orig_start = vas[area]->va_start;
		orig_end = vas[area]->va_end;
		va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
						    &free_vmap_area_list);
		if (va)
			kasan_release_vmalloc(orig_start, orig_end,
					      va->va_start, va->va_end);
		vas[area] = NULL;
		kfree(vms[area]);
	}
	spin_unlock(&free_vmap_area_lock);
	kfree(vas);
	kfree(vms);
	return NULL;
}

/**
 * pcpu_free_vm_areas - free vmalloc areas for percpu allocator
 * @vms: vm_struct pointer array returned by pcpu_get_vm_areas()
 * @nr_vms: the number of allocated areas
 *
 * Free vm_structs and the array allocated by pcpu_get_vm_areas().
 */
void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
{
	int i;

	for (i = 0; i < nr_vms; i++)
		free_vm_area(vms[i]);
	kfree(vms);
}
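/*
 * Illustrative only, not part of vmalloc.c: a minimal sketch of the
 * pcpu_get_vm_areas()/pcpu_free_vm_areas() pairing, in the spirit of how
 * the percpu allocator uses it. The two-group layout below is
 * hypothetical, and the block is kept out of the build with #if 0.
 */
#if 0
static int example_percpu_vm_areas(void)
{
	/* two congruent areas, 2MB apart, each one page, page aligned */
	const unsigned long offsets[] = { 0, 2UL << 20 };
	const size_t sizes[] = { PAGE_SIZE, PAGE_SIZE };
	struct vm_struct **vms;

	vms = pcpu_get_vm_areas(offsets, sizes, ARRAY_SIZE(sizes), PAGE_SIZE);
	if (!vms)
		return -ENOMEM;

	/* ... map percpu pages into vms[0]->addr and vms[1]->addr ... */

	pcpu_free_vm_areas(vms, ARRAY_SIZE(sizes));	/* also frees vms[] */
	return 0;
}
#endif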
#endif	/* CONFIG_SMP */

#ifdef CONFIG_PRINTK
bool vmalloc_dump_obj(void *object)
{
	struct vm_struct *vm;
	void *objp = (void *)PAGE_ALIGN((unsigned long)object);

	vm = find_vm_area(objp);
	if (!vm)
		return false;
	pr_cont(" %u-page vmalloc region starting at %#lx allocated at %pS\n",
		vm->nr_pages, (unsigned long)vm->addr, vm->caller);
	return true;
}
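/*
 * Illustrative only, not part of vmalloc.c: a minimal sketch of how a
 * caller (in-tree, mem_dump_obj()) might use vmalloc_dump_obj() to print
 * provenance information for a suspect pointer. The helper name and
 * messages below are hypothetical; the block is kept out of the build
 * with #if 0.
 */
#if 0
static void example_dump(const void *obj)
{
	if (is_vmalloc_addr(obj)) {
		pr_cont("vmalloc allocation:");
		if (!vmalloc_dump_obj((void *)obj))
			pr_cont(" no vm area found (freed or unmapped)\n");
	}
}
#endif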
#endif

#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
	__acquires(&vmap_purge_lock)
	__acquires(&vmap_area_lock)
{
	mutex_lock(&vmap_purge_lock);
	spin_lock(&vmap_area_lock);

	return seq_list_start(&vmap_area_list, *pos);
}

static void *s_next(struct seq_file *m, void *p, loff_t *pos)
{
	return seq_list_next(p, &vmap_area_list, pos);
}

static void s_stop(struct seq_file *m, void *p)
	__releases(&vmap_area_lock)
	__releases(&vmap_purge_lock)
{
	spin_unlock(&vmap_area_lock);
	mutex_unlock(&vmap_purge_lock);
}

static void show_numa_info(struct seq_file *m, struct vm_struct *v)
{
	if (IS_ENABLED(CONFIG_NUMA)) {
		unsigned int nr, *counters = m->private;

		if (!counters)
			return;

		if (v->flags & VM_UNINITIALIZED)
			return;
		/* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
		smp_rmb();

		memset(counters, 0, nr_node_ids * sizeof(unsigned int));

		for (nr = 0; nr < v->nr_pages; nr++)
			counters[page_to_nid(v->pages[nr])]++;

		for_each_node_state(nr, N_HIGH_MEMORY)
			if (counters[nr])
				seq_printf(m, " N%u=%u", nr, counters[nr]);
	}
}

static void show_purge_info(struct seq_file *m)
{
	struct vmap_area *va;

	spin_lock(&purge_vmap_area_lock);
	list_for_each_entry(va, &purge_vmap_area_list, list) {
		seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
			(void *)va->va_start, (void *)va->va_end,
			va->va_end - va->va_start);
	}
	spin_unlock(&purge_vmap_area_lock);
}

static int s_show(struct seq_file *m, void *p)
{
	struct vmap_area *va;
	struct vm_struct *v;

	va = list_entry(p, struct vmap_area, list);

	/*
	 * s_show can race with remove_vm_area(): a NULL va->vm means the
	 * vmap area is being torn down, or belongs to a vm_map_ram
	 * allocation that has no vm_struct.
	 */
	if (!va->vm) {
		seq_printf(m, "0x%pK-0x%pK %7ld vm_map_ram\n",
			(void *)va->va_start, (void *)va->va_end,
			va->va_end - va->va_start);

		return 0;
	}

	v = va->vm;

	seq_printf(m, "0x%pK-0x%pK %7ld",
		v->addr, v->addr + v->size, v->size);

	if (v->caller)
		seq_printf(m, " %pS", v->caller);

	if (v->nr_pages)
		seq_printf(m, " pages=%d", v->nr_pages);

	if (v->phys_addr)
		seq_printf(m, " phys=%pa", &v->phys_addr);

	if (v->flags & VM_IOREMAP)
		seq_puts(m, " ioremap");

	if (v->flags & VM_ALLOC)
		seq_puts(m, " vmalloc");

	if (v->flags & VM_MAP)
		seq_puts(m, " vmap");

	if (v->flags & VM_USERMAP)
		seq_puts(m, " user");

	if (v->flags & VM_DMA_COHERENT)
		seq_puts(m, " dma-coherent");

	if (is_vmalloc_addr(v->pages))
		seq_puts(m, " vpages");

	show_numa_info(m, v);
	seq_putc(m, '\n');

	/*
	 * As a final step, dump "unpurged" areas.
	 */
	if (list_is_last(&va->list, &vmap_area_list))
		show_purge_info(m);

	return 0;
}
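/*
 * Illustrative only, not part of vmalloc.c: given the format strings
 * above, /proc/vmallocinfo lines look roughly like the sketch below.
 * The addresses, sizes, and caller symbol are made up, and %pK may hash
 * or zero the addresses depending on kptr_restrict:
 *
 *   0xffffc90000000000-0xffffc90000005000   20480 some_caller+0x10/0x80 pages=4 vmalloc N0=4
 *   0xffffc90000007000-0xffffc90000009000    8192 vm_map_ram
 */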

static const struct seq_operations vmalloc_op = {
	.start = s_start,
	.next = s_next,
	.stop = s_stop,
	.show = s_show,
};

static int __init proc_vmalloc_init(void)
{
	if (IS_ENABLED(CONFIG_NUMA))
		proc_create_seq_private("vmallocinfo", 0400, NULL,
				&vmalloc_op,
				nr_node_ids * sizeof(unsigned int), NULL);
	else
		proc_create_seq("vmallocinfo", 0400, NULL, &vmalloc_op);
	return 0;
}
module_init(proc_vmalloc_init);

#endif