bpf: arena: use kmalloc_nolock() in place of kvcalloc()
author    Puranjay Mohan <puranjay@kernel.org>
          Mon, 22 Dec 2025 19:50:17 +0000 (11:50 -0800)
committer Alexei Starovoitov <ast@kernel.org>
          Tue, 23 Dec 2025 19:29:59 +0000 (11:29 -0800)
To make arena_alloc_pages() safe to call from any context, replace
kvcalloc() with kmalloc_nolock(), which neither sleeps nor takes any
locks. kmalloc_nolock() returns NULL for allocations larger than
KMALLOC_MAX_CACHE_SIZE, which is (PAGE_SIZE * 2) = 8KB on systems with
4KB pages. So, cap the array allocated with kmalloc_nolock() at
1024 * 8 bytes and reuse it in a loop over the requested pages.

Signed-off-by: Puranjay Mohan <puranjay@kernel.org>
Link: https://lore.kernel.org/r/20251222195022.431211-3-puranjay@kernel.org
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
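
A minimal userspace sketch of the capped-batch pattern described above,
assuming an 8 KB scratch limit; MAX_SCRATCH_BYTES, slots[] and
process_batch() are illustrative stand-ins for KMALLOC_MAX_CACHE_SIZE,
the pages array and the per-batch alloc/map work, not kernel APIs:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define MAX_SCRATCH_BYTES 8192	/* stand-in for KMALLOC_MAX_CACHE_SIZE (8KB with 4KB pages) */

/* Stand-in for the per-batch work (allocate + map a batch of pages in the real patch). */
static long process_batch(void **slots, long n)
{
	for (long i = 0; i < n; i++)
		slots[i] = &slots[i];	/* pretend each slot now holds a page pointer */
	return n;
}

int main(void)
{
	long page_cnt = 5000;				/* total pages requested */
	long cap = MAX_SCRATCH_BYTES / sizeof(void *);	/* max entries per batch */
	long alloc = page_cnt < cap ? page_cnt : cap;
	void **slots = malloc(alloc * sizeof(void *));
	long remaining = page_cnt, done = 0;

	if (!slots)
		return 1;

	while (remaining) {
		long batch = remaining < alloc ? remaining : alloc;

		/* Zero the reused array so entries from the previous batch can't leak. */
		memset(slots, 0, batch * sizeof(void *));
		done += process_batch(slots, batch);
		remaining -= batch;
	}

	printf("processed %ld of %ld pages\n", done, page_cnt);
	free(slots);
	return 0;
}

With 5000 requested pages and a 1024-entry cap, the loop runs five
times over the same 8 KB array instead of allocating 5000 pointers up
front, which mirrors what the patch below does with kmalloc_nolock().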
diff --git a/kernel/bpf/arena.c b/kernel/bpf/arena.c
index 55b198b9f1a3e25b4079b6bdc89e5cf172c1bf72..128efb68d47bfc04bec394dfb5dab5be81613a14 100644
--- a/kernel/bpf/arena.c
+++ b/kernel/bpf/arena.c
@@ -44,6 +44,8 @@
 #define GUARD_SZ round_up(1ull << sizeof_field(struct bpf_insn, off) * 8, PAGE_SIZE << 1)
 #define KERN_VM_SZ (SZ_4G + GUARD_SZ)
 
+static void arena_free_pages(struct bpf_arena *arena, long uaddr, long page_cnt);
+
 struct bpf_arena {
        struct bpf_map map;
        u64 user_vm_start;
@@ -500,8 +502,10 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
        /* user_vm_end/start are fixed before bpf prog runs */
        long page_cnt_max = (arena->user_vm_end - arena->user_vm_start) >> PAGE_SHIFT;
        u64 kern_vm_start = bpf_arena_get_kern_vm_start(arena);
+       struct apply_range_data data;
        struct page **pages = NULL;
-       long mapped = 0;
+       long remaining, mapped = 0;
+       long alloc_pages;
        long pgoff = 0;
        u32 uaddr32;
        int ret, i;
@@ -518,17 +522,19 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
                        return 0;
        }
 
-       /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
-       pages = kvcalloc(page_cnt, sizeof(struct page *), GFP_KERNEL);
+       /* Cap allocation size to KMALLOC_MAX_CACHE_SIZE so kmalloc_nolock() can succeed. */
+       alloc_pages = min(page_cnt, KMALLOC_MAX_CACHE_SIZE / sizeof(struct page *));
+       pages = kmalloc_nolock(alloc_pages * sizeof(struct page *), 0, NUMA_NO_NODE);
        if (!pages)
                return 0;
+       data.pages = pages;
 
        mutex_lock(&arena->lock);
 
        if (uaddr) {
                ret = is_range_tree_set(&arena->rt, pgoff, page_cnt);
                if (ret)
-                       goto out_free_pages;
+                       goto out_unlock_free_pages;
                ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
        } else {
                ret = pgoff = range_tree_find(&arena->rt, page_cnt);
@@ -536,40 +542,60 @@ static long arena_alloc_pages(struct bpf_arena *arena, long uaddr, long page_cnt
                        ret = range_tree_clear(&arena->rt, pgoff, page_cnt);
        }
        if (ret)
-               goto out_free_pages;
-
-       struct apply_range_data data = { .pages = pages, .i = 0 };
-       ret = bpf_map_alloc_pages(&arena->map, node_id, page_cnt, pages);
-       if (ret)
-               goto out;
+               goto out_unlock_free_pages;
 
+       remaining = page_cnt;
        uaddr32 = (u32)(arena->user_vm_start + pgoff * PAGE_SIZE);
-       /* Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
-        * will not overflow 32-bit. Lower 32-bit need to represent
-        * contiguous user address range.
-        * Map these pages at kern_vm_start base.
-        * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
-        * lower 32-bit and it's ok.
-        */
-       apply_to_page_range(&init_mm, kern_vm_start + uaddr32,
-                           page_cnt << PAGE_SHIFT, apply_range_set_cb, &data);
-       mapped = data.i;
-       flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
-       if (mapped < page_cnt) {
-               for (i = mapped; i < page_cnt; i++)
-                       __free_page(pages[i]);
-               goto out;
+
+       while (remaining) {
+               long this_batch = min(remaining, alloc_pages);
+
+               /* zeroing is needed, since alloc_pages_bulk() only fills in non-zero entries */
+               memset(pages, 0, this_batch * sizeof(struct page *));
+
+               ret = bpf_map_alloc_pages(&arena->map, node_id, this_batch, pages);
+               if (ret)
+                       goto out;
+
+               /*
+                * Earlier checks made sure that uaddr32 + page_cnt * PAGE_SIZE - 1
+                * will not overflow 32-bit. Lower 32-bit need to represent
+                * contiguous user address range.
+                * Map these pages at kern_vm_start base.
+                * kern_vm_start + uaddr32 + page_cnt * PAGE_SIZE - 1 can overflow
+                * lower 32-bit and it's ok.
+                */
+               data.i = 0;
+               ret = apply_to_page_range(&init_mm,
+                                         kern_vm_start + uaddr32 + (mapped << PAGE_SHIFT),
+                                         this_batch << PAGE_SHIFT, apply_range_set_cb, &data);
+               if (ret) {
+                       /* data.i pages were mapped, account them and free the remaining */
+                       mapped += data.i;
+                       for (i = data.i; i < this_batch; i++)
+                               __free_page(pages[i]);
+                       goto out;
+               }
+
+               mapped += this_batch;
+               remaining -= this_batch;
        }
+       flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
        mutex_unlock(&arena->lock);
-       kvfree(pages);
+       kfree_nolock(pages);
        return clear_lo32(arena->user_vm_start) + uaddr32;
 out:
        range_tree_set(&arena->rt, pgoff + mapped, page_cnt - mapped);
-out_free_pages:
        mutex_unlock(&arena->lock);
-       if (mapped)
+       if (mapped) {
+               flush_vmap_cache(kern_vm_start + uaddr32, mapped << PAGE_SHIFT);
                arena_free_pages(arena, uaddr32, mapped);
-       kvfree(pages);
+       }
+       goto out_free_pages;
+out_unlock_free_pages:
+       mutex_unlock(&arena->lock);
+out_free_pages:
+       kfree_nolock(pages);
        return 0;
 }