u64 user_vm_start;
u64 user_vm_end;
struct vm_struct *kern_vm;
+ struct page *scratch_page;
struct range_tree rt;
/* protects rt */
rqspinlock_t spinlock;
int i;
};
+struct clear_range_data {
+ struct llist_head *free_pages;
+ struct page *scratch_page;
+};
+
static int apply_range_set_cb(pte_t *pte, unsigned long addr, void *data)
{
struct apply_range_data *d = data;
flush_cache_vmap(start, start + size);
}
-static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *free_pages)
+static int apply_range_clear_cb(pte_t *pte, unsigned long addr, void *data)
{
+ struct clear_range_data *d = data;
pte_t old_pte;
struct page *page;
- /* sanity check */
- old_pte = ptep_get(pte);
+ /*
+ * Pairs with ptep_try_set() in the kernel-fault scratch installer.
+ * Both sides must be atomic.
+ */
+ old_pte = ptep_get_and_clear(&init_mm, addr, pte);
if (pte_none(old_pte) || !pte_present(old_pte))
- return 0; /* nothing to do */
+ return 0;
page = pte_page(old_pte);
if (WARN_ON_ONCE(!page))
return -EINVAL;
- pte_clear(&init_mm, addr, pte);
+ /*
+ * Skip the per-arena scratch page. A kernel fault on an unallocated uaddr
+ * scratches its PTE. A later bpf_arena_free_pages() over that range walks
+ * here. Without the skip, scratch_page would be freed.
+ */
+ if (page == d->scratch_page)
+ return 0;
+
+ __llist_add(&page->pcp_llist, d->free_pages);
+ return 0;
+}
- /* Add page to the list so it is freed later */
- if (free_pages)
- __llist_add(&page->pcp_llist, free_pages);
+static int apply_range_set_scratch_cb(pte_t *pte, unsigned long addr, void *data)
+{
+ struct page *scratch_page = data;
+ if (!pte_none(ptep_get(pte)))
+ return 0;
+ /*
+ * Best-effort install. ptep_try_set() returns false only if another
+ * installer (real allocation or concurrent fault) won the cmpxchg.
+ * Their PTE is already valid, so the access retry succeeds.
+ *
+ * No flush_tlb_kernel_range() needed. Stale "not mapped" entries just
+ * cause one extra re-fault through this same path.
+ */
+ ptep_try_set(pte, mk_pte(scratch_page, PAGE_KERNEL));
return 0;
}
static int populate_pgtable_except_pte(struct bpf_arena *arena)
{
+ /* Populate intermediates for the recovery range (4 GiB + upper half-guard). */
return apply_to_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
- KERN_VM_SZ - GUARD_SZ, apply_range_set_cb, NULL);
+ SZ_4G + GUARD_SZ / 2, apply_range_set_cb, NULL);
}
static struct bpf_map *arena_map_alloc(union bpf_attr *attr)
init_irq_work(&arena->free_irq, arena_free_irq);
INIT_WORK(&arena->free_work, arena_free_worker);
bpf_map_init_from_attr(&arena->map, attr);
+
+ err = bpf_map_alloc_pages(&arena->map, NUMA_NO_NODE, 1, &arena->scratch_page);
+ if (err)
+ goto err_free_arena;
+
range_tree_init(&arena->rt);
err = range_tree_set(&arena->rt, 0, attr->max_entries);
- if (err) {
- bpf_map_area_free(arena);
- goto err;
- }
+ if (err)
+ goto err_free_scratch;
mutex_init(&arena->lock);
raw_res_spin_lock_init(&arena->spinlock);
err = populate_pgtable_except_pte(arena);
- if (err) {
- range_tree_destroy(&arena->rt);
- bpf_map_area_free(arena);
- goto err;
- }
+ if (err)
+ goto err_destroy_rt;
return &arena->map;
+
+err_destroy_rt:
+ range_tree_destroy(&arena->rt);
+err_free_scratch:
+ __free_page(arena->scratch_page);
+err_free_arena:
+ bpf_map_area_free(arena);
err:
free_vm_area(kern_vm);
return ERR_PTR(err);
static int existing_page_cb(pte_t *ptep, unsigned long addr, void *data)
{
+ struct bpf_arena *arena = data;
struct page *page;
pte_t pte;
if (!pte_present(pte)) /* sanity check */
return 0;
page = pte_page(pte);
+ /*
+ * Skip the scratch page. The walk is page-table-driven, not range-tree-driven,
+ * so it can visit scratch PTEs at uaddrs the BPF program never allocated.
+ */
+ if (page == arena->scratch_page)
+ return 0;
/*
* We do not update pte here:
* 1. Nobody should be accessing bpf_arena's range outside of a kernel bug
* free those pages.
*/
apply_to_existing_page_range(&init_mm, bpf_arena_get_kern_vm_start(arena),
- KERN_VM_SZ - GUARD_SZ, existing_page_cb, NULL);
+ SZ_4G + GUARD_SZ / 2, existing_page_cb, arena);
free_vm_area(arena->kern_vm);
range_tree_destroy(&arena->rt);
+ __free_page(arena->scratch_page);
bpf_map_area_free(arena);
}
return VM_FAULT_RETRY;
page = vmalloc_to_page((void *)kaddr);
- if (page)
+ if (page) {
+ if (page == arena->scratch_page)
+ /* BPF triggered scratch here; don't lazy-alloc over it */
+ goto out_sigsegv;
/* already have a page vmap-ed */
goto out;
+ }
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
if (arena->map.map_flags & BPF_F_SEGV_ON_FAULT)
/* User space requested to segfault when page is not allocated by bpf prog */
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
ret = range_tree_clear(&arena->rt, vmf->pgoff, 1);
if (ret)
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
struct apply_range_data data = { .pages = &page, .i = 0 };
/* Account into memcg of the process that created bpf_arena */
ret = bpf_map_alloc_pages(map, NUMA_NO_NODE, 1, &page);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
}
ret = apply_to_page_range(&init_mm, kaddr, PAGE_SIZE, apply_range_set_cb, &data);
if (ret) {
range_tree_set(&arena->rt, vmf->pgoff, 1);
free_pages_nolock(page, 0);
- goto out_unlock_sigsegv;
+ goto out_sigsegv_memcg;
}
flush_vmap_cache(kaddr, PAGE_SIZE);
bpf_map_memcg_exit(old_memcg, new_memcg);
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
vmf->page = page;
return 0;
-out_unlock_sigsegv:
+out_sigsegv_memcg:
bpf_map_memcg_exit(old_memcg, new_memcg);
+out_sigsegv:
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
return VM_FAULT_SIGSEGV;
}
struct llist_head free_pages;
struct llist_node *pos, *t;
struct arena_free_span *s;
+ struct clear_range_data cdata;
unsigned long flags;
int ret = 0;
range_tree_set(&arena->rt, pgoff, page_cnt);
init_llist_head(&free_pages);
+ cdata.free_pages = &free_pages;
+ cdata.scratch_page = arena->scratch_page;
/* clear ptes and collect struct pages */
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
- apply_range_clear_cb, &free_pages);
+ apply_range_clear_cb, &cdata);
/* drop the lock to do the tlb flush and zap pages */
raw_res_spin_unlock_irqrestore(&arena->spinlock, flags);
struct arena_free_span *s;
u64 arena_vm_start, user_vm_start;
struct llist_head free_pages;
+ struct clear_range_data cdata;
struct page *page;
unsigned long full_uaddr;
long kaddr, page_cnt, pgoff;
bpf_map_memcg_enter(&arena->map, &old_memcg, &new_memcg);
init_llist_head(&free_pages);
+ cdata.free_pages = &free_pages;
+ cdata.scratch_page = arena->scratch_page;
arena_vm_start = bpf_arena_get_kern_vm_start(arena);
user_vm_start = bpf_arena_get_user_vm_start(arena);
/* clear ptes and collect pages in free_pages llist */
apply_to_existing_page_range(&init_mm, kaddr, page_cnt << PAGE_SHIFT,
- apply_range_clear_cb, &free_pages);
+ apply_range_clear_cb, &cdata);
range_tree_set(&arena->rt, pgoff, page_cnt);
}
}
late_initcall(kfunc_init);
-void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+static void __bpf_prog_report_arena_violation(struct bpf_prog *prog, bool write,
+ unsigned long addr, unsigned long fault_ip)
{
struct bpf_stream_stage ss;
- struct bpf_prog *prog;
u64 user_vm_start;
- /*
- * The RCU read lock is held to safely traverse the latch tree, but we
- * don't need its protection when accessing the prog, since it will not
- * disappear while we are handling the fault.
- */
- rcu_read_lock();
- prog = bpf_prog_ksym_find(fault_ip);
- rcu_read_unlock();
- if (!prog)
- return;
-
/* Use main prog for stream access */
prog = prog->aux->main_prog_aux->prog;
bpf_stream_dump_stack(ss);
}));
}
+
+bool bpf_arena_handle_page_fault(unsigned long addr, bool is_write, unsigned long fault_ip)
+{
+ struct bpf_arena *arena;
+ struct bpf_prog *prog;
+ unsigned long kbase;
+ unsigned long page_addr = addr & PAGE_MASK;
+
+ prog = bpf_prog_find_from_stack();
+ if (!prog)
+ return false;
+
+ arena = prog->aux->arena;
+ /* a prog not using arena may be on stack, so arena can be NULL */
+ if (!arena)
+ return false;
+
+ kbase = bpf_arena_get_kern_vm_start(arena);
+
+ /*
+ * Recovery covers the 4 GiB mappable band plus the upper half-guard.
+ * Lower guard is unreachable from kfuncs; an address there indicates
+ * a different bug class - leave it to the regular kernel oops path.
+ */
+ if (page_addr < kbase || page_addr >= kbase + SZ_4G + GUARD_SZ / 2)
+ return false;
+
+ apply_to_page_range(&init_mm, page_addr, PAGE_SIZE,
+ apply_range_set_scratch_cb, arena->scratch_page);
+ flush_vmap_cache(page_addr, PAGE_SIZE);
+ __bpf_prog_report_arena_violation(prog, is_write, page_addr - kbase, fault_ip);
+ return true;
+}
+
+void bpf_prog_report_arena_violation(bool write, unsigned long addr, unsigned long fault_ip)
+{
+ struct bpf_prog *prog;
+
+ /*
+ * The RCU read lock is held to safely traverse the latch tree, but we
+ * don't need its protection when accessing the prog, since it will not
+ * disappear while we are handling the fault.
+ */
+ rcu_read_lock();
+ prog = bpf_prog_ksym_find(fault_ip);
+ rcu_read_unlock();
+ if (!prog)
+ return;
+ __bpf_prog_report_arena_violation(prog, write, addr, fault_ip);
+}