git.ipfire.org Git - thirdparty/linux.git/commitdiff
bpf: Allow special fields in resizable hashtab
author Mykyta Yatsenko <yatsenko@meta.com>
Fri, 5 Jun 2026 11:41:23 +0000 (04:41 -0700)
committer Alexei Starovoitov <ast@kernel.org>
Fri, 5 Jun 2026 15:00:08 +0000 (08:00 -0700)
Add support for timers, workqueues, task work, spin locks and kptrs.
Without this, users needing deferred callbacks, BPF_F_LOCK, or
refcounted kernel pointers in a dynamically-sized map have no option -
fixed-size htab is the only map supporting these field types.
Resizable hashtab should offer the same capability.

kptr semantics under in-place updates are identical to array map.

Properly clean up BTF record fields on element delete and map
teardown by wiring up bpf_obj_free_fields through a memory allocator
destructor, matching the pattern used by htab for non-prealloc maps.

Signed-off-by: Mykyta Yatsenko <yatsenko@meta.com>
Link: https://lore.kernel.org/r/20260605-rhash-v7-6-5b8e05f8630d@meta.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
kernel/bpf/hashtab.c
kernel/bpf/syscall.c

index a149713d09538815e6b685d6645ef2a0a22fbcc6..7b9408b8320c9bc5702bdb190f1e6a70fb7a130e 100644 (file)
@@ -497,28 +497,26 @@ static void htab_dtor_ctx_free(void *ctx)
        kfree(ctx);
 }
 
-static int htab_set_dtor(struct bpf_htab *htab, void (*dtor)(void *, void *))
+static int bpf_ma_set_dtor(struct bpf_map *map, struct bpf_mem_alloc *ma,
+                          void (*dtor)(void *, void *))
 {
-       u32 key_size = htab->map.key_size;
-       struct bpf_mem_alloc *ma;
        struct htab_btf_record *hrec;
        int err;
 
        /* No need for dtors. */
-       if (IS_ERR_OR_NULL(htab->map.record))
+       if (IS_ERR_OR_NULL(map->record))
                return 0;
 
        hrec = kzalloc(sizeof(*hrec), GFP_KERNEL);
        if (!hrec)
                return -ENOMEM;
-       hrec->key_size = key_size;
-       hrec->record = btf_record_dup(htab->map.record);
+       hrec->key_size = map->key_size;
+       hrec->record = btf_record_dup(map->record);
        if (IS_ERR(hrec->record)) {
                err = PTR_ERR(hrec->record);
                kfree(hrec);
                return err;
        }
-       ma = htab_is_percpu(htab) ? &htab->pcpu_ma : &htab->ma;
        bpf_mem_alloc_set_dtor(ma, dtor, htab_dtor_ctx_free, hrec);
        return 0;
 }
@@ -535,9 +533,9 @@ static int htab_map_check_btf(struct bpf_map *map, const struct btf *btf,
         * populated in htab_map_alloc(), so it will always appear as NULL.
         */
        if (htab_is_percpu(htab))
-               return htab_set_dtor(htab, htab_pcpu_mem_dtor);
+               return bpf_ma_set_dtor(map, &htab->pcpu_ma, htab_pcpu_mem_dtor);
        else
-               return htab_set_dtor(htab, htab_mem_dtor);
+               return bpf_ma_set_dtor(map, &htab->ma, htab_mem_dtor);
 }
 
 static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
@@ -2752,6 +2750,7 @@ struct bpf_rhtab {
        struct rhashtable ht;
        struct bpf_mem_alloc ma;
        u32 elem_size;
+       bool freeing_internal;
 };
 
 static const struct rhashtable_params rhtab_params = {
@@ -2832,11 +2831,34 @@ static int rhtab_map_alloc_check(union bpf_attr *attr)
        return htab_map_alloc_check(attr);
 }
 
+static void rhtab_check_and_free_fields(struct bpf_rhtab *rhtab,
+                                       struct rhtab_elem *elem)
+{
+       if (IS_ERR_OR_NULL(rhtab->map.record))
+               return;
+
+       bpf_obj_free_fields(rhtab->map.record,
+                           rhtab_elem_value(elem, rhtab->map.key_size));
+}
+
+static void rhtab_mem_dtor(void *obj, void *ctx)
+{
+       struct htab_btf_record *hrec = ctx;
+       struct rhtab_elem *elem = obj;
+
+       if (IS_ERR_OR_NULL(hrec->record))
+               return;
+
+       bpf_obj_free_fields(hrec->record,
+                           rhtab_elem_value(elem, hrec->key_size));
+}
+
 static void rhtab_free_elem(void *ptr, void *arg)
 {
        struct bpf_rhtab *rhtab = arg;
        struct rhtab_elem *elem = ptr;
 
+       bpf_map_free_internal_structs(&rhtab->map, rhtab_elem_value(elem, rhtab->map.key_size));
        bpf_mem_cache_free_rcu(&rhtab->ma, elem);
 }
 
@@ -2900,7 +2922,8 @@ static int rhtab_delete_elem(struct bpf_rhtab *rhtab, struct rhtab_elem *elem, v
                rhtab_read_elem_value(&rhtab->map, copy, elem, flags);
                check_and_init_map_value(&rhtab->map, copy);
        }
-
+       /* Release internal structs: kptr, bpf_timer, task_work, wq */
+       rhtab_check_and_free_fields(rhtab, elem);
        bpf_mem_cache_free_rcu(&rhtab->ma, elem);
        return 0;
 }
@@ -2942,6 +2965,7 @@ static int rhtab_map_lookup_and_delete_elem(struct bpf_map *map, void *key, void
 static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *elem, void *value,
                                      u64 map_flags)
 {
+       struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
        void *old_val = rhtab_elem_value(elem, map->key_size);
 
        if (map_flags & BPF_NOEXIST)
@@ -2951,6 +2975,17 @@ static long rhtab_map_update_existing(struct bpf_map *map, struct rhtab_elem *el
                copy_map_value_locked(map, old_val, value, false);
        else
                copy_map_value(map, old_val, value);
+
+       /*
+        * Torn reads: a concurrent reader without BPF_F_LOCK may observe
+        * the value mid-copy. Callers requiring consistent reads must use
+        * BPF_F_LOCK, matching arraymap semantics.
+        *
+        * copy_map_value() skips special-field offsets, so old timers/
+        * kptrs/etc. still sit in the slot. Cancel them after the copy
+        * to match arraymap's update semantics.
+        */
+       rhtab_check_and_free_fields(rhtab, elem);
        return 0;
 }
 
@@ -2973,6 +3008,14 @@ static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u
        if (map_flags & BPF_EXIST)
                return -ENOENT;
 
+       /*
+        * Reject new insertions while map_release_uref cleanup walks the
+        * table. Without this, new elements could keep triggering rehash
+        * and prevent the walk from terminating.
+        */
+       if (READ_ONCE(rhtab->freeing_internal))
+               return -EBUSY;
+
        /* Check max_entries limit before inserting new element */
        if (atomic_read(&rhtab->ht.nelems) >= map->max_entries)
                return -E2BIG;
@@ -2983,6 +3026,7 @@ static long rhtab_map_update_elem(struct bpf_map *map, void *key, void *value, u
 
        memcpy(elem->data, key, map->key_size);
        copy_map_value(map, rhtab_elem_value(elem, map->key_size), value);
+       check_and_init_map_value(map, rhtab_elem_value(elem, map->key_size));
 
        /* Prevent deadlock for NMI programs attempting to take bucket lock */
        bpf_disable_instrumentation();
@@ -3015,8 +3059,54 @@ static int rhtab_map_gen_lookup(struct bpf_map *map, struct bpf_insn *insn_buf)
        return insn - insn_buf;
 }
 
+static int rhtab_map_check_btf(struct bpf_map *map, const struct btf *btf,
+                              const struct btf_type *key_type,
+                              const struct btf_type *value_type)
+{
+       struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+
+       return bpf_ma_set_dtor(map, &rhtab->ma, rhtab_mem_dtor);
+}
+
 static void rhtab_map_free_internal_structs(struct bpf_map *map)
 {
+       struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+       struct rhashtable_iter iter;
+       struct rhtab_elem *elem;
+
+       if (!bpf_map_has_internal_structs(map))
+               return;
+
+       /*
+        * Block new insertions. Once observed, no new growth is triggered,
+        * so any in-flight rehash will drain and the walker is guaranteed
+        * to stop returning -EAGAIN. Treat -EAGAIN as "rehash in progress,
+        * retry"; do not wait for the worker.
+        */
+       WRITE_ONCE(rhtab->freeing_internal, true);
+
+       rhashtable_walk_enter(&rhtab->ht, &iter);
+       rhashtable_walk_start(&iter);
+
+       while ((elem = rhashtable_walk_next(&iter))) {
+               if (IS_ERR(elem)) {
+                       if (PTR_ERR(elem) == -EAGAIN)
+                               continue;
+                       break;
+               }
+
+               bpf_map_free_internal_structs(map, rhtab_elem_value(elem, map->key_size));
+
+               if (need_resched()) { /* Avoid stalls on large maps */
+                       rhashtable_walk_stop(&iter);
+                       cond_resched();
+                       rhashtable_walk_start(&iter);
+               }
+       }
+
+       rhashtable_walk_stop(&iter);
+       rhashtable_walk_exit(&iter);
+       WRITE_ONCE(rhtab->freeing_internal, false);
 }
 
 static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
@@ -3382,6 +3472,7 @@ const struct bpf_map_ops rhtab_map_ops = {
        .map_free = rhtab_map_free,
        .map_get_next_key = rhtab_map_get_next_key,
        .map_release_uref = rhtab_map_free_internal_structs,
+       .map_check_btf = rhtab_map_check_btf,
        .map_lookup_elem = rhtab_map_lookup_elem,
        .map_lookup_and_delete_elem = rhtab_map_lookup_and_delete_elem,
        .map_update_elem = rhtab_map_update_elem,
index 1faae184de482bea55407f61a3820bd7891ecb26..31a3b70a0b5da1d29a767ccbc29e407c23994ea5 100644 (file)
@@ -1280,6 +1280,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
                        case BPF_SPIN_LOCK:
                        case BPF_RES_SPIN_LOCK:
                                if (map->map_type != BPF_MAP_TYPE_HASH &&
+                                   map->map_type != BPF_MAP_TYPE_RHASH &&
                                    map->map_type != BPF_MAP_TYPE_ARRAY &&
                                    map->map_type != BPF_MAP_TYPE_CGROUP_STORAGE &&
                                    map->map_type != BPF_MAP_TYPE_SK_STORAGE &&
@@ -1294,6 +1295,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
                        case BPF_WORKQUEUE:
                        case BPF_TASK_WORK:
                                if (map->map_type != BPF_MAP_TYPE_HASH &&
+                                   map->map_type != BPF_MAP_TYPE_RHASH &&
                                    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
                                    map->map_type != BPF_MAP_TYPE_ARRAY) {
                                        ret = -EOPNOTSUPP;
@@ -1305,6 +1307,7 @@ static int map_check_btf(struct bpf_map *map, struct bpf_token *token,
                        case BPF_KPTR_PERCPU:
                        case BPF_REFCOUNT:
                                if (map->map_type != BPF_MAP_TYPE_HASH &&
+                                   map->map_type != BPF_MAP_TYPE_RHASH &&
                                    map->map_type != BPF_MAP_TYPE_PERCPU_HASH &&
                                    map->map_type != BPF_MAP_TYPE_LRU_HASH &&
                                    map->map_type != BPF_MAP_TYPE_LRU_PERCPU_HASH &&