bpf: Implement iteration ops for resizable hashtab

author Mykyta Yatsenko <yatsenko@meta.com>

Fri, 5 Jun 2026 11:41:22 +0000 (04:41 -0700)

committer Alexei Starovoitov <ast@kernel.org>

Fri, 5 Jun 2026 15:00:08 +0000 (08:00 -0700)
author Mykyta Yatsenko <yatsenko@meta.com>
Fri, 5 Jun 2026 11:41:22 +0000 (04:41 -0700)
committer Alexei Starovoitov <ast@kernel.org>
Fri, 5 Jun 2026 15:00:08 +0000 (08:00 -0700)
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c

index 10f3a058747b3f9bf5dc44312bb3fb4266a2ebfa..a149713d09538815e6b685d6645ef2a0a22fbcc6 100644 (file)
--- a/kernel/bpf/hashtab.c
+++ b/kernel/bpf/hashtab.c
@@ -3020,8 +3020,79 @@ static void rhtab_map_free_internal_structs(struct bpf_map *map)
  }
  
  static int rhtab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
+       __must_hold_shared(RCU)
  {
-       return -EOPNOTSUPP;
+       struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+       struct rhtab_elem *elem;
+
+       elem = rhashtable_next_key(&rhtab->ht, key);
+
+       /* if not found, return the first key */
+       if (PTR_ERR(elem) == -ENOENT)
+               elem = rhashtable_next_key(&rhtab->ht, NULL);
+
+       if (IS_ERR(elem))
+               return PTR_ERR(elem);
+       if (!elem)
+               return -ENOENT;
+
+       memcpy(next_key, elem->data, map->key_size);
+       return 0;
+}
+
+static void rhtab_map_seq_show_elem(struct bpf_map *map, void *key, struct seq_file *m)
+{
+       void *value;
+
+       /* Guarantee that hashtab value is not freed */
+       guard(rcu)();
+
+       value = rhtab_map_lookup_elem(map, key);
+       if (!value)
+               return;
+
+       btf_type_seq_show(map->btf, map->btf_key_type_id, key, m);
+       seq_puts(m, ": ");
+       btf_type_seq_show(map->btf, map->btf_value_type_id, value, m);
+       seq_putc(m, '\n');
+}
+
+static long bpf_each_rhash_elem(struct bpf_map *map, bpf_callback_t callback_fn,
+                               void *callback_ctx, u64 flags)
+{
+       struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+       void *prev_key = NULL;
+       struct rhtab_elem *elem;
+       int num_elems = 0;
+       u64 ret = 0;
+
+       cant_migrate();
+
+       if (flags != 0)
+               return -EINVAL;
+
+       rcu_read_lock();
+       /*
+        * Best-effort iteration: if rhashtable is concurrently resized or
+        * elements are deleted/inserted, there may be missed or duplicate
+        * elements visited.
+        */
+       while ((elem = rhashtable_next_key(&rhtab->ht, prev_key))) {
+               if (IS_ERR(elem))
+                       break;
+               num_elems++;
+               ret = callback_fn((u64)(long)map,
+                                 (u64)(long)elem->data,
+                                 (u64)(long)rhtab_elem_value(elem, map->key_size),
+                                 (u64)(long)callback_ctx, 0);
+               if (ret)
+                       break;
+
+               prev_key = elem->data;  /* valid while RCU held */
+       }
+       rcu_read_unlock();
+
+       return num_elems;
  }
  
  static u64 rhtab_map_mem_usage(const struct bpf_map *map)
@@ -3034,6 +3105,275 @@ static u64 rhtab_map_mem_usage(const struct bpf_map *map)
         return sizeof(struct bpf_rhtab) + rhtab->elem_size * num_entries;
  }
  
+static int __rhtab_map_lookup_and_delete_batch(struct bpf_map *map,
+                                              const union bpf_attr *attr,
+                                              union bpf_attr __user *uattr,
+                                              bool do_delete)
+{
+       struct bpf_rhtab *rhtab = container_of(map, struct bpf_rhtab, map);
+       void __user *uvalues = u64_to_user_ptr(attr->batch.values);
+       void __user *ukeys = u64_to_user_ptr(attr->batch.keys);
+       void __user *ubatch = u64_to_user_ptr(attr->batch.in_batch);
+       void *cursor = NULL, *keys = NULL, *values = NULL, *dst_key, *dst_val;
+       struct rhtab_elem **del_elems = NULL;
+       u32 max_count, total, key_size, value_size, i;
+       bool has_next_cursor = false;
+       struct rhtab_elem *elem;
+       u64 elem_map_flags, map_flags;
+       int ret = 0;
+
+       elem_map_flags = attr->batch.elem_flags;
+       ret = bpf_map_check_op_flags(map, elem_map_flags, BPF_F_LOCK);
+       if (ret)
+               return ret;
+
+       map_flags = attr->batch.flags;
+       if (map_flags)
+               return -EINVAL;
+
+       max_count = attr->batch.count;
+       if (!max_count)
+               return 0;
+
+       if (put_user(0, &uattr->batch.count))
+               return -EFAULT;
+
+       key_size = map->key_size;
+       value_size = map->value_size;
+
+       keys = kvmalloc_array(max_count, key_size, GFP_USER | __GFP_NOWARN);
+       values = kvmalloc_array(max_count, value_size, GFP_USER | __GFP_NOWARN);
+       if (do_delete)
+               del_elems = kvmalloc_array(max_count, sizeof(void *),
+                                          GFP_USER | __GFP_NOWARN);
+       cursor = kmalloc(key_size, GFP_USER | __GFP_NOWARN);
+
+       if (!keys || !values || !cursor || (do_delete && !del_elems)) {
+               ret = -ENOMEM;
+               goto free;
+       }
+
+       if (ubatch && copy_from_user(cursor, ubatch, key_size)) {
+               ret = -EFAULT;
+               goto free;
+       }
+
+       dst_key = keys;
+       dst_val = values;
+       total = 0;
+
+       rcu_read_lock();
+
+       /*
+        * Cursor stores the key of the next-to-process element (stashed by
+        * the previous batch). Look it up directly so the element is included
+        * here rather than skipped by next_key(). If the cursor was deleted
+        * concurrently (or by the previous do_delete batch), return -EAGAIN
+        * so userspace can distinguish a lost cursor from end-of-iteration
+        * (-ENOENT) and restart from a NULL cursor.
+        */
+       if (ubatch) {
+               elem = rhtab_lookup_elem(map, cursor);
+               if (!elem) {
+                       rcu_read_unlock();
+                       ret = -EAGAIN;
+                       goto free;
+               }
+       } else {
+               elem = rhashtable_next_key(&rhtab->ht, NULL);
+       }
+
+       while (elem && !IS_ERR(elem) && total < max_count) {
+               memcpy(dst_key, elem->data, key_size);
+               rhtab_read_elem_value(map, dst_val, elem, elem_map_flags);
+               check_and_init_map_value(map, dst_val);
+
+               if (do_delete)
+                       del_elems[total] = elem;
+
+               elem = rhashtable_next_key(&rhtab->ht, dst_key);
+               dst_key += key_size;
+               dst_val += value_size;
+               total++;
+
+               /* Bail to userspace to avoid stalls. */
+               if (need_resched())
+                       break;
+       }
+
+       if (elem && !IS_ERR(elem)) {
+               /* Stash next-to-process key as cursor for the next batch. */
+               memcpy(cursor, elem->data, key_size);
+               has_next_cursor = true;
+       }
+
+       if (do_delete) {
+               for (i = 0; i < total; i++)
+                       rhtab_delete_elem(rhtab, del_elems[i], NULL, 0);
+       }
+
+       rcu_read_unlock();
+
+       if (total == 0) {
+               ret = -ENOENT;
+               goto free;
+       }
+
+       /* No more elements after this batch. */
+       if (!has_next_cursor)
+               ret = -ENOENT;
+
+       if (copy_to_user(ukeys, keys, (size_t)total * key_size) ||
+           copy_to_user(uvalues, values, (size_t)total * value_size) ||
+           put_user(total, &uattr->batch.count) ||
+           (has_next_cursor &&
+            copy_to_user(u64_to_user_ptr(attr->batch.out_batch),
+                         cursor, key_size))) {
+               ret = -EFAULT;
+               goto free;
+       }
+
+free:
+       kfree(cursor);
+       kvfree(keys);
+       kvfree(values);
+       kvfree(del_elems);
+       return ret;
+}
+
+static int rhtab_map_lookup_batch(struct bpf_map *map, const union bpf_attr *attr,
+                                 union bpf_attr __user *uattr)
+{
+       return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, false);
+}
+
+static int rhtab_map_lookup_and_delete_batch(struct bpf_map *map, const union bpf_attr *attr,
+                                            union bpf_attr __user *uattr)
+{
+       return __rhtab_map_lookup_and_delete_batch(map, attr, uattr, true);
+}
+
+struct bpf_iter_seq_rhash_map_info {
+       struct bpf_map *map;
+       struct bpf_rhtab *rhtab;
+       struct rhashtable_iter iter;
+};
+
+static void *bpf_rhash_map_seq_start(struct seq_file *seq, loff_t *pos)
+       __acquires(RCU)
+{
+       struct bpf_iter_seq_rhash_map_info *info = seq->private;
+       struct rhtab_elem *elem;
+
+       rhashtable_walk_start(&info->iter);
+       /*
+        * Re-deliver the element returned by walk_next() at the end of the
+        * previous read() — bpf_seq_read may have stopped before show()
+        * consumed it. Rehash rewinds the walker; retry on -EAGAIN.
+        */
+       do {
+               elem = rhashtable_walk_peek(&info->iter);
+       } while (PTR_ERR(elem) == -EAGAIN);
+
+       if (IS_ERR(elem))
+               return NULL;
+
+       if (elem && *pos == 0)
+               ++*pos;
+       return elem;
+}
+
+static void *bpf_rhash_map_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+       struct bpf_iter_seq_rhash_map_info *info = seq->private;
+       struct rhtab_elem *elem;
+
+       ++*pos;
+
+       /* Rehash rewinds the walker; retry until it stops returning -EAGAIN. */
+       do {
+               elem = rhashtable_walk_next(&info->iter);
+       } while (PTR_ERR(elem) == -EAGAIN);
+
+       if (IS_ERR(elem))
+               return NULL;
+       return elem;
+}
+
+static int __bpf_rhash_map_seq_show(struct seq_file *seq,
+                                   struct rhtab_elem *elem)
+{
+       struct bpf_iter_seq_rhash_map_info *info = seq->private;
+       struct bpf_iter__bpf_map_elem ctx = {};
+       struct bpf_iter_meta meta;
+       struct bpf_prog *prog;
+       int ret = 0;
+
+       meta.seq = seq;
+       prog = bpf_iter_get_info(&meta, elem == NULL);
+       if (prog) {
+               ctx.meta = &meta;
+               ctx.map = info->map;
+               if (elem) {
+                       ctx.key = elem->data;
+                       ctx.value = rhtab_elem_value(elem, info->map->key_size);
+               }
+               ret = bpf_iter_run_prog(prog, &ctx);
+       }
+
+       return ret;
+}
+
+static int bpf_rhash_map_seq_show(struct seq_file *seq, void *v)
+{
+       return __bpf_rhash_map_seq_show(seq, v);
+}
+
+static void bpf_rhash_map_seq_stop(struct seq_file *seq, void *v)
+       __releases(RCU)
+{
+       struct bpf_iter_seq_rhash_map_info *info = seq->private;
+
+       if (!v)
+               (void)__bpf_rhash_map_seq_show(seq, NULL);
+
+       rhashtable_walk_stop(&info->iter);
+}
+
+static int bpf_iter_init_rhash_map(void *priv_data, struct bpf_iter_aux_info *aux)
+{
+       struct bpf_iter_seq_rhash_map_info *info = priv_data;
+       struct bpf_map *map = aux->map;
+
+       bpf_map_inc_with_uref(map);
+       info->map = map;
+       info->rhtab = container_of(map, struct bpf_rhtab, map);
+       rhashtable_walk_enter(&info->rhtab->ht, &info->iter);
+       return 0;
+}
+
+static void bpf_iter_fini_rhash_map(void *priv_data)
+{
+       struct bpf_iter_seq_rhash_map_info *info = priv_data;
+
+       rhashtable_walk_exit(&info->iter);
+       bpf_map_put_with_uref(info->map);
+}
+
+static const struct seq_operations bpf_rhash_map_seq_ops = {
+       .start = bpf_rhash_map_seq_start,
+       .next = bpf_rhash_map_seq_next,
+       .stop = bpf_rhash_map_seq_stop,
+       .show = bpf_rhash_map_seq_show,
+};
+
+static const struct bpf_iter_seq_info rhash_iter_seq_info = {
+       .seq_ops = &bpf_rhash_map_seq_ops,
+       .init_seq_private = bpf_iter_init_rhash_map,
+       .fini_seq_private = bpf_iter_fini_rhash_map,
+       .seq_priv_size = sizeof(struct bpf_iter_seq_rhash_map_info),
+};
+
  BTF_ID_LIST_SINGLE(rhtab_map_btf_ids, struct, bpf_rhtab)
  const struct bpf_map_ops rhtab_map_ops = {
         .map_meta_equal = bpf_map_meta_equal,
@@ -3047,6 +3387,11 @@ const struct bpf_map_ops rhtab_map_ops = {
         .map_update_elem = rhtab_map_update_elem,
         .map_delete_elem = rhtab_map_delete_elem,
         .map_gen_lookup = rhtab_map_gen_lookup,
+       .map_seq_show_elem = rhtab_map_seq_show_elem,
+       .map_set_for_each_callback_args = map_set_for_each_callback_args,
+       .map_for_each_callback = bpf_each_rhash_elem,
         .map_mem_usage = rhtab_map_mem_usage,
+       BATCH_OPS(rhtab),
         .map_btf_id = &rhtab_map_btf_ids[0],
+       .iter_seq_info = &rhash_iter_seq_info,
  };
diff --git a/kernel/bpf/map_iter.c b/kernel/bpf/map_iter.c

index ae0741a09c6dee0f7512440bceea8ed19b956b7d..c19b360bad9eaab1fb6f9770f8ede3fd1635d133 100644 (file)
--- a/kernel/bpf/map_iter.c
+++ b/kernel/bpf/map_iter.c
@@ -123,7 +123,8 @@ static int bpf_iter_attach_map(struct bpf_prog *prog,
                 is_percpu = true;
         else if (map->map_type != BPF_MAP_TYPE_HASH &&
                  map->map_type != BPF_MAP_TYPE_LRU_HASH &&
-                map->map_type != BPF_MAP_TYPE_ARRAY)
+                map->map_type != BPF_MAP_TYPE_ARRAY &&
+                map->map_type != BPF_MAP_TYPE_RHASH)
                 goto put_map;
  
         key_acc_size = prog->aux->max_rdonly_access;
author	Mykyta Yatsenko <yatsenko@meta.com>
	Fri, 5 Jun 2026 11:41:22 +0000 (04:41 -0700)
committer	Alexei Starovoitov <ast@kernel.org>
	Fri, 5 Jun 2026 15:00:08 +0000 (08:00 -0700)
kernel/bpf/hashtab.c		patch \| blob \| blame \| history
kernel/bpf/map_iter.c		patch \| blob \| blame \| history