git.ipfire.org Git - thirdparty/linux.git/commitdiff
bpf: Allow LPM map access from sleepable BPF programs
author: Vlad Poenaru <vlad.wing@gmail.com>
Tue, 9 Jun 2026 13:55:57 +0000 (06:55 -0700)
committer: Alexei Starovoitov <ast@kernel.org>
Tue, 9 Jun 2026 19:42:22 +0000 (12:42 -0700)
trie_lookup_elem() annotates its rcu_dereference_check() walks with
only rcu_read_lock_bh_held().  Because rcu_dereference_check(p, c)
resolves to "c || rcu_read_lock_held()", this passes for XDP/NAPI and
classic RCU readers but fails for sleepable BPF programs, which enter
via __bpf_prog_enter_sleepable() and hold only rcu_read_lock_trace().

trie_update_elem() and trie_delete_elem() have the same problem in a
different form: they walk the trie with plain rcu_dereference(), which
asserts rcu_read_lock_held() unconditionally.  Both are reachable from
sleepable BPF programs via the bpf_map_update_elem / bpf_map_delete_elem
helpers, and from the syscall path under classic rcu_read_lock().  In
the writer paths the trie is actually protected by trie->lock (an
rqspinlock taken across the walk); we never relied on the RCU read-side
lock to keep nodes alive there.

A sleepable LSM hook that ends up touching an LPM trie therefore
triggers lockdep on debug kernels:

  =============================
  WARNING: suspicious RCU usage
  7.1.0-... Tainted: G            E
  -----------------------------
  kernel/bpf/lpm_trie.c:249 suspicious rcu_dereference_check() usage!
  1 lock held by net_tests/540:
   #0: (rcu_tasks_trace_srcu_struct){....}-{0:0},
       at: __bpf_prog_enter_sleepable+0x26/0x280
  Call Trace:
   dump_stack_lvl
   lockdep_rcu_suspicious
   trie_lookup_elem
   bpf_prog_..._enforce_security_socket_connect
   bpf_trampoline_...
   security_socket_connect
   __sys_connect
   do_syscall_64

This is lockdep-only -- no UAF, since Tasks Trace RCU does serialize
against the trie's reclaim path -- but it spams the console once per
distinct callsite on every debug kernel running a sleepable BPF LSM
that touches an LPM trie, which is increasingly common.

For the lookup path, switch the rcu_dereference_check() annotation
from rcu_read_lock_bh_held() to bpf_rcu_lock_held(), which accepts all
three contexts (classic, BH, Tasks Trace).  Other map types already
follow this convention.

For trie_update_elem() and trie_delete_elem(), annotate the walks as
rcu_dereference_protected(*p, 1) -- matching trie_free() in the same
file -- since trie->lock is held across the walk.  rqspinlock has no
lockdep_map, so the predicate degenerates to '1' rather than
lockdep_is_held(&trie->lock); the protection is real but not
machine-verifiable.  trie_get_next_key() also uses bare
rcu_dereference() but is reachable only from the BPF syscall, which
holds classic rcu_read_lock() before dispatching, so it is left
untouched.

Fixes: 694cea395fde ("bpf: Allow RCU-protected lookups to happen from bh context")
Cc: stable@vger.kernel.org
Signed-off-by: Vlad Poenaru <vlad.wing@gmail.com>
Reviewed-by: Emil Tsalapatis <emil@etsalapatis.com>
Link: https://lore.kernel.org/r/20260609135558.193287-2-vlad.wing@gmail.com
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
kernel/bpf/lpm_trie.c

index 0f57608b385d43212b72c8e3bb866009245a109c..4d6f25db9ba1274419170f4339a5f0c8a40dc2bd 100644 (file)
@@ -246,7 +246,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
 
        /* Start walking the trie from the root node ... */
 
-       for (node = rcu_dereference_check(trie->root, rcu_read_lock_bh_held());
+       for (node = rcu_dereference_check(trie->root, bpf_rcu_lock_held());
             node;) {
                unsigned int next_bit;
                size_t matchlen;
@@ -280,7 +280,7 @@ static void *trie_lookup_elem(struct bpf_map *map, void *_key)
                 */
                next_bit = extract_bit(key->data, node->prefixlen);
                node = rcu_dereference_check(node->child[next_bit],
-                                            rcu_read_lock_bh_held());
+                                            bpf_rcu_lock_held());
        }
 
        if (!found)
@@ -359,7 +359,7 @@ static long trie_update_elem(struct bpf_map *map,
         */
        slot = &trie->root;
 
-       while ((node = rcu_dereference(*slot))) {
+       while ((node = rcu_dereference_protected(*slot, 1))) {
                matchlen = longest_prefix_match(trie, node, key);
 
                if (node->prefixlen != matchlen ||
@@ -482,7 +482,7 @@ static long trie_delete_elem(struct bpf_map *map, void *_key)
        trim = &trie->root;
        trim2 = trim;
        parent = NULL;
-       while ((node = rcu_dereference(*trim))) {
+       while ((node = rcu_dereference_protected(*trim, 1))) {
                matchlen = longest_prefix_match(trie, node, key);
 
                if (node->prefixlen != matchlen ||