]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
bpf: Reject TCP_NODELAY in bpf-tcp-cc
author KaFai Wan <kafai.wan@linux.dev>
Tue, 21 Apr 2026 15:58:02 +0000 (23:58 +0800)
committer Martin KaFai Lau <martin.lau@kernel.org>
Wed, 22 Apr 2026 19:58:57 +0000 (12:58 -0700)
A BPF TCP congestion control program can call bpf_setsockopt() from
its callbacks. In current kernels, if it calls
bpf_setsockopt(TCP_NODELAY) from cwnd_event_tx_start(), the call can
re-enter the TCP transmit path before the outer tcp_transmit_skb()
has completed and advanced the send head.

This can re-trigger CA_EVENT_TX_START and lead to unbounded recursion:

  tcp_transmit_skb()
    -> tcp_event_data_sent()
      -> tcp_ca_event(sk, CA_EVENT_TX_START)
        -> cwnd_event_tx_start()
          -> bpf_setsockopt(TCP_NODELAY)
            -> tcp_push_pending_frames()
              -> tcp_write_xmit()
                -> tcp_transmit_skb()

Because nothing bounds this recursion, it can overflow the kernel stack.

Reject TCP_NODELAY with -EOPNOTSUPP for bpf-tcp-cc by introducing
a dedicated setsockopt proto for BPF_PROG_TYPE_STRUCT_OPS TCP
congestion control programs. To keep it simple, TCP_NODELAY is
rejected for all tcp-cc ops.

Fixes: 7e41df5dbba2 ("bpf: Add a few optnames to bpf_setsockopt")
Suggested-by: Martin KaFai Lau <martin.lau@linux.dev>
Signed-off-by: KaFai Wan <kafai.wan@linux.dev>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Jiayuan Chen <jiayuan.chen@linux.dev>
Link: https://patch.msgid.link/20260421155804.135786-3-kafai.wan@linux.dev
include/linux/bpf.h
net/core/filter.c
net/ipv4/bpf_tcp_ca.c

index b4b703c90ca94f2528f04d87a9d429b7c6b70d6e..01e20396489287269f4ed940a2593677e9c40d58 100644 (file)
@@ -3725,6 +3725,7 @@ extern const struct bpf_func_proto bpf_for_each_map_elem_proto;
 extern const struct bpf_func_proto bpf_btf_find_by_name_kind_proto;
 extern const struct bpf_func_proto bpf_sk_setsockopt_proto;
 extern const struct bpf_func_proto bpf_sk_getsockopt_proto;
+extern const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto;
 extern const struct bpf_func_proto bpf_unlocked_sk_setsockopt_proto;
 extern const struct bpf_func_proto bpf_unlocked_sk_getsockopt_proto;
 extern const struct bpf_func_proto bpf_find_vma_proto;
index 96849f4c1fbccdad06a2573a2ab52e38118042b3..2914f5330310d0026eec2e832aa03af7633b4de6 100644 (file)
@@ -5688,6 +5688,30 @@ const struct bpf_func_proto bpf_sk_getsockopt_proto = {
        .arg5_type      = ARG_CONST_SIZE,
 };
 
+BPF_CALL_5(bpf_sk_setsockopt_nodelay, struct sock *, sk, int, level,
+          int, optname, char *, optval, int, optlen)
+{
+       /*
+        * TCP_NODELAY triggers tcp_push_pending_frames() and re-enters
+        * CA_EVENT_TX_START in bpf_tcp_cc.
+        */
+       if (level == SOL_TCP && optname == TCP_NODELAY)
+               return -EOPNOTSUPP;
+
+       return _bpf_setsockopt(sk, level, optname, optval, optlen);
+}
+
+const struct bpf_func_proto bpf_sk_setsockopt_nodelay_proto = {
+       .func           = bpf_sk_setsockopt_nodelay,
+       .gpl_only       = false,
+       .ret_type       = RET_INTEGER,
+       .arg1_type      = ARG_PTR_TO_BTF_ID_SOCK_COMMON,
+       .arg2_type      = ARG_ANYTHING,
+       .arg3_type      = ARG_ANYTHING,
+       .arg4_type      = ARG_PTR_TO_MEM | MEM_RDONLY,
+       .arg5_type      = ARG_CONST_SIZE,
+};
+
 BPF_CALL_5(bpf_unlocked_sk_setsockopt, struct sock *, sk, int, level,
           int, optname, char *, optval, int, optlen)
 {
index 008edc7f6688523dc86963d90485655e9fa8374e..791e15063237c909c555143aac4982cf1ada03c9 100644 (file)
@@ -168,7 +168,7 @@ bpf_tcp_ca_get_func_proto(enum bpf_func_id func_id,
                 */
                if (prog_ops_moff(prog) !=
                    offsetof(struct tcp_congestion_ops, release))
-                       return &bpf_sk_setsockopt_proto;
+                       return &bpf_sk_setsockopt_nodelay_proto;
                return NULL;
        case BPF_FUNC_getsockopt:
                /* Since get/setsockopt is usually expected to