]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
net/smc: bpf: Introduce generic hook for handshake flow
authorD. Wythe <alibuda@linux.alibaba.com>
Fri, 7 Nov 2025 03:56:31 +0000 (11:56 +0800)
committerMartin KaFai Lau <martin.lau@kernel.org>
Mon, 10 Nov 2025 19:19:41 +0000 (11:19 -0800)
The introduction of IPPROTO_SMC enables eBPF programs to determine
whether to use SMC based on the context of socket creation, such as
network namespaces, PID and comm name, etc.

As a subsequent enhancement, introduce a new generic hook that
allows deciding at runtime whether or not to use SMC, based on
criteria including, but not limited to, local/remote IP addresses or ports.

Users can now write their own implementation via bpf_struct_ops to choose
whether or not to use SMC before the TCP three-way handshake is completed.

Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
Link: https://patch.msgid.link/20251107035632.115950-3-alibuda@linux.alibaba.com
include/net/netns/smc.h
include/net/smc.h
net/ipv4/tcp_output.c
net/smc/Kconfig
net/smc/Makefile
net/smc/af_smc.c
net/smc/smc_hs_bpf.c [new file with mode: 0644]
net/smc/smc_hs_bpf.h [new file with mode: 0644]
net/smc/smc_sysctl.c

index 6ceb12baec241340d61d70cc447836e7e287903d..ed24c9f638eea2d7d9006c700692daec763fe09d 100644 (file)
@@ -17,6 +17,9 @@ struct netns_smc {
 #ifdef CONFIG_SYSCTL
        struct ctl_table_header         *smc_hdr;
 #endif
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+       struct smc_hs_ctrl __rcu        *hs_ctrl;
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
        unsigned int                    sysctl_autocorking_size;
        unsigned int                    sysctl_smcr_buf_type;
        int                             sysctl_smcr_testlink_time;
index 08bee529ed8d435c1c105c617bc0a6c35dae962b..bfdc4c41f0198b95a71b5d52ee4063c24a813dc5 100644 (file)
@@ -17,6 +17,8 @@
 #include <linux/wait.h>
 #include <linux/dibs.h>
 
+struct tcp_sock;
+struct inet_request_sock;
 struct sock;
 
 #define SMC_MAX_PNETID_LEN     16      /* Max. length of PNET id */
@@ -50,4 +52,55 @@ struct smcd_dev {
        u8 going_away : 1;
 };
 
+#define SMC_HS_CTRL_NAME_MAX 16
+
+enum {
+       /* ops can be inherited from init_net */
+       SMC_HS_CTRL_FLAG_INHERITABLE = 0x1,
+
+       SMC_HS_CTRL_ALL_FLAGS = SMC_HS_CTRL_FLAG_INHERITABLE,
+};
+
+struct smc_hs_ctrl {
+       /* private */
+
+       struct list_head list;
+       struct module *owner;
+
+       /* public */
+
+       /* unique name */
+       char name[SMC_HS_CTRL_NAME_MAX];
+       int flags;
+
+       /* Invoked before computing SMC option for SYN packets.
+        * We can control whether to set SMC options by returning different values.
+        * Return 0 to disable SMC, or return any other value to enable it.
+        */
+       int (*syn_option)(struct tcp_sock *tp);
+
+       /* Invoked before setting up SMC options for SYN-ACK packets.
+        * We can control whether to respond with SMC options by returning
+        * different values. Return 0 to disable SMC, or return any other value
+        * to enable it.
+        */
+       int (*synack_option)(const struct tcp_sock *tp,
+                            struct inet_request_sock *ireq);
+};
+
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+#define smc_call_hsbpf(init_val, tp, func, ...) ({                             \
+       typeof(init_val) __ret = (init_val);                                    \
+       struct smc_hs_ctrl *ctrl;                                               \
+       rcu_read_lock();                                                        \
+       ctrl = rcu_dereference(sock_net((struct sock *)(tp))->smc.hs_ctrl);     \
+       if (ctrl && ctrl->func)                                                 \
+               __ret = ctrl->func(tp, ##__VA_ARGS__);                          \
+       rcu_read_unlock();                                                      \
+       __ret;                                                                  \
+})
+#else
+#define smc_call_hsbpf(init_val, tp, ...)  ({ (void)(tp); (init_val); })
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
+
 #endif /* _SMC_H */
index 7f5df7a71f62954ef86cc917e1da5e1981e20aa0..479afb714bdf901cdf733c94cd7f22bd705c9d02 100644 (file)
@@ -40,6 +40,7 @@
 #include <net/tcp.h>
 #include <net/tcp_ecn.h>
 #include <net/mptcp.h>
+#include <net/smc.h>
 #include <net/proto_memory.h>
 #include <net/psp.h>
 
@@ -802,34 +803,36 @@ static void tcp_options_write(struct tcphdr *th, struct tcp_sock *tp,
        mptcp_options_write(th, ptr, tp, opts);
 }
 
-static void smc_set_option(const struct tcp_sock *tp,
+static void smc_set_option(struct tcp_sock *tp,
                           struct tcp_out_options *opts,
                           unsigned int *remaining)
 {
 #if IS_ENABLED(CONFIG_SMC)
-       if (static_branch_unlikely(&tcp_have_smc)) {
-               if (tp->syn_smc) {
-                       if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
-                               opts->options |= OPTION_SMC;
-                               *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
-                       }
+       if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc) {
+               tp->syn_smc = !!smc_call_hsbpf(1, tp, syn_option);
+               /* re-check syn_smc */
+               if (tp->syn_smc &&
+                   *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+                       opts->options |= OPTION_SMC;
+                       *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
                }
        }
 #endif
 }
 
 static void smc_set_option_cond(const struct tcp_sock *tp,
-                               const struct inet_request_sock *ireq,
+                               struct inet_request_sock *ireq,
                                struct tcp_out_options *opts,
                                unsigned int *remaining)
 {
 #if IS_ENABLED(CONFIG_SMC)
-       if (static_branch_unlikely(&tcp_have_smc)) {
-               if (tp->syn_smc && ireq->smc_ok) {
-                       if (*remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
-                               opts->options |= OPTION_SMC;
-                               *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
-                       }
+       if (static_branch_unlikely(&tcp_have_smc) && tp->syn_smc && ireq->smc_ok) {
+               ireq->smc_ok = !!smc_call_hsbpf(1, tp, synack_option, ireq);
+               /* re-check smc_ok */
+               if (ireq->smc_ok &&
+                   *remaining >= TCPOLEN_EXP_SMC_BASE_ALIGNED) {
+                       opts->options |= OPTION_SMC;
+                       *remaining -= TCPOLEN_EXP_SMC_BASE_ALIGNED;
                }
        }
 #endif
index 99ecd59d1f4b8a98d35a7156330645cb6dd39646..325addf83cc69f6cab39490aca002fc292b3cff0 100644 (file)
@@ -19,3 +19,13 @@ config SMC_DIAG
          smcss.
 
          if unsure, say Y.
+
+config SMC_HS_CTRL_BPF
+       bool "Generic eBPF hook for SMC handshake flow"
+       depends on SMC && BPF_SYSCALL
+       default y
+       help
+         SMC_HS_CTRL_BPF enables support for registering generic eBPF hooks for the SMC
+         handshake flow, which offers much greater flexibility in modifying the behavior
+         of the SMC protocol stack compared to a complete kernel-based approach. Select
+         this option if you want to filter the handshake process via eBPF programs.
\ No newline at end of file
index 0e754cbc38f9cb6e079f502b0b9733dc9a696750..5368634c5dd6d0fb3ef7714cf61e6be530f4a0fc 100644 (file)
@@ -6,3 +6,4 @@ smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o smc_llc.o
 smc-y += smc_cdc.o smc_tx.o smc_rx.o smc_close.o smc_ism.o smc_netlink.o smc_stats.o
 smc-y += smc_tracepoint.o smc_inet.o
 smc-$(CONFIG_SYSCTL) += smc_sysctl.o
+smc-$(CONFIG_SMC_HS_CTRL_BPF) += smc_hs_bpf.o
index 0ef3e16a8517a8753491915e13a1d8b0c2c25fc6..e388de8dca09701325b74fa25373dcefef9e6df8 100644 (file)
@@ -58,6 +58,7 @@
 #include "smc_tracepoint.h"
 #include "smc_sysctl.h"
 #include "smc_inet.h"
+#include "smc_hs_bpf.h"
 
 static DEFINE_MUTEX(smc_server_lgr_pending);   /* serialize link group
                                                 * creation on server
@@ -3600,8 +3601,16 @@ static int __init smc_init(void)
                pr_err("%s: smc_inet_init fails with %d\n", __func__, rc);
                goto out_ulp;
        }
+       rc = bpf_smc_hs_ctrl_init();
+       if (rc) {
+               pr_err("%s: bpf_smc_hs_ctrl_init fails with %d\n", __func__,
+                      rc);
+               goto out_inet;
+       }
        static_branch_enable(&tcp_have_smc);
        return 0;
+out_inet:
+       smc_inet_exit();
 out_ulp:
        tcp_unregister_ulp(&smc_ulp_ops);
 out_ib:
diff --git a/net/smc/smc_hs_bpf.c b/net/smc/smc_hs_bpf.c
new file mode 100644 (file)
index 0000000..063d23d
--- /dev/null
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Generic hook for SMC handshake flow.
+ *
+ *  Copyright IBM Corp. 2016
+ *  Copyright (c) 2025, Alibaba Inc.
+ *
+ *  Author: D. Wythe <alibuda@linux.alibaba.com>
+ */
+
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/rculist.h>
+
+#include "smc_hs_bpf.h"
+
+static DEFINE_SPINLOCK(smc_hs_ctrl_list_lock);
+static LIST_HEAD(smc_hs_ctrl_list);
+
+static int smc_hs_ctrl_reg(struct smc_hs_ctrl *ctrl)
+{
+       int ret = 0;
+
+       spin_lock(&smc_hs_ctrl_list_lock);
+       /* already exist or duplicate name */
+       if (smc_hs_ctrl_find_by_name(ctrl->name))
+               ret = -EEXIST;
+       else
+               list_add_tail_rcu(&ctrl->list, &smc_hs_ctrl_list);
+       spin_unlock(&smc_hs_ctrl_list_lock);
+       return ret;
+}
+
+static void smc_hs_ctrl_unreg(struct smc_hs_ctrl *ctrl)
+{
+       spin_lock(&smc_hs_ctrl_list_lock);
+       list_del_rcu(&ctrl->list);
+       spin_unlock(&smc_hs_ctrl_list_lock);
+
+       /* Wait for all readers to complete */
+       synchronize_rcu();
+}
+
+struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name)
+{
+       struct smc_hs_ctrl *ctrl;
+
+       list_for_each_entry_rcu(ctrl, &smc_hs_ctrl_list, list) {
+               if (strcmp(ctrl->name, name) == 0)
+                       return ctrl;
+       }
+       return NULL;
+}
+
+static int __smc_bpf_stub_set_tcp_option(struct tcp_sock *tp) { return 1; }
+static int __smc_bpf_stub_set_tcp_option_cond(const struct tcp_sock *tp,
+                                             struct inet_request_sock *ireq)
+{
+       return 1;
+}
+
+static struct smc_hs_ctrl __smc_bpf_hs_ctrl = {
+       .syn_option     = __smc_bpf_stub_set_tcp_option,
+       .synack_option  = __smc_bpf_stub_set_tcp_option_cond,
+};
+
+static int smc_bpf_hs_ctrl_init(struct btf *btf) { return 0; }
+
+static int smc_bpf_hs_ctrl_reg(void *kdata, struct bpf_link *link)
+{
+       if (link)
+               return -EOPNOTSUPP;
+
+       return smc_hs_ctrl_reg(kdata);
+}
+
+static void smc_bpf_hs_ctrl_unreg(void *kdata, struct bpf_link *link)
+{
+       smc_hs_ctrl_unreg(kdata);
+}
+
+static int smc_bpf_hs_ctrl_init_member(const struct btf_type *t,
+                                      const struct btf_member *member,
+                                      void *kdata, const void *udata)
+{
+       const struct smc_hs_ctrl *u_ctrl;
+       struct smc_hs_ctrl *k_ctrl;
+       u32 moff;
+
+       u_ctrl = (const struct smc_hs_ctrl *)udata;
+       k_ctrl = (struct smc_hs_ctrl *)kdata;
+
+       moff = __btf_member_bit_offset(t, member) / 8;
+       switch (moff) {
+       case offsetof(struct smc_hs_ctrl, name):
+               if (bpf_obj_name_cpy(k_ctrl->name, u_ctrl->name,
+                                    sizeof(u_ctrl->name)) <= 0)
+                       return -EINVAL;
+               return 1;
+       case offsetof(struct smc_hs_ctrl, flags):
+               if (u_ctrl->flags & ~SMC_HS_CTRL_ALL_FLAGS)
+                       return -EINVAL;
+               k_ctrl->flags = u_ctrl->flags;
+               return 1;
+       default:
+               break;
+       }
+
+       return 0;
+}
+
+static const struct bpf_func_proto *
+bpf_smc_hs_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
+{
+       return bpf_base_func_proto(func_id, prog);
+}
+
+static const struct bpf_verifier_ops smc_bpf_verifier_ops = {
+       .get_func_proto         = bpf_smc_hs_func_proto,
+       .is_valid_access        = bpf_tracing_btf_ctx_access,
+};
+
+static struct bpf_struct_ops bpf_smc_hs_ctrl_ops = {
+       .name           = "smc_hs_ctrl",
+       .init           = smc_bpf_hs_ctrl_init,
+       .reg            = smc_bpf_hs_ctrl_reg,
+       .unreg          = smc_bpf_hs_ctrl_unreg,
+       .cfi_stubs      = &__smc_bpf_hs_ctrl,
+       .verifier_ops   = &smc_bpf_verifier_ops,
+       .init_member    = smc_bpf_hs_ctrl_init_member,
+       .owner          = THIS_MODULE,
+};
+
+int bpf_smc_hs_ctrl_init(void)
+{
+       return register_bpf_struct_ops(&bpf_smc_hs_ctrl_ops, smc_hs_ctrl);
+}
diff --git a/net/smc/smc_hs_bpf.h b/net/smc/smc_hs_bpf.h
new file mode 100644 (file)
index 0000000..f5f1807
--- /dev/null
@@ -0,0 +1,31 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ *  Shared Memory Communications over RDMA (SMC-R) and RoCE
+ *
+ *  Generic hook for SMC handshake flow.
+ *
+ *  Copyright IBM Corp. 2016
+ *  Copyright (c) 2025, Alibaba Inc.
+ *
+ *  Author: D. Wythe <alibuda@linux.alibaba.com>
+ */
+
+#ifndef __SMC_HS_CTRL
+#define __SMC_HS_CTRL
+
+#include <net/smc.h>
+
+/* Find hs_ctrl by the target name, which is required to be a C string.
+ * Return NULL if no such ctrl was found; otherwise, return a valid ctrl.
+ *
+ * Note: Caller MUST ensure this is invoked under rcu_read_lock.
+ */
+struct smc_hs_ctrl *smc_hs_ctrl_find_by_name(const char *name);
+
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+int bpf_smc_hs_ctrl_init(void);
+#else
+static inline int bpf_smc_hs_ctrl_init(void) { return 0; }
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
+
+#endif /* __SMC_HS_CTRL */
index 7b2471904d0499221a944c1994a10d99fc9efe30..b1efed5462435b1a6f2f59584a4cf47f5f6e1981 100644 (file)
 
 #include <linux/init.h>
 #include <linux/sysctl.h>
+#include <linux/bpf.h>
 #include <net/net_namespace.h>
 
 #include "smc.h"
 #include "smc_core.h"
 #include "smc_llc.h"
 #include "smc_sysctl.h"
+#include "smc_hs_bpf.h"
 
 static int min_sndbuf = SMC_BUF_MIN_SIZE;
 static int min_rcvbuf = SMC_BUF_MIN_SIZE;
@@ -32,6 +34,69 @@ static int conns_per_lgr_max = SMC_CONN_PER_LGR_MAX;
 static unsigned int smcr_max_wr_min = 2;
 static unsigned int smcr_max_wr_max = 2048;
 
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+static int smc_net_replace_smc_hs_ctrl(struct net *net, const char *name)
+{
+       struct smc_hs_ctrl *ctrl = NULL;
+
+       rcu_read_lock();
+       /* a NULL or empty name asks to clear the current ctrl */
+       if (name && name[0]) {
+               ctrl = smc_hs_ctrl_find_by_name(name);
+               if (!ctrl) {
+                       rcu_read_unlock();
+                       return -EINVAL;
+               }
+               /* no change, just return */
+               if (ctrl == rcu_dereference(net->smc.hs_ctrl)) {
+                       rcu_read_unlock();
+                       return 0;
+               }
+               if (!bpf_try_module_get(ctrl, ctrl->owner)) {
+                       rcu_read_unlock();
+                       return -EBUSY;
+               }
+       }
+       /* xchg old ctrl with the new one atomically */
+       ctrl = unrcu_pointer(xchg(&net->smc.hs_ctrl, RCU_INITIALIZER(ctrl)));
+       /* release old ctrl */
+       if (ctrl)
+               bpf_module_put(ctrl, ctrl->owner);
+
+       rcu_read_unlock();
+       return 0;
+}
+
+static int proc_smc_hs_ctrl(const struct ctl_table *ctl, int write,
+                           void *buffer, size_t *lenp, loff_t *ppos)
+{
+       struct net *net = container_of(ctl->data, struct net, smc.hs_ctrl);
+       char val[SMC_HS_CTRL_NAME_MAX];
+       const struct ctl_table tbl = {
+               .data = val,
+               .maxlen = SMC_HS_CTRL_NAME_MAX,
+       };
+       struct smc_hs_ctrl *ctrl;
+       int ret;
+
+       rcu_read_lock();
+       ctrl = rcu_dereference(net->smc.hs_ctrl);
+       if (ctrl)
+               memcpy(val, ctrl->name, sizeof(ctrl->name));
+       else
+               val[0] = '\0';
+       rcu_read_unlock();
+
+       ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
+       if (ret)
+               return ret;
+
+       if (write)
+               ret = smc_net_replace_smc_hs_ctrl(net, val);
+       return ret;
+}
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
+
 static struct ctl_table smc_table[] = {
        {
                .procname       = "autocorking_size",
@@ -119,6 +184,15 @@ static struct ctl_table smc_table[] = {
                .extra1         = &smcr_max_wr_min,
                .extra2         = &smcr_max_wr_max,
        },
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+       {
+               .procname       = "hs_ctrl",
+               .data           = &init_net.smc.hs_ctrl,
+               .mode           = 0644,
+               .maxlen         = SMC_HS_CTRL_NAME_MAX,
+               .proc_handler   = proc_smc_hs_ctrl,
+       },
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
 };
 
 int __net_init smc_sysctl_net_init(struct net *net)
@@ -129,6 +203,16 @@ int __net_init smc_sysctl_net_init(struct net *net)
        table = smc_table;
        if (!net_eq(net, &init_net)) {
                int i;
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+               struct smc_hs_ctrl *ctrl;
+
+               rcu_read_lock();
+               ctrl = rcu_dereference(init_net.smc.hs_ctrl);
+               if (ctrl && ctrl->flags & SMC_HS_CTRL_FLAG_INHERITABLE &&
+                   bpf_try_module_get(ctrl, ctrl->owner))
+                       rcu_assign_pointer(net->smc.hs_ctrl, ctrl);
+               rcu_read_unlock();
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
 
                table = kmemdup(table, sizeof(smc_table), GFP_KERNEL);
                if (!table)
@@ -161,6 +245,9 @@ err_reg:
        if (!net_eq(net, &init_net))
                kfree(table);
 err_alloc:
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+       smc_net_replace_smc_hs_ctrl(net, NULL);
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
        return -ENOMEM;
 }
 
@@ -170,6 +257,10 @@ void __net_exit smc_sysctl_net_exit(struct net *net)
 
        table = net->smc.smc_hdr->ctl_table_arg;
        unregister_net_sysctl_table(net->smc.smc_hdr);
+#if IS_ENABLED(CONFIG_SMC_HS_CTRL_BPF)
+       smc_net_replace_smc_hs_ctrl(net, NULL);
+#endif /* CONFIG_SMC_HS_CTRL_BPF */
+
        if (!net_eq(net, &init_net))
                kfree(table);
 }