]> git.ipfire.org Git - thirdparty/linux.git/commitdiff
bpf: net_sched: Support implementation of Qdisc_ops in bpf
author Amery Hung <amery.hung@bytedance.com>
Wed, 9 Apr 2025 21:45:58 +0000 (14:45 -0700)
committer Martin KaFai Lau <martin.lau@kernel.org>
Thu, 17 Apr 2025 17:54:33 +0000 (10:54 -0700)
Recent advancements in bpf, such as allocated objects, bpf list and bpf
rbtree, have provided powerful and flexible building blocks to realize
sophisticated packet scheduling algorithms. As struct_ops now supports
core operators in Qdisc_ops, start allowing qdisc to be implemented using
bpf struct_ops with this patch. Users can implement Qdisc_ops.{enqueue,
dequeue, init, reset, destroy} in bpf and register the qdisc dynamically
into the kernel.

Co-developed-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Cong Wang <cong.wang@bytedance.com>
Signed-off-by: Amery Hung <amery.hung@bytedance.com>
Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
Acked-by: Toke Høiland-Jørgensen <toke@redhat.com>
Link: https://patch.msgid.link/20250409214606.2000194-3-ameryhung@gmail.com
net/sched/Kconfig
net/sched/Makefile
net/sched/bpf_qdisc.c [new file with mode: 0644]
net/sched/sch_api.c
net/sched/sch_generic.c

index 8180d0c12fceaf4a9fcdedc117fb131b8a1def44..ccd0255da5a54d128b15d4cc9fea2645b1d42223 100644 (file)
@@ -403,6 +403,18 @@ config NET_SCH_ETS
 
          If unsure, say N.
 
+config NET_SCH_BPF
+       bool "BPF-based Qdisc"
+       depends on BPF_SYSCALL && BPF_JIT && DEBUG_INFO_BTF
+       help
+         This option allows BPF-based queueing disciplines. With BPF struct_ops,
+         users can implement supported operators in Qdisc_ops using BPF programs.
+         The queue holding skb can be built with BPF maps or graphs.
+
+         Say Y here if you want to use BPF-based Qdisc.
+
+         If unsure, say N.
+
 menuconfig NET_SCH_DEFAULT
        bool "Allow override default queue discipline"
        help
index 82c3f78ca486ee700b1ce0127ddda40229212977..904d784902d1433f321a4b858f9639de4d4ea08d 100644 (file)
@@ -62,6 +62,7 @@ obj-$(CONFIG_NET_SCH_FQ_PIE)  += sch_fq_pie.o
 obj-$(CONFIG_NET_SCH_CBS)      += sch_cbs.o
 obj-$(CONFIG_NET_SCH_ETF)      += sch_etf.o
 obj-$(CONFIG_NET_SCH_TAPRIO)   += sch_taprio.o
+obj-$(CONFIG_NET_SCH_BPF)      += bpf_qdisc.o
 
 obj-$(CONFIG_NET_CLS_U32)      += cls_u32.o
 obj-$(CONFIG_NET_CLS_ROUTE4)   += cls_route.o
diff --git a/net/sched/bpf_qdisc.c b/net/sched/bpf_qdisc.c
new file mode 100644 (file)
index 0000000..7e5cb72
--- /dev/null
@@ -0,0 +1,216 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/types.h>
+#include <linux/bpf_verifier.h>
+#include <linux/bpf.h>
+#include <linux/btf.h>
+#include <linux/filter.h>
+#include <net/pkt_sched.h>
+#include <net/pkt_cls.h>
+
+static struct bpf_struct_ops bpf_Qdisc_ops; /* forward declaration; the initialized definition appears further down */
+
+struct bpf_sk_buff_ptr { /* wrapper around one sk_buff pointer; its BTF id is what bpf_qdisc_is_valid_access() assigns to enqueue's context argument 2 */
+       struct sk_buff *skb;
+};
+
+static int bpf_qdisc_init(struct btf *btf) /* installed as bpf_Qdisc_ops.init; @btf is unused */
+{
+       return 0; /* no setup is performed, always reports success */
+}
+
+BTF_ID_LIST_SINGLE(bpf_qdisc_ids, struct, Qdisc) /* bpf_qdisc_ids[0] is looked up as struct Qdisc in bpf_qdisc_btf_struct_access() */
+BTF_ID_LIST_SINGLE(bpf_sk_buff_ids, struct, sk_buff) /* bpf_sk_buff_ids[0] is looked up as struct sk_buff in bpf_qdisc_btf_struct_access() */
+BTF_ID_LIST_SINGLE(bpf_sk_buff_ptr_ids, struct, bpf_sk_buff_ptr) /* bpf_sk_buff_ptr_ids[0] is used as info->btf_id in bpf_qdisc_is_valid_access() */
+
+/*
+ * Context-access check for Qdisc_ops struct_ops programs.
+ *
+ * A BPF_READ of context argument index 2 by a program attached to
+ * Qdisc_ops.enqueue is accepted here and typed as a trusted BTF pointer to
+ * struct bpf_sk_buff_ptr. Every other access is handed to
+ * bpf_tracing_btf_ctx_access() and its verdict is returned unchanged.
+ */
+static bool bpf_qdisc_is_valid_access(int off, int size,
+                                     enum bpf_access_type type,
+                                     const struct bpf_prog *prog,
+                                     struct bpf_insn_access_aux *info)
+{
+       struct btf *attach_btf = prog->aux->attach_btf;
+       u32 arg_idx = btf_ctx_arg_idx(attach_btf, prog->aux->attach_func_proto, off);
+
+       /* Anything but "enqueue reads argument 2" takes the generic path. */
+       if (prog->aux->attach_st_ops_member_off != offsetof(struct Qdisc_ops, enqueue) ||
+           arg_idx != 2 || type != BPF_READ)
+               return bpf_tracing_btf_ctx_access(off, size, type, prog, info);
+
+       info->reg_type = PTR_TO_BTF_ID | PTR_TRUSTED;
+       info->btf = attach_btf;
+       info->btf_id = bpf_sk_buff_ptr_ids[0];
+       return true;
+}
+
+/*
+ * Map an access offset inside struct Qdisc to the end of the member it may
+ * touch.
+ *
+ * Accepted offsets: exactly the start of ->limit, exactly the start of
+ * ->q.qlen, or any offset inside ->qstats. On a match *end receives the
+ * offset one past the matched member and 0 is returned; any other offset
+ * yields -EACCES and leaves *end untouched. @log and @reg are unused.
+ */
+static int bpf_qdisc_qdisc_access(struct bpf_verifier_log *log,
+                                 const struct bpf_reg_state *reg,
+                                 int off, size_t *end)
+{
+       const int qlen_off = offsetof(struct Qdisc, q) +
+                            offsetof(struct qdisc_skb_head, qlen);
+       const int qstats_first = offsetof(struct Qdisc, qstats);
+       const int qstats_last = offsetofend(struct Qdisc, qstats) - 1;
+
+       if (off == (int)offsetof(struct Qdisc, limit)) {
+               *end = offsetofend(struct Qdisc, limit);
+               return 0;
+       }
+
+       if (off == qlen_off) {
+               *end = offsetof(struct Qdisc, q) + offsetofend(struct qdisc_skb_head, qlen);
+               return 0;
+       }
+
+       if (off >= qstats_first && off <= qstats_last) {
+               *end = offsetofend(struct Qdisc, qstats);
+               return 0;
+       }
+
+       return -EACCES;
+}
+
+/*
+ * Map an access offset inside struct sk_buff to the end of the member it may
+ * touch.
+ *
+ * Accepted offsets: exactly the start of ->tstamp, or any offset from
+ * data[0] through data[QDISC_CB_PRIV_LEN - 1] of the struct qdisc_skb_cb
+ * overlaid on ->cb. On a match *end receives the offset one past the matched
+ * region and 0 is returned; any other offset yields -EACCES and leaves *end
+ * untouched. @log and @reg are unused.
+ */
+static int bpf_qdisc_sk_buff_access(struct bpf_verifier_log *log,
+                                   const struct bpf_reg_state *reg,
+                                   int off, size_t *end)
+{
+       const int priv_first = offsetof(struct sk_buff, cb) +
+                              offsetof(struct qdisc_skb_cb, data[0]);
+       const int priv_last = offsetof(struct sk_buff, cb) +
+                             offsetof(struct qdisc_skb_cb,
+                                      data[QDISC_CB_PRIV_LEN - 1]);
+
+       if (off == (int)offsetof(struct sk_buff, tstamp)) {
+               *end = offsetofend(struct sk_buff, tstamp);
+               return 0;
+       }
+
+       if (off >= priv_first && off <= priv_last) {
+               *end = offsetof(struct sk_buff, cb) +
+                      offsetofend(struct qdisc_skb_cb, data[QDISC_CB_PRIV_LEN - 1]);
+               return 0;
+       }
+
+       return -EACCES;
+}
+
+/*
+ * btf_struct_access callback for Qdisc_ops programs.
+ *
+ * The register's BTF type is compared against struct sk_buff and struct
+ * Qdisc. For those two types the per-struct helper decides whether @off is
+ * an accepted offset and where the enclosing member ends; the access is then
+ * rejected if @off + @size runs past that end. Any other type is rejected
+ * outright.
+ *
+ * Returns 0 when the access is accepted, -EACCES (after logging the reason
+ * to @log) otherwise.
+ */
+static int bpf_qdisc_btf_struct_access(struct bpf_verifier_log *log,
+                                      const struct bpf_reg_state *reg,
+                                      int off, int size)
+{
+       const struct btf_type *sk_buff_type = btf_type_by_id(reg->btf, bpf_sk_buff_ids[0]);
+       const struct btf_type *qdisc_type = btf_type_by_id(reg->btf, bpf_qdisc_ids[0]);
+       const struct btf_type *target = btf_type_by_id(reg->btf, reg->btf_id);
+       size_t member_end;
+       int ret;
+
+       if (target == sk_buff_type) {
+               ret = bpf_qdisc_sk_buff_access(log, reg, off, &member_end);
+       } else if (target == qdisc_type) {
+               ret = bpf_qdisc_qdisc_access(log, reg, off, &member_end);
+       } else {
+               bpf_log(log, "only read is supported\n");
+               return -EACCES;
+       }
+
+       if (ret) {
+               bpf_log(log, "no write support to %s at off %d\n",
+                       btf_name_by_offset(reg->btf, target->name_off), off);
+               return -EACCES;
+       }
+
+       /* Accesses that stay inside the matched member are fine. */
+       if (off + size <= member_end)
+               return 0;
+
+       bpf_log(log,
+               "write access at off %d with size %d beyond the member of %s ended at %zu\n",
+               off, size, btf_name_by_offset(reg->btf, target->name_off), member_end);
+       return -EACCES;
+}
+
+static const struct bpf_verifier_ops bpf_qdisc_verifier_ops = { /* verifier callbacks, referenced by bpf_Qdisc_ops.verifier_ops */
+       .get_func_proto         = bpf_base_func_proto, /* helper lookup; defined outside this file */
+       .is_valid_access        = bpf_qdisc_is_valid_access, /* context argument access check */
+       .btf_struct_access      = bpf_qdisc_btf_struct_access, /* restricts accepted sk_buff / Qdisc offsets */
+};
+
+/*
+ * init_member callback: per-member setup of the kernel-side Qdisc_ops.
+ *
+ * @t/@member identify the struct member being initialized, @kdata is the
+ * kernel copy of the Qdisc_ops and @udata the user-supplied one.
+ *
+ *  - peek: always set to qdisc_peek_dequeued; returns 0.
+ *  - id:   copied from @udata with bpf_obj_name_cpy(); returns -EINVAL when
+ *          that call returns <= 0, otherwise 1.
+ *  - any other member: returns 0 without touching @kdata.
+ *
+ * NOTE(review): a positive return presumably tells the struct_ops core that
+ * the member has been fully handled here - confirm against the caller.
+ */
+static int bpf_qdisc_init_member(const struct btf_type *t,
+                                const struct btf_member *member,
+                                void *kdata, const void *udata)
+{
+       const struct Qdisc_ops *user_ops = udata;
+       struct Qdisc_ops *kern_ops = kdata;
+       u32 member_off = __btf_member_bit_offset(t, member) / 8;
+
+       if (member_off == offsetof(struct Qdisc_ops, peek)) {
+               kern_ops->peek = qdisc_peek_dequeued;
+               return 0;
+       }
+
+       if (member_off != offsetof(struct Qdisc_ops, id))
+               return 0;
+
+       if (bpf_obj_name_cpy(kern_ops->id, user_ops->id,
+                            sizeof(kern_ops->id)) <= 0)
+               return -EINVAL;
+
+       return 1;
+}
+
+static int bpf_qdisc_reg(void *kdata, struct bpf_link *link) /* installed as bpf_Qdisc_ops.reg; @link is unused */
+{
+       return register_qdisc(kdata); /* @kdata is passed through as-is; the result is returned unchanged */
+}
+
+/*
+ * struct_ops .unreg hook: hand the Qdisc_ops in @kdata to
+ * unregister_qdisc(). @link is unused.
+ *
+ * The call is a plain statement: ISO C does not allow "return <expr>;" in a
+ * function returning void, even when <expr> itself has type void.
+ */
+static void bpf_qdisc_unreg(void *kdata, struct bpf_link *link)
+{
+       unregister_qdisc(kdata);
+}
+
+static int Qdisc_ops__enqueue(struct sk_buff *skb__ref, struct Qdisc *sch,
+                             struct sk_buff **to_free) /* placeholder for .enqueue in __bpf_ops_qdisc_ops */
+{
+       return 0; /* fixed value; arguments are ignored */
+}
+
+static struct sk_buff *Qdisc_ops__dequeue(struct Qdisc *sch) /* placeholder for .dequeue */
+{
+       return NULL; /* fixed value; @sch is ignored */
+}
+
+static int Qdisc_ops__init(struct Qdisc *sch, struct nlattr *arg,
+                          struct netlink_ext_ack *extack) /* placeholder for .init */
+{
+       return 0; /* fixed value; arguments are ignored */
+}
+
+static void Qdisc_ops__reset(struct Qdisc *sch) /* placeholder for .reset; intentionally empty */
+{
+}
+
+static void Qdisc_ops__destroy(struct Qdisc *sch) /* placeholder for .destroy; intentionally empty */
+{
+}
+
+static struct Qdisc_ops __bpf_ops_qdisc_ops = { /* table of the placeholders above; referenced by bpf_Qdisc_ops.cfi_stubs */
+       .enqueue = Qdisc_ops__enqueue,
+       .dequeue = Qdisc_ops__dequeue,
+       .init = Qdisc_ops__init,
+       .reset = Qdisc_ops__reset,
+       .destroy = Qdisc_ops__destroy,
+};
+
+static struct bpf_struct_ops bpf_Qdisc_ops = { /* struct_ops descriptor passed to register_bpf_struct_ops() in bpf_qdisc_kfunc_init() */
+       .verifier_ops = &bpf_qdisc_verifier_ops, /* access checks defined in this file */
+       .reg = bpf_qdisc_reg, /* forwards to register_qdisc() */
+       .unreg = bpf_qdisc_unreg, /* forwards to unregister_qdisc() */
+       .init_member = bpf_qdisc_init_member, /* special-cases the peek and id members */
+       .init = bpf_qdisc_init, /* no-op, returns 0 */
+       .name = "Qdisc_ops",
+       .cfi_stubs = &__bpf_ops_qdisc_ops, /* placeholder implementations of the five supported operators */
+       .owner = THIS_MODULE,
+};
+
+static int __init bpf_qdisc_kfunc_init(void) /* init-time entry point for this file */
+{
+       return register_bpf_struct_ops(&bpf_Qdisc_ops, Qdisc_ops); /* registers the descriptor for struct Qdisc_ops; result is returned as-is */
+}
+late_initcall(bpf_qdisc_kfunc_init); /* hooks the function above in at the late initcall level */
index f74a097f54ae7636b3add43576eaf7be5859071b..db6330258dda967c53ecab175a556221551f3620 100644 (file)
@@ -25,6 +25,7 @@
 #include <linux/hrtimer.h>
 #include <linux/slab.h>
 #include <linux/hashtable.h>
+#include <linux/bpf.h>
 
 #include <net/netdev_lock.h>
 #include <net/net_namespace.h>
@@ -359,7 +360,7 @@ static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
                read_lock(&qdisc_mod_lock);
                for (q = qdisc_base; q; q = q->next) {
                        if (nla_strcmp(kind, q->id) == 0) {
-                               if (!try_module_get(q->owner))
+                               if (!bpf_try_module_get(q, q->owner))
                                        q = NULL;
                                break;
                        }
@@ -1370,7 +1371,7 @@ err_out3:
        netdev_put(dev, &sch->dev_tracker);
        qdisc_free(sch);
 err_out2:
-       module_put(ops->owner);
+       bpf_module_put(ops, ops->owner);
 err_out:
        *errp = err;
        return NULL;
@@ -1782,7 +1783,7 @@ static void request_qdisc_module(struct nlattr *kind)
 
        ops = qdisc_lookup_ops(kind);
        if (ops) {
-               module_put(ops->owner);
+               bpf_module_put(ops, ops->owner);
                return;
        }
 
index 14ab2f4c190a1e201dd1788b413a06e799a829f2..e6fda9f20272a11ff0e7aa1a5a678fc1b9207fce 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/if_vlan.h>
 #include <linux/skb_array.h>
 #include <linux/if_macvlan.h>
+#include <linux/bpf.h>
 #include <net/sch_generic.h>
 #include <net/pkt_sched.h>
 #include <net/dst.h>
@@ -1078,7 +1079,7 @@ static void __qdisc_destroy(struct Qdisc *qdisc)
                ops->destroy(qdisc);
 
        lockdep_unregister_key(&qdisc->root_lock_key);
-       module_put(ops->owner);
+       bpf_module_put(ops, ops->owner);
        netdev_put(dev, &qdisc->dev_tracker);
 
        trace_qdisc_destroy(qdisc);