io_uring: add support for BPF filtering for opcode restrictions

author Jens Axboe <axboe@kernel.dk>

Thu, 15 Jan 2026 15:24:02 +0000 (08:24 -0700)

committer Jens Axboe <axboe@kernel.dk>

Tue, 27 Jan 2026 18:09:57 +0000 (11:09 -0700)
author Jens Axboe <axboe@kernel.dk>
Thu, 15 Jan 2026 15:24:02 +0000 (08:24 -0700)
committer Jens Axboe <axboe@kernel.dk>
Tue, 27 Jan 2026 18:09:57 +0000 (11:09 -0700)
diff --git a/include/linux/io_uring_types.h b/include/linux/io_uring_types.h

index dc6bd6940a0dd27536ff72874e3720e9b77c88d4..74bf983628768aaddbe0688082aeeae1fa330b8c 100644 (file)
--- a/include/linux/io_uring_types.h
+++ b/include/linux/io_uring_types.h
@@ -219,9 +219,18 @@ struct io_rings {
         struct io_uring_cqe     cqes[] ____cacheline_aligned_in_smp;
  };
  
+struct io_bpf_filter;
+struct io_bpf_filters {
+       refcount_t refs;        /* references held on this filter set */
+       spinlock_t lock;        /* serializes updates to ->filters[] */
+       struct io_bpf_filter __rcu **filters;   /* per-opcode filter chains */
+       struct rcu_head rcu_head;       /* for freeing via call_rcu() */
+};
+
  struct io_restriction {
         DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
         DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
+       struct io_bpf_filters *bpf_filters;
         u8 sqe_flags_allowed;
         u8 sqe_flags_required;
         /* IORING_OP_* restrictions exist */
diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h

index b5b23c0d5283419b77c5f3000defbc19903421bc..94669b77fee8c16729e2b64c6a98e3537073bcad 100644 (file)
--- a/include/uapi/linux/io_uring.h
+++ b/include/uapi/linux/io_uring.h
@@ -700,6 +700,9 @@ enum io_uring_register_op {
         /* auxiliary zcrx configuration, see enum zcrx_ctrl_op */
         IORING_REGISTER_ZCRX_CTRL               = 36,
  
+       /* register bpf filtering programs */
+       IORING_REGISTER_BPF_FILTER              = 37,
+
         /* this goes last */
         IORING_REGISTER_LAST,
  
diff --git a/include/uapi/linux/io_uring/bpf_filter.h b/include/uapi/linux/io_uring/bpf_filter.h

new file mode 100644 (file)

index 0000000..2d4d0e5
--- /dev/null
+++ b/include/uapi/linux/io_uring/bpf_filter.h
@@ -0,0 +1,50 @@
+/* SPDX-License-Identifier: (GPL-2.0 WITH Linux-syscall-note) OR MIT */
+/*
+ * Header file for the io_uring BPF filters.
+ */
+#ifndef LINUX_IO_URING_BPF_FILTER_H
+#define LINUX_IO_URING_BPF_FILTER_H
+
+#include <linux/types.h>
+
+/*
+ * Struct passed to filters, describing the request being checked.
+ *
+ * Classic BPF filters read it with BPF_LD | BPF_W | BPF_ABS loads, using a
+ * 4-byte aligned offset that lies inside the struct. pdu_size and pad are
+ * currently always passed to the filter as zero.
+ */
+struct io_uring_bpf_ctx {
+       __u64   user_data;      /* user_data of the request */
+       __u8    opcode;         /* opcode of the request */
+       __u8    sqe_flags;      /* valid SQE flags set on the request */
+       __u8    pdu_size;       /* size of aux data for filter */
+       __u8    pad[5];
+};
+
+enum {
+       /*
+        * If set, any currently unset opcode will have a deny filter attached
+        */
+       IO_URING_BPF_FILTER_DENY_REST   = 1,
+};
+
+struct io_uring_bpf_filter {
+       __u32   opcode;         /* io_uring opcode to filter */
+       __u32   flags;          /* IO_URING_BPF_FILTER_* flags */
+       __u32   filter_len;     /* number of BPF instructions, must be > 0 */
+       __u32   resv;           /* must be zero */
+       __u64   filter_ptr;     /* pointer to BPF filter */
+       __u64   resv2[5];       /* must be zero */
+};
+
+enum {
+       IO_URING_BPF_CMD_FILTER = 1,
+};
+
+struct io_uring_bpf {
+       __u16   cmd_type;       /* IO_URING_BPF_CMD_* values */
+       __u16   cmd_flags;      /* none so far, must be zero */
+       __u32   resv;           /* must be zero */
+       union {
+               struct io_uring_bpf_filter      filter;
+       };
+};
+
+#endif
diff --git a/io_uring/Kconfig b/io_uring/Kconfig

index 4b949c42c0bf93dd54b0a319f1d15a8fc80c2571..a7ae23cf103579f310e771ab9ff99f8fe0cce28a 100644 (file)
--- a/io_uring/Kconfig
+++ b/io_uring/Kconfig
@@ -9,3 +9,8 @@ config IO_URING_ZCRX
         depends on PAGE_POOL
         depends on INET
         depends on NET_RX_BUSY_POLL
+
+config IO_URING_BPF
+       def_bool y
+       depends on BPF
+       depends on NET
diff --git a/io_uring/Makefile b/io_uring/Makefile

index bf9eff88427ae3b173c60b4c267fc5614b5be1e0..931f9156132a0dea83ccc40968821a69e319418d 100644 (file)
--- a/io_uring/Makefile
+++ b/io_uring/Makefile
@@ -24,3 +24,4 @@ obj-$(CONFIG_NET_RX_BUSY_POLL)        += napi.o
  obj-$(CONFIG_NET) += net.o cmd_net.o
  obj-$(CONFIG_PROC_FS) += fdinfo.o
  obj-$(CONFIG_IO_URING_MOCK_FILE) += mock_file.o
+obj-$(CONFIG_IO_URING_BPF) += bpf_filter.o
diff --git a/io_uring/bpf_filter.c b/io_uring/bpf_filter.c

new file mode 100644 (file)

index 0000000..5207226
--- /dev/null
+++ b/io_uring/bpf_filter.c
@@ -0,0 +1,321 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BPF filter support for io_uring. Supports SQE opcodes for now.
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/io_uring.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "bpf_filter.h"
+#include "net.h"
+
+struct io_bpf_filter {
+       struct bpf_prog         *prog;  /* program run for this filter */
+       struct io_bpf_filter    *next;  /* next (older) filter, same opcode */
+};
+
+/*
+ * Deny if this is set as the filter. It is installed for opcodes that have
+ * no filter yet when IO_URING_BPF_FILTER_DENY_REST is used. It has no program
+ * attached and is never freed.
+ */
+static const struct io_bpf_filter dummy_filter;
+
+static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx,
+                                     struct io_kiocb *req)
+{
+       bctx->opcode = req->opcode;
+       bctx->sqe_flags = (__force int) req->flags & SQE_VALID_FLAGS;
+       bctx->user_data = req->cqe.user_data;
+       /*
+        * Clear residual, anything from pdu_size and below. The context is
+        * an on-stack variable in the caller, so this makes sure the filter
+        * never sees uninitialized bytes in pdu_size or the padding.
+        */
+       memset((void *) bctx + offsetof(struct io_uring_bpf_ctx, pdu_size), 0,
+               sizeof(*bctx) - offsetof(struct io_uring_bpf_ctx, pdu_size));
+}
+
+/*
+ * Run registered filters for a given opcode. For filters, a return of 0 denies
+ * execution of the request, a return of 1 allows it (any non-zero return is
+ * treated as allow). Filters for an opcode are run newest first. If any of
+ * them returns 0, or the deny-all dummy filter is reached, filter processing
+ * is stopped and the request is denied.
+ *
+ * @res must have ->bpf_filters set, io_uring_run_bpf_filters() checks that
+ * before calling here.
+ *
+ * __io_uring_run_bpf_filters() returns 0 on success, allow running the
+ * request, and -EACCES when a request is denied.
+ */
+int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req)
+{
+       struct io_bpf_filter *filter;
+       struct io_uring_bpf_ctx bpf_ctx;
+       int ret;
+
+       /* Fast check for existence of filters outside of RCU */
+       if (!rcu_access_pointer(res->bpf_filters->filters[req->opcode]))
+               return 0;
+
+       /*
+        * req->opcode has already been validated to be within the range
+        * of what we expect, io_init_req() does this. That is what makes
+        * indexing ->filters[] with it safe, both above and below.
+        */
+       guard(rcu)();
+       filter = rcu_dereference(res->bpf_filters->filters[req->opcode]);
+       if (!filter)
+               return 0;
+       else if (filter == &dummy_filter)
+               return -EACCES;
+
+       io_uring_populate_bpf_ctx(&bpf_ctx, req);
+
+       /*
+        * Iterate registered filters. The opcode is allowed IFF all filters
+        * return 1. If any filter returns denied, opcode will be denied.
+        * A dummy filter at the tail of the chain, which happens when a real
+        * filter was registered on top of a deny-filled slot, denies as well.
+        */
+       do {
+               if (filter == &dummy_filter)
+                       return -EACCES;
+               ret = bpf_prog_run(filter->prog, &bpf_ctx);
+               if (!ret)
+                       return -EACCES;
+               filter = filter->next;
+       } while (filter);
+
+       return 0;
+}
+
+static void io_free_bpf_filters(struct rcu_head *head)
+{
+       struct io_bpf_filter __rcu **filter;
+       struct io_bpf_filters *filters;
+       int i;
+
+       filters = container_of(head, struct io_bpf_filters, rcu_head);
+       scoped_guard(spinlock, &filters->lock) {
+               filter = filters->filters;
+               if (!filter)
+                       return; /* NOTE(review): skips kfree(filters) */
+       }
+
+       for (i = 0; i < IORING_OP_LAST; i++) {
+               struct io_bpf_filter *f;
+
+               rcu_read_lock();
+               f = rcu_dereference(filter[i]);
+               while (f) {
+                       struct io_bpf_filter *next = f->next;
+
+                       /*
+                        * Even if stacked, dummy filter will always be last
+                        * as it can only get installed into an empty spot.
+                        * It is a static object and must not be freed, so
+                        * stop walking this chain once it is reached.
+                        */
+                       if (f == &dummy_filter)
+                               break;
+                       bpf_prog_destroy(f->prog);
+                       kfree(f);
+                       f = next;
+               }
+               rcu_read_unlock();
+       }
+       kfree(filters->filters);
+       kfree(filters);
+}
+
+static void __io_put_bpf_filters(struct io_bpf_filters *filters)
+{
+       if (refcount_dec_and_test(&filters->refs))      /* last reference? */
+               call_rcu(&filters->rcu_head, io_free_bpf_filters);
+}
+
+void io_put_bpf_filters(struct io_restriction *res)
+{
+       if (res->bpf_filters)   /* NULL until a filter set is attached */
+               __io_put_bpf_filters(res->bpf_filters);
+}
+
+static struct io_bpf_filters *io_new_bpf_filters(void)
+{
+       struct io_bpf_filters *filters __free(kfree) = NULL;
+
+       filters = kzalloc(sizeof(*filters), GFP_KERNEL_ACCOUNT);
+       if (!filters)
+               return ERR_PTR(-ENOMEM);
+
+       filters->filters = kcalloc(IORING_OP_LAST,
+                                  sizeof(struct io_bpf_filter *),
+                                  GFP_KERNEL_ACCOUNT);
+       if (!filters->filters)
+               return ERR_PTR(-ENOMEM);        /* __free(kfree) drops filters */
+
+       refcount_set(&filters->refs, 1);        /* reference owned by caller */
+       spin_lock_init(&filters->lock);
+       return no_free_ptr(filters);    /* disarm the automatic kfree() */
+}
+
+/*
+ * Validate classic BPF filter instructions. Only allow a safe subset of
+ * operations - no packet data access, just context field loads and basic
+ * ALU/jump operations.
+ *
+ * Some accepted loads are rewritten in place: absolute word loads are
+ * re-encoded as BPF_LDX | BPF_W | BPF_ABS, and BPF_LEN loads become immediate
+ * loads of sizeof(struct io_uring_bpf_ctx).
+ *
+ * @filter: instructions to check, may be modified as described above
+ * @flen: number of instructions in @filter
+ *
+ * Returns 0 if every instruction is allowed, -EINVAL otherwise.
+ */
+static int io_uring_check_cbpf_filter(struct sock_filter *filter,
+                                     unsigned int flen)
+{
+       int pc;
+
+       for (pc = 0; pc < flen; pc++) {
+               struct sock_filter *ftest = &filter[pc];
+               u16 code = ftest->code;
+               u32 k = ftest->k;
+
+               switch (code) {
+               case BPF_LD | BPF_W | BPF_ABS:
+                       ftest->code = BPF_LDX | BPF_W | BPF_ABS;
+                       /* ctx offset: 32-bit aligned and not out of bounds. */
+                       if (k >= sizeof(struct io_uring_bpf_ctx) || k & 3)
+                               return -EINVAL;
+                       continue;
+               case BPF_LD | BPF_W | BPF_LEN:
+                       ftest->code = BPF_LD | BPF_IMM;
+                       ftest->k = sizeof(struct io_uring_bpf_ctx);
+                       continue;
+               case BPF_LDX | BPF_W | BPF_LEN:
+                       ftest->code = BPF_LDX | BPF_IMM;
+                       ftest->k = sizeof(struct io_uring_bpf_ctx);
+                       continue;
+               /* Explicitly list the other allowed instructions, taken as is. */
+               case BPF_RET | BPF_K:
+               case BPF_RET | BPF_A:
+               case BPF_ALU | BPF_ADD | BPF_K:
+               case BPF_ALU | BPF_ADD | BPF_X:
+               case BPF_ALU | BPF_SUB | BPF_K:
+               case BPF_ALU | BPF_SUB | BPF_X:
+               case BPF_ALU | BPF_MUL | BPF_K:
+               case BPF_ALU | BPF_MUL | BPF_X:
+               case BPF_ALU | BPF_DIV | BPF_K:
+               case BPF_ALU | BPF_DIV | BPF_X:
+               case BPF_ALU | BPF_AND | BPF_K:
+               case BPF_ALU | BPF_AND | BPF_X:
+               case BPF_ALU | BPF_OR | BPF_K:
+               case BPF_ALU | BPF_OR | BPF_X:
+               case BPF_ALU | BPF_XOR | BPF_K:
+               case BPF_ALU | BPF_XOR | BPF_X:
+               case BPF_ALU | BPF_LSH | BPF_K:
+               case BPF_ALU | BPF_LSH | BPF_X:
+               case BPF_ALU | BPF_RSH | BPF_K:
+               case BPF_ALU | BPF_RSH | BPF_X:
+               case BPF_ALU | BPF_NEG:
+               case BPF_LD | BPF_IMM:
+               case BPF_LDX | BPF_IMM:
+               case BPF_MISC | BPF_TAX:
+               case BPF_MISC | BPF_TXA:
+               case BPF_LD | BPF_MEM:
+               case BPF_LDX | BPF_MEM:
+               case BPF_ST:
+               case BPF_STX:
+               case BPF_JMP | BPF_JA:
+               case BPF_JMP | BPF_JEQ | BPF_K:
+               case BPF_JMP | BPF_JEQ | BPF_X:
+               case BPF_JMP | BPF_JGE | BPF_K:
+               case BPF_JMP | BPF_JGE | BPF_X:
+               case BPF_JMP | BPF_JGT | BPF_K:
+               case BPF_JMP | BPF_JGT | BPF_X:
+               case BPF_JMP | BPF_JSET | BPF_K:
+               case BPF_JMP | BPF_JSET | BPF_X:
+                       continue;
+               default:
+                       return -EINVAL;
+               }
+       }
+       return 0;
+}
+
+#define IO_URING_BPF_FILTER_FLAGS      IO_URING_BPF_FILTER_DENY_REST
+
+int io_register_bpf_filter(struct io_restriction *res,
+                          struct io_uring_bpf __user *arg)
+{
+       struct io_bpf_filter *filter, *old_filter;
+       struct io_bpf_filters *filters;
+       struct io_uring_bpf reg;
+       struct bpf_prog *prog;
+       struct sock_fprog fprog;
+       int ret;
+
+       if (copy_from_user(&reg, arg, sizeof(reg)))
+               return -EFAULT;
+       if (reg.cmd_type != IO_URING_BPF_CMD_FILTER)
+               return -EINVAL;
+       if (reg.cmd_flags || reg.resv)
+               return -EINVAL;
+
+       if (reg.filter.opcode >= IORING_OP_LAST)
+               return -EINVAL;
+       if (reg.filter.flags & ~IO_URING_BPF_FILTER_FLAGS)
+               return -EINVAL;
+       if (reg.filter.resv)
+               return -EINVAL;
+       if (!mem_is_zero(reg.filter.resv2, sizeof(reg.filter.resv2)))
+               return -EINVAL;
+       if (!reg.filter.filter_len || reg.filter.filter_len > BPF_MAXINSNS)
+               return -EINVAL;
+
+       fprog.len = reg.filter.filter_len;
+       fprog.filter = u64_to_user_ptr(reg.filter.filter_ptr);
+
+       ret = bpf_prog_create_from_user(&prog, &fprog,
+                                       io_uring_check_cbpf_filter, false);
+       if (ret)
+               return ret;
+
+       /*
+        * No existing filters, allocate set. The new set is only stored in
+        * res->bpf_filters once the filter allocation below has succeeded,
+        * which is how the error path tells a fresh set from an existing one.
+        */
+       filters = res->bpf_filters;
+       if (!filters) {
+               filters = io_new_bpf_filters();
+               if (IS_ERR(filters)) {
+                       ret = PTR_ERR(filters);
+                       goto err_prog;
+               }
+       }
+
+       filter = kzalloc(sizeof(*filter), GFP_KERNEL_ACCOUNT);
+       if (!filter) {
+               ret = -ENOMEM;
+               goto err;
+       }
+       filter->prog = prog;
+       res->bpf_filters = filters;
+
+       /*
+        * Insert filter - if the current opcode already has a filter
+        * attached, add to the set. The new filter becomes the head of the
+        * chain and links to the previous head, so it is also run first.
+        */
+       rcu_read_lock();
+       spin_lock_bh(&filters->lock);
+       old_filter = rcu_dereference(filters->filters[reg.filter.opcode]);
+       if (old_filter)
+               filter->next = old_filter;
+       rcu_assign_pointer(filters->filters[reg.filter.opcode], filter);
+
+       /*
+        * If IO_URING_BPF_FILTER_DENY_REST is set, fill any unregistered
+        * opcode with the dummy filter. That will cause them to be denied.
+        * Opcodes that already have a filter attached are left untouched.
+        */
+       if (reg.filter.flags & IO_URING_BPF_FILTER_DENY_REST) {
+               for (int i = 0; i < IORING_OP_LAST; i++) {
+                       if (i == reg.filter.opcode)
+                               continue;
+                       old_filter = rcu_dereference(filters->filters[i]);
+                       if (old_filter)
+                               continue;
+                       rcu_assign_pointer(filters->filters[i], &dummy_filter);
+               }
+       }
+
+       spin_unlock_bh(&filters->lock);
+       rcu_read_unlock();
+       return 0;
+err:
+       if (filters != res->bpf_filters)        /* set was allocated above */
+               __io_put_bpf_filters(filters);
+err_prog:
+       bpf_prog_destroy(prog);
+       return ret;
+}
diff --git a/io_uring/bpf_filter.h b/io_uring/bpf_filter.h

new file mode 100644 (file)

index 0000000..27eae97
--- /dev/null
+++ b/io_uring/bpf_filter.h
@@ -0,0 +1,42 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef IO_URING_BPF_FILTER_H
+#define IO_URING_BPF_FILTER_H
+
+#include <uapi/linux/io_uring/bpf_filter.h>
+
+#ifdef CONFIG_IO_URING_BPF
+
+int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req);
+
+int io_register_bpf_filter(struct io_restriction *res,
+                          struct io_uring_bpf __user *arg);
+
+void io_put_bpf_filters(struct io_restriction *res);
+
+static inline int io_uring_run_bpf_filters(struct io_restriction *res,
+                                          struct io_kiocb *req)
+{
+       if (res->bpf_filters)
+               return __io_uring_run_bpf_filters(res, req);
+
+       return 0;
+}
+
+#else
+
+static inline int io_register_bpf_filter(struct io_restriction *res,
+                                        struct io_uring_bpf __user *arg)
+{
+       return -EINVAL;
+}
+static inline int io_uring_run_bpf_filters(struct io_restriction *res,
+                                          struct io_kiocb *req)
+{
+       return 0;
+}
+static inline void io_put_bpf_filters(struct io_restriction *res)
+{
+}
+#endif /* CONFIG_IO_URING_BPF */
+
+#endif
diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c

index a50459238bee078020806c5206e523342781e7ce..9b9794dfc27abd0d2f30ab7222044e12709e04d0 100644 (file)
--- a/io_uring/io_uring.c
+++ b/io_uring/io_uring.c
@@ -94,6 +94,7 @@
  #include "alloc_cache.h"
  #include "eventfd.h"
  #include "wait.h"
+#include "bpf_filter.h"
  
  #define SQE_COMMON_FLAGS (IOSQE_FIXED_FILE | IOSQE_IO_LINK | \
                           IOSQE_IO_HARDLINK | IOSQE_ASYNC)
@@ -1874,6 +1875,12 @@ static inline int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req,
         if (unlikely(ret))
                 return io_submit_fail_init(sqe, req, ret);
  
+       if (unlikely(ctx->restrictions.bpf_filters)) {
+               ret = io_uring_run_bpf_filters(&ctx->restrictions, req);
+               if (ret)
+                       return io_submit_fail_init(sqe, req, ret);
+       }
+
         trace_io_uring_submit_req(req);
  
         /*
@@ -2161,6 +2168,7 @@ static __cold void io_ring_ctx_free(struct io_ring_ctx *ctx)
         percpu_ref_exit(&ctx->refs);
         free_uid(ctx->user);
         io_req_caches_free(ctx);
+       io_put_bpf_filters(&ctx->restrictions);
  
         WARN_ON_ONCE(ctx->nr_req_allocated);
  
diff --git a/io_uring/register.c b/io_uring/register.c

index 8551f13920dc70095bdf62664ef993bb4a8b8406..30957c2cb5ebdfe8354527000bed14d4da558a67 100644 (file)
--- a/io_uring/register.c
+++ b/io_uring/register.c
@@ -33,6 +33,7 @@
  #include "memmap.h"
  #include "zcrx.h"
  #include "query.h"
+#include "bpf_filter.h"
  
  #define IORING_MAX_RESTRICTIONS        (IORING_RESTRICTION_LAST + \
                                  IORING_REGISTER_LAST + IORING_OP_LAST)
@@ -830,6 +831,13 @@ static int __io_uring_register(struct io_ring_ctx *ctx, unsigned opcode,
         case IORING_REGISTER_ZCRX_CTRL:
                 ret = io_zcrx_ctrl(ctx, arg, nr_args);
                 break;
+       case IORING_REGISTER_BPF_FILTER:
+               ret = -EINVAL;
+
+               if (nr_args != 1)
+                       break;
+               ret = io_register_bpf_filter(&ctx->restrictions, arg);
+               break;
         default:
                 ret = -EINVAL;
                 break;
author	Jens Axboe <axboe@kernel.dk>
	Thu, 15 Jan 2026 15:24:02 +0000 (08:24 -0700)
committer	Jens Axboe <axboe@kernel.dk>
	Tue, 27 Jan 2026 18:09:57 +0000 (11:09 -0700)
include/linux/io_uring_types.h		patch \| blob \| blame \| history
include/uapi/linux/io_uring.h		patch \| blob \| blame \| history
include/uapi/linux/io_uring/bpf_filter.h	[new file with mode: 0644]	patch \| blob
io_uring/Kconfig		patch \| blob \| blame \| history
io_uring/Makefile		patch \| blob \| blame \| history
io_uring/bpf_filter.c	[new file with mode: 0644]	patch \| blob
io_uring/bpf_filter.h	[new file with mode: 0644]	patch \| blob
io_uring/io_uring.c		patch \| blob \| blame \| history
io_uring/register.c		patch \| blob \| blame \| history