io_uring: allow registration of per-task restrictions
author     Jens Axboe <axboe@kernel.dk>
           Thu, 8 Jan 2026 17:18:31 +0000 (10:18 -0700)
committer  Jens Axboe <axboe@kernel.dk>
           Fri, 6 Feb 2026 14:29:19 +0000 (07:29 -0700)
Currently io_uring supports restricting operations on a per-ring basis.
To use those, the ring must be set up in a disabled state by setting
IORING_SETUP_R_DISABLED. Restrictions can then be set for the ring,
after which the ring can be enabled.

This commit adds support for IORING_REGISTER_RESTRICTIONS with ring_fd
== -1, like the other "blind" register opcodes which work on the task
rather than on a specific ring. This allows registering the same kind
of restrictions as can be done for a specific ring, but with the task
itself. Once done, any ring the task creates will inherit these
restrictions.

If a restriction filter is registered with a task, it is inherited on
fork by its children. Children may only restrict operations further,
not extend them.

Inherited restrictions include both the classic
IORING_REGISTER_RESTRICTIONS based restrictions and the BPF filters
that have been registered with the task via
IORING_REGISTER_BPF_FILTER.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
include/linux/io_uring_types.h
include/uapi/linux/io_uring.h
io_uring/bpf_filter.c
io_uring/bpf_filter.h
io_uring/io_uring.c
io_uring/io_uring.h
io_uring/register.c
io_uring/tctx.c

index 7617df2472381fdee2c084d04c4ea9bcafc53a15..510d801b9a55e135d6a72b6fd891bfa0a526b7d0 100644 (file)
@@ -231,6 +231,8 @@ struct io_restriction {
        DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
        DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
        struct io_bpf_filters *bpf_filters;
+       /* ->bpf_filters needs COW on modification */
+       bool bpf_filters_cow;
        u8 sqe_flags_allowed;
        u8 sqe_flags_required;
        /* IORING_OP_* restrictions exist */
index 94669b77fee8c16729e2b64c6a98e3537073bcad..aeeffcf27fee177ba78b1c81de300d0fc099fd73 100644 (file)
@@ -808,6 +808,13 @@ struct io_uring_restriction {
        __u32 resv2[3];
 };
 
+struct io_uring_task_restriction {
+       __u16 flags;
+       __u16 nr_res;
+       __u32 resv[3];
+       __DECLARE_FLEX_ARRAY(struct io_uring_restriction, restrictions);
+};
+
 struct io_uring_clock_register {
        __u32   clockid;
        __u32   __resv[3];
index b94944ab84429a34ff9ff5b911be2079a7dbdfcb..3816883a45ed0a490b4f7a92a380b759995c5e60 100644 (file)
@@ -249,13 +249,77 @@ static int io_uring_check_cbpf_filter(struct sock_filter *filter,
        return 0;
 }
 
+void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src)
+{
+       if (!src->bpf_filters)
+               return;
+
+       rcu_read_lock();
+       /*
+        * If the src filter is going away, just ignore it.
+        */
+       if (refcount_inc_not_zero(&src->bpf_filters->refs)) {
+               dst->bpf_filters = src->bpf_filters;
+               dst->bpf_filters_cow = true;
+       }
+       rcu_read_unlock();
+}
+
+/*
+ * Allocate a new struct io_bpf_filters. Used when a filter is cloned and
+ * modifications need to be made.
+ */
+static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src)
+{
+       struct io_bpf_filters *filters;
+       struct io_bpf_filter *srcf;
+       int i;
+
+       filters = io_new_bpf_filters();
+       if (IS_ERR(filters))
+               return filters;
+
+       /*
+        * Iterate filters from src and assign in destination. Grabbing
+        * a reference is enough, we don't need to duplicate the memory.
+        * This is safe because filters are only ever appended to the
+        * front of the list, hence the only memory ever touched inside
+        * a filter is the refcount.
+        */
+       rcu_read_lock();
+       for (i = 0; i < IORING_OP_LAST; i++) {
+               srcf = rcu_dereference(src->bpf_filters->filters[i]);
+               if (!srcf) {
+                       continue;
+               } else if (srcf == &dummy_filter) {
+                       rcu_assign_pointer(filters->filters[i], &dummy_filter);
+                       continue;
+               }
+
+               /*
+                * Getting a ref on the first node is enough, putting the
+                * filter and iterating nodes to free will stop on the first
+                * one that doesn't hit zero when dropping.
+                */
+               if (!refcount_inc_not_zero(&srcf->refs))
+                       goto err;
+               rcu_assign_pointer(filters->filters[i], srcf);
+       }
+       rcu_read_unlock();
+       return filters;
+err:
+       rcu_read_unlock();
+       __io_put_bpf_filters(filters);
+       return ERR_PTR(-EBUSY);
+}
+
 #define IO_URING_BPF_FILTER_FLAGS      IO_URING_BPF_FILTER_DENY_REST
 
 int io_register_bpf_filter(struct io_restriction *res,
                           struct io_uring_bpf __user *arg)
 {
+       struct io_bpf_filters *filters, *old_filters = NULL;
        struct io_bpf_filter *filter, *old_filter;
-       struct io_bpf_filters *filters;
        struct io_uring_bpf reg;
        struct bpf_prog *prog;
        struct sock_fprog fprog;
@@ -297,6 +361,17 @@ int io_register_bpf_filter(struct io_restriction *res,
                        ret = PTR_ERR(filters);
                        goto err_prog;
                }
+       } else if (res->bpf_filters_cow) {
+               filters = io_bpf_filter_cow(res);
+               if (IS_ERR(filters)) {
+                       ret = PTR_ERR(filters);
+                       goto err_prog;
+               }
+               /*
+                * Stash old filters, we'll put them once we know we'll
+                * succeed. Until then, res->bpf_filters is left untouched.
+                */
+               old_filters = res->bpf_filters;
        }
 
        filter = kzalloc(sizeof(*filter), GFP_KERNEL_ACCOUNT);
@@ -306,6 +381,15 @@ int io_register_bpf_filter(struct io_restriction *res,
        }
        refcount_set(&filter->refs, 1);
        filter->prog = prog;
+
+       /*
+        * Success - install the new filter set now. If we did COW, put
+        * the old filters as we're replacing them.
+        */
+       if (old_filters) {
+               __io_put_bpf_filters(old_filters);
+               res->bpf_filters_cow = false;
+       }
        res->bpf_filters = filters;
 
        /*
index 9f3cdb92eb16eec4caaf8b2fcb1938c76fbe167d..66a776cf25b426d101b9160ca7341fc362540ef5 100644 (file)
@@ -13,6 +13,8 @@ int io_register_bpf_filter(struct io_restriction *res,
 
 void io_put_bpf_filters(struct io_restriction *res);
 
+void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src);
+
 static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
                                           struct io_kiocb *req)
 {
@@ -37,6 +39,10 @@ static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
 static inline void io_put_bpf_filters(struct io_restriction *res)
 {
 }
+static inline void io_bpf_filter_clone(struct io_restriction *dst,
+                                      struct io_restriction *src)
+{
+}
 #endif /* CONFIG_IO_URING_BPF */
 
 #endif
index 049454278563bcf6deb943f0f8380478c5625279..e43c5283b23a9b7ff0767ad515e5d959aabc2ba8 100644 (file)
@@ -2880,6 +2880,32 @@ int io_prepare_config(struct io_ctx_config *config)
        return 0;
 }
 
+void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src)
+{
+       memcpy(&dst->register_op, &src->register_op, sizeof(dst->register_op));
+       memcpy(&dst->sqe_op, &src->sqe_op, sizeof(dst->sqe_op));
+       dst->sqe_flags_allowed = src->sqe_flags_allowed;
+       dst->sqe_flags_required = src->sqe_flags_required;
+       dst->op_registered = src->op_registered;
+       dst->reg_registered = src->reg_registered;
+
+       io_bpf_filter_clone(dst, src);
+}
+
+static void io_ctx_restriction_clone(struct io_ring_ctx *ctx,
+                                    struct io_restriction *src)
+{
+       struct io_restriction *dst = &ctx->restrictions;
+
+       io_restriction_clone(dst, src);
+       if (dst->bpf_filters)
+               WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters);
+       if (dst->op_registered)
+               ctx->op_restricted = 1;
+       if (dst->reg_registered)
+               ctx->reg_restricted = 1;
+}
+
 static __cold int io_uring_create(struct io_ctx_config *config)
 {
        struct io_uring_params *p = &config->p;
@@ -2940,6 +2966,13 @@ static __cold int io_uring_create(struct io_ctx_config *config)
        else
                ctx->notify_method = TWA_SIGNAL;
 
+       /*
+        * If the current task has restrictions enabled, then copy them to
+        * our newly created ring and mark it as registered.
+        */
+       if (current->io_uring_restrict)
+               io_ctx_restriction_clone(ctx, current->io_uring_restrict);
+
        /*
         * This is just grabbed for accounting purposes. When a process exits,
         * the mm is exited and dropped before the files, hence we need to hang
index 29b8f90fdabf7d249a3ad2d8d2ec8f77933240d4..a08d78c716f8a82f14aab0575a139c5ebf326271 100644 (file)
@@ -197,6 +197,7 @@ void io_task_refs_refill(struct io_uring_task *tctx);
 bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
 
 void io_activate_pollwq(struct io_ring_ctx *ctx);
+void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src);
 
 static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
 {
index 40de9b8924b9cb0f2d9307c0f9dcd6908b443e1b..af4815bc11d66c490ec295295e7e04f04f6b7e2c 100644 (file)
@@ -190,6 +190,82 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
        return 0;
 }
 
+static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
+{
+       struct io_uring_task_restriction __user *ures = arg;
+       struct io_uring_task_restriction tres;
+       struct io_restriction *res;
+       int ret;
+
+       /* Disallow if task already has registered restrictions */
+       if (current->io_uring_restrict)
+               return -EPERM;
+       /*
+        * Similar to seccomp, disallow installing a filter unless
+        * task_no_new_privs is set or we have CAP_SYS_ADMIN.
+        */
+       if (!task_no_new_privs(current) &&
+           !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
+               return -EACCES;
+       if (nr_args != 1)
+               return -EINVAL;
+
+       if (copy_from_user(&tres, arg, sizeof(tres)))
+               return -EFAULT;
+
+       if (tres.flags)
+               return -EINVAL;
+       if (!mem_is_zero(tres.resv, sizeof(tres.resv)))
+               return -EINVAL;
+
+       res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
+       if (!res)
+               return -ENOMEM;
+
+       ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res);
+       if (ret < 0) {
+               kfree(res);
+               return ret;
+       }
+       current->io_uring_restrict = res;
+       return 0;
+}
+
+static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
+{
+       struct io_restriction *res;
+       int ret;
+
+       /*
+        * Similar to seccomp, disallow installing a filter unless
+        * task_no_new_privs is set or we have CAP_SYS_ADMIN.
+        */
+       if (!task_no_new_privs(current) &&
+           !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
+               return -EACCES;
+
+       if (nr_args != 1)
+               return -EINVAL;
+
+       /* If no task restrictions exist, setup a new set */
+       res = current->io_uring_restrict;
+       if (!res) {
+               res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
+               if (!res)
+                       return -ENOMEM;
+       }
+
+       ret = io_register_bpf_filter(res, arg);
+       if (ret) {
+               if (res != current->io_uring_restrict)
+                       kfree(res);
+               return ret;
+       }
+       if (!current->io_uring_restrict)
+               current->io_uring_restrict = res;
+       return 0;
+}
+
 static int io_register_enable_rings(struct io_ring_ctx *ctx)
 {
        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
@@ -912,6 +988,10 @@ static int io_uring_register_blind(unsigned int opcode, void __user *arg,
                return io_uring_register_send_msg_ring(arg, nr_args);
        case IORING_REGISTER_QUERY:
                return io_query(arg, nr_args);
+       case IORING_REGISTER_RESTRICTIONS:
+               return io_register_restrictions_task(arg, nr_args);
+       case IORING_REGISTER_BPF_FILTER:
+               return io_register_bpf_filter_task(arg, nr_args);
        }
        return -EINVAL;
 }
index d4f7698805e4468e920f4f5950af089cf16fcbc2..e3da31fdf16f675c02da9e44c2e3cfea9debce88 100644 (file)
@@ -11,6 +11,7 @@
 
 #include "io_uring.h"
 #include "tctx.h"
+#include "bpf_filter.h"
 
 static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
                                        struct task_struct *task)
@@ -66,6 +67,11 @@ void __io_uring_free(struct task_struct *tsk)
                kfree(tctx);
                tsk->io_uring = NULL;
        }
+       if (tsk->io_uring_restrict) {
+               io_put_bpf_filters(tsk->io_uring_restrict);
+               kfree(tsk->io_uring_restrict);
+               tsk->io_uring_restrict = NULL;
+       }
 }
 
 __cold int io_uring_alloc_task_context(struct task_struct *task,
@@ -356,5 +362,16 @@ int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
 
 int __io_uring_fork(struct task_struct *tsk)
 {
+       struct io_restriction *res, *src = tsk->io_uring_restrict;
+
+       /* Don't leave it dangling on error */
+       tsk->io_uring_restrict = NULL;
+
+       res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
+       if (!res)
+               return -ENOMEM;
+
+       tsk->io_uring_restrict = res;
+       io_restriction_clone(res, src);
        return 0;
 }