io_uring: allow registration of per-task restrictions
author     Jens Axboe <axboe@kernel.dk>
           Thu, 8 Jan 2026 17:18:31 +0000 (10:18 -0700)
committer  Jens Axboe <axboe@kernel.dk>
           Fri, 6 Feb 2026 14:29:19 +0000 (07:29 -0700)
Currently io_uring supports restricting operations on a per-ring basis.
To use those, the ring must be set up in a disabled state by setting
IORING_SETUP_R_DISABLED. Restrictions can then be set for the ring,
after which the ring can be enabled.

This commit adds support for IORING_REGISTER_RESTRICTIONS with ring_fd
== -1, like the other "blind" register opcodes which work on the task
rather than on a specific ring. This allows registering the same kind
of restrictions as can be done for a specific ring, but with the task
itself. Once done, any ring the task creates will inherit these
restrictions.

If a restriction filter is registered with a task, it is inherited on
fork by its children. Children may only restrict operations further,
not extend them.

Inherited restrictions include both the classic
IORING_REGISTER_RESTRICTIONS based restrictions and the BPF filters
that have been registered with the task via
IORING_REGISTER_BPF_FILTER.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
include/linux/io_uring_types.h
include/uapi/linux/io_uring.h
io_uring/bpf_filter.c
io_uring/bpf_filter.h
io_uring/io_uring.c
io_uring/io_uring.h
io_uring/register.c
io_uring/tctx.c

index 7617df2472381fdee2c084d04c4ea9bcafc53a15..510d801b9a55e135d6a72b6fd891bfa0a526b7d0 100644 (file)
@@ -231,6 +231,8 @@ struct io_restriction {
        DECLARE_BITMAP(register_op, IORING_REGISTER_LAST);
        DECLARE_BITMAP(sqe_op, IORING_OP_LAST);
        struct io_bpf_filters *bpf_filters;
+       /* ->bpf_filters needs COW on modification */
+       bool bpf_filters_cow;
        u8 sqe_flags_allowed;
        u8 sqe_flags_required;
        /* IORING_OP_* restrictions exist */
index 94669b77fee8c16729e2b64c6a98e3537073bcad..aeeffcf27fee177ba78b1c81de300d0fc099fd73 100644 (file)
@@ -808,6 +808,13 @@ struct io_uring_restriction {
        __u32 resv2[3];
 };
 
+struct io_uring_task_restriction {
+       __u16 flags;
+       __u16 nr_res;
+       __u32 resv[3];
+       __DECLARE_FLEX_ARRAY(struct io_uring_restriction, restrictions);
+};
+
 struct io_uring_clock_register {
        __u32   clockid;
        __u32   __resv[3];
index b94944ab84429a34ff9ff5b911be2079a7dbdfcb..3816883a45ed0a490b4f7a92a380b759995c5e60 100644 (file)
@@ -249,13 +249,77 @@ static int io_uring_check_cbpf_filter(struct sock_filter *filter,
        return 0;
 }
 
+void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src)
+{
+       if (!src->bpf_filters)
+               return;
+
+       rcu_read_lock();
+       /*
+        * If the src filter is going away, just ignore it.
+        */
+       if (refcount_inc_not_zero(&src->bpf_filters->refs)) {
+               dst->bpf_filters = src->bpf_filters;
+               dst->bpf_filters_cow = true;
+       }
+       rcu_read_unlock();
+}
+
+/*
+ * Allocate a new struct io_bpf_filters. Used when a filter is cloned and
+ * modifications need to be made.
+ */
+static struct io_bpf_filters *io_bpf_filter_cow(struct io_restriction *src)
+{
+       struct io_bpf_filters *filters;
+       struct io_bpf_filter *srcf;
+       int i;
+
+       filters = io_new_bpf_filters();
+       if (IS_ERR(filters))
+               return filters;
+
+       /*
+        * Iterate filters from src and assign in destination. Grabbing
+        * a reference is enough, we don't need to duplicate the memory.
+        * This is safe because filters are only ever appended to the
+        * front of the list, hence the only memory ever touched inside
+        * a filter is the refcount.
+        */
+       rcu_read_lock();
+       for (i = 0; i < IORING_OP_LAST; i++) {
+               srcf = rcu_dereference(src->bpf_filters->filters[i]);
+               if (!srcf) {
+                       continue;
+               } else if (srcf == &dummy_filter) {
+                       rcu_assign_pointer(filters->filters[i], &dummy_filter);
+                       continue;
+               }
+
+               /*
+                * Getting a ref on the first node is enough, putting the
+                * filter and iterating nodes to free will stop on the first
+                * one that doesn't hit zero when dropping.
+                */
+               if (!refcount_inc_not_zero(&srcf->refs))
+                       goto err;
+               rcu_assign_pointer(filters->filters[i], srcf);
+       }
+       rcu_read_unlock();
+       return filters;
+err:
+       rcu_read_unlock();
+       __io_put_bpf_filters(filters);
+       return ERR_PTR(-EBUSY);
+}
+
 #define IO_URING_BPF_FILTER_FLAGS      IO_URING_BPF_FILTER_DENY_REST
 
 int io_register_bpf_filter(struct io_restriction *res,
                           struct io_uring_bpf __user *arg)
 {
+       struct io_bpf_filters *filters, *old_filters = NULL;
        struct io_bpf_filter *filter, *old_filter;
-       struct io_bpf_filters *filters;
        struct io_uring_bpf reg;
        struct bpf_prog *prog;
        struct sock_fprog fprog;
@@ -297,6 +361,17 @@ int io_register_bpf_filter(struct io_restriction *res,
                        ret = PTR_ERR(filters);
                        goto err_prog;
                }
+       } else if (res->bpf_filters_cow) {
+               filters = io_bpf_filter_cow(res);
+               if (IS_ERR(filters)) {
+                       ret = PTR_ERR(filters);
+                       goto err_prog;
+               }
+               /*
+                * Stash old filters, we'll put them once we know we'll
+                * succeed. Until then, res->bpf_filters is left untouched.
+                */
+               old_filters = res->bpf_filters;
        }
 
        filter = kzalloc(sizeof(*filter), GFP_KERNEL_ACCOUNT);
@@ -306,6 +381,15 @@ int io_register_bpf_filter(struct io_restriction *res,
        }
        refcount_set(&filter->refs, 1);
        filter->prog = prog;
+
+       /*
+        * Success - install the new filter set now. If we did COW, put
+        * the old filters as we're replacing them.
+        */
+       if (old_filters) {
+               __io_put_bpf_filters(old_filters);
+               res->bpf_filters_cow = false;
+       }
        res->bpf_filters = filters;
 
        /*
index 9f3cdb92eb16eec4caaf8b2fcb1938c76fbe167d..66a776cf25b426d101b9160ca7341fc362540ef5 100644 (file)
@@ -13,6 +13,8 @@ int io_register_bpf_filter(struct io_restriction *res,
 
 void io_put_bpf_filters(struct io_restriction *res);
 
+void io_bpf_filter_clone(struct io_restriction *dst, struct io_restriction *src);
+
 static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
                                           struct io_kiocb *req)
 {
@@ -37,6 +39,10 @@ static inline int io_uring_run_bpf_filters(struct io_bpf_filter __rcu **filters,
 static inline void io_put_bpf_filters(struct io_restriction *res)
 {
 }
+static inline void io_bpf_filter_clone(struct io_restriction *dst,
+                                      struct io_restriction *src)
+{
+}
 #endif /* CONFIG_IO_URING_BPF */
 
 #endif
index 049454278563bcf6deb943f0f8380478c5625279..e43c5283b23a9b7ff0767ad515e5d959aabc2ba8 100644 (file)
@@ -2880,6 +2880,32 @@ int io_prepare_config(struct io_ctx_config *config)
        return 0;
 }
 
+void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src)
+{
+       memcpy(&dst->register_op, &src->register_op, sizeof(dst->register_op));
+       memcpy(&dst->sqe_op, &src->sqe_op, sizeof(dst->sqe_op));
+       dst->sqe_flags_allowed = src->sqe_flags_allowed;
+       dst->sqe_flags_required = src->sqe_flags_required;
+       dst->op_registered = src->op_registered;
+       dst->reg_registered = src->reg_registered;
+
+       io_bpf_filter_clone(dst, src);
+}
+
+static void io_ctx_restriction_clone(struct io_ring_ctx *ctx,
+                                    struct io_restriction *src)
+{
+       struct io_restriction *dst = &ctx->restrictions;
+
+       io_restriction_clone(dst, src);
+       if (dst->bpf_filters)
+               WRITE_ONCE(ctx->bpf_filters, dst->bpf_filters->filters);
+       if (dst->op_registered)
+               ctx->op_restricted = 1;
+       if (dst->reg_registered)
+               ctx->reg_restricted = 1;
+}
+
 static __cold int io_uring_create(struct io_ctx_config *config)
 {
        struct io_uring_params *p = &config->p;
@@ -2940,6 +2966,13 @@ static __cold int io_uring_create(struct io_ctx_config *config)
        else
                ctx->notify_method = TWA_SIGNAL;
 
+       /*
+        * If the current task has restrictions enabled, then copy them to
+        * our newly created ring and mark it as registered.
+        */
+       if (current->io_uring_restrict)
+               io_ctx_restriction_clone(ctx, current->io_uring_restrict);
+
        /*
         * This is just grabbed for accounting purposes. When a process exits,
         * the mm is exited and dropped before the files, hence we need to hang
index 29b8f90fdabf7d249a3ad2d8d2ec8f77933240d4..a08d78c716f8a82f14aab0575a139c5ebf326271 100644 (file)
@@ -197,6 +197,7 @@ void io_task_refs_refill(struct io_uring_task *tctx);
 bool __io_alloc_req_refill(struct io_ring_ctx *ctx);
 
 void io_activate_pollwq(struct io_ring_ctx *ctx);
+void io_restriction_clone(struct io_restriction *dst, struct io_restriction *src);
 
 static inline void io_lockdep_assert_cq_locked(struct io_ring_ctx *ctx)
 {
index 40de9b8924b9cb0f2d9307c0f9dcd6908b443e1b..af4815bc11d66c490ec295295e7e04f04f6b7e2c 100644 (file)
@@ -190,6 +190,82 @@ static __cold int io_register_restrictions(struct io_ring_ctx *ctx,
        return 0;
 }
 
+static int io_register_restrictions_task(void __user *arg, unsigned int nr_args)
+{
+       struct io_uring_task_restriction __user *ures = arg;
+       struct io_uring_task_restriction tres;
+       struct io_restriction *res;
+       int ret;
+
+       /* Disallow if task already has registered restrictions */
+       if (current->io_uring_restrict)
+               return -EPERM;
+       /*
+        * Similar to seccomp, disallow installing a filter unless
+        * task_no_new_privs is set or we have CAP_SYS_ADMIN.
+        */
+       if (!task_no_new_privs(current) &&
+           !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
+               return -EACCES;
+       if (nr_args != 1)
+               return -EINVAL;
+
+       if (copy_from_user(&tres, arg, sizeof(tres)))
+               return -EFAULT;
+
+       if (tres.flags)
+               return -EINVAL;
+       if (!mem_is_zero(tres.resv, sizeof(tres.resv)))
+               return -EINVAL;
+
+       res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
+       if (!res)
+               return -ENOMEM;
+
+       ret = io_parse_restrictions(ures->restrictions, tres.nr_res, res);
+       if (ret < 0) {
+               kfree(res);
+               return ret;
+       }
+       current->io_uring_restrict = res;
+       return 0;
+}
+
+static int io_register_bpf_filter_task(void __user *arg, unsigned int nr_args)
+{
+       struct io_restriction *res;
+       int ret;
+
+       /*
+        * Similar to seccomp, disallow installing a filter unless
+        * task_no_new_privs is set or we have CAP_SYS_ADMIN.
+        */
+       if (!task_no_new_privs(current) &&
+           !ns_capable_noaudit(current_user_ns(), CAP_SYS_ADMIN))
+               return -EACCES;
+
+       if (nr_args != 1)
+               return -EINVAL;
+
+       /* If no task restrictions exist, setup a new set */
+       res = current->io_uring_restrict;
+       if (!res) {
+               res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
+               if (!res)
+                       return -ENOMEM;
+       }
+
+       ret = io_register_bpf_filter(res, arg);
+       if (ret) {
+               if (res != current->io_uring_restrict)
+                       kfree(res);
+               return ret;
+       }
+       if (!current->io_uring_restrict)
+               current->io_uring_restrict = res;
+       return 0;
+}
+
 static int io_register_enable_rings(struct io_ring_ctx *ctx)
 {
        if (!(ctx->flags & IORING_SETUP_R_DISABLED))
@@ -912,6 +988,10 @@ static int io_uring_register_blind(unsigned int opcode, void __user *arg,
                return io_uring_register_send_msg_ring(arg, nr_args);
        case IORING_REGISTER_QUERY:
                return io_query(arg, nr_args);
+       case IORING_REGISTER_RESTRICTIONS:
+               return io_register_restrictions_task(arg, nr_args);
+       case IORING_REGISTER_BPF_FILTER:
+               return io_register_bpf_filter_task(arg, nr_args);
        }
        return -EINVAL;
 }
index d4f7698805e4468e920f4f5950af089cf16fcbc2..e3da31fdf16f675c02da9e44c2e3cfea9debce88 100644 (file)
@@ -11,6 +11,7 @@
 
 #include "io_uring.h"
 #include "tctx.h"
+#include "bpf_filter.h"
 
 static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx,
                                        struct task_struct *task)
@@ -66,6 +67,11 @@ void __io_uring_free(struct task_struct *tsk)
                kfree(tctx);
                tsk->io_uring = NULL;
        }
+       if (tsk->io_uring_restrict) {
+               io_put_bpf_filters(tsk->io_uring_restrict);
+               kfree(tsk->io_uring_restrict);
+               tsk->io_uring_restrict = NULL;
+       }
 }
 
 __cold int io_uring_alloc_task_context(struct task_struct *task,
@@ -356,5 +362,16 @@ int io_ringfd_unregister(struct io_ring_ctx *ctx, void __user *__arg,
 
 int __io_uring_fork(struct task_struct *tsk)
 {
+       struct io_restriction *res, *src = tsk->io_uring_restrict;
+
+       /* Don't leave it dangling on error */
+       tsk->io_uring_restrict = NULL;
+
+       res = kzalloc(sizeof(*res), GFP_KERNEL_ACCOUNT);
+       if (!res)
+               return -ENOMEM;
+
+       tsk->io_uring_restrict = res;
+       io_restriction_clone(res, src);
        return 0;
 }