--- /dev/null
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * BPF filter support for io_uring. Supports SQE opcodes for now.
+ */
+#include <linux/kernel.h>
+#include <linux/errno.h>
+#include <linux/io_uring.h>
+#include <linux/filter.h>
+#include <linux/bpf.h>
+#include <uapi/linux/io_uring.h>
+
+#include "io_uring.h"
+#include "bpf_filter.h"
+#include "net.h"
+
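+/*
+ * A single registered filter for one opcode. Filters registered for the same
+ * opcode are stacked in a singly linked list, with the most recently
+ * registered filter at the head and hence run first.
+ */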
+struct io_bpf_filter {
+ struct bpf_prog *prog;
+ struct io_bpf_filter *next;
+};
+
+/* Opcodes that have this installed as their filter are unconditionally denied */
+static const struct io_bpf_filter dummy_filter;
+
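+/*
+ * Fill in the context that the BPF filters operate on: the request opcode,
+ * the SQE flags, and the user_data of the request. Everything else in the
+ * context is cleared.
+ */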
+static void io_uring_populate_bpf_ctx(struct io_uring_bpf_ctx *bctx,
+ struct io_kiocb *req)
+{
+ bctx->opcode = req->opcode;
+ bctx->sqe_flags = (__force int) req->flags & SQE_VALID_FLAGS;
+ bctx->user_data = req->cqe.user_data;
+ /* clear residual fields, everything from pdu_size to the end of the struct */
+ memset((void *) bctx + offsetof(struct io_uring_bpf_ctx, pdu_size), 0,
+ sizeof(*bctx) - offsetof(struct io_uring_bpf_ctx, pdu_size));
+}
+
+/*
+ * Run the registered filters for a given opcode. A filter returning 0 denies
+ * execution of the request, a return of 1 allows it. If any filter for an
+ * opcode returns 0, filter processing stops and the request is denied.
+ *
+ * __io_uring_run_bpf_filters() returns 0 on success, allowing the request to
+ * run, and -EACCES if the request is denied.
+ */
+int __io_uring_run_bpf_filters(struct io_restriction *res, struct io_kiocb *req)
+{
+ struct io_bpf_filter *filter;
+ struct io_uring_bpf_ctx bpf_ctx;
+ int ret;
+
+ /* Fast check for existence of filters outside of RCU */
+ if (!rcu_access_pointer(res->bpf_filters->filters[req->opcode]))
+ return 0;
+
+ /*
+ * req->opcode has already been validated to be within the range
+ * of what we expect, io_init_req() does this.
+ */
+ guard(rcu)();
+ filter = rcu_dereference(res->bpf_filters->filters[req->opcode]);
+ if (!filter)
+ return 0;
+ if (filter == &dummy_filter)
+ return -EACCES;
+
+ io_uring_populate_bpf_ctx(&bpf_ctx, req);
+
+ /*
+ * Iterate the registered filters. The opcode is allowed iff every
+ * filter returns 1; if any filter returns 0, the request is denied.
+ */
+ do {
+ if (filter == &dummy_filter)
+ return -EACCES;
+ ret = bpf_prog_run(filter->prog, &bpf_ctx);
+ if (!ret)
+ return -EACCES;
+ filter = filter->next;
+ } while (filter);
+
+ return 0;
+}
+
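+/*
+ * RCU callback that frees a filter set once the final reference has been
+ * dropped and a grace period has passed. Walks the filter list of each
+ * opcode and destroys the attached BPF programs. The shared dummy filter is
+ * never freed.
+ */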
+static void io_free_bpf_filters(struct rcu_head *head)
+{
+ struct io_bpf_filter __rcu **filter;
+ struct io_bpf_filters *filters;
+ int i;
+
+ filters = container_of(head, struct io_bpf_filters, rcu_head);
+ scoped_guard(spinlock, &filters->lock) {
+ filter = filters->filters;
+ if (!filter)
+ return;
+ }
+
+ for (i = 0; i < IORING_OP_LAST; i++) {
+ struct io_bpf_filter *f;
+
+ rcu_read_lock();
+ f = rcu_dereference(filter[i]);
+ while (f) {
+ struct io_bpf_filter *next = f->next;
+
+ /*
+ * Even if filters are stacked, the dummy filter will always be
+ * last, as it can only be installed into an empty slot.
+ */
+ if (f == &dummy_filter)
+ break;
+ bpf_prog_destroy(f->prog);
+ kfree(f);
+ f = next;
+ }
+ rcu_read_unlock();
+ }
+ kfree(filters->filters);
+ kfree(filters);
+}
+
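+/*
+ * Drop a reference to a filter set. Freeing on the final reference is
+ * deferred to an RCU callback, as __io_uring_run_bpf_filters() looks up
+ * filters under RCU protection.
+ */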
+static void __io_put_bpf_filters(struct io_bpf_filters *filters)
+{
+ if (refcount_dec_and_test(&filters->refs))
+ call_rcu(&filters->rcu_head, io_free_bpf_filters);
+}
+
+void io_put_bpf_filters(struct io_restriction *res)
+{
+ if (res->bpf_filters)
+ __io_put_bpf_filters(res->bpf_filters);
+}
+
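+/*
+ * Allocate a new filter set with one filter slot per opcode, holding an
+ * initial reference for the owning io_restriction.
+ */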
+static struct io_bpf_filters *io_new_bpf_filters(void)
+{
+ struct io_bpf_filters *filters __free(kfree) = NULL;
+
+ filters = kzalloc(sizeof(*filters), GFP_KERNEL_ACCOUNT);
+ if (!filters)
+ return ERR_PTR(-ENOMEM);
+
+ filters->filters = kcalloc(IORING_OP_LAST,
+ sizeof(struct io_bpf_filter *),
+ GFP_KERNEL_ACCOUNT);
+ if (!filters->filters)
+ return ERR_PTR(-ENOMEM);
+
+ refcount_set(&filters->refs, 1);
+ spin_lock_init(&filters->lock);
+ return no_free_ptr(filters);
+}
+
+/*
+ * Validate classic BPF filter instructions. Only allow a safe subset of
+ * operations - no packet data access, just context field loads and basic
+ * ALU/jump operations.
+ */
+static int io_uring_check_cbpf_filter(struct sock_filter *filter,
+ unsigned int flen)
+{
+ int pc;
+
+ for (pc = 0; pc < flen; pc++) {
+ struct sock_filter *ftest = &filter[pc];
+ u16 code = ftest->code;
+ u32 k = ftest->k;
+
+ switch (code) {
+ case BPF_LD | BPF_W | BPF_ABS:
+ ftest->code = BPF_LDX | BPF_W | BPF_ABS;
+ /* 32-bit aligned and not out of bounds. */
+ if (k >= sizeof(struct io_uring_bpf_ctx) || k & 3)
+ return -EINVAL;
+ continue;
+ case BPF_LD | BPF_W | BPF_LEN:
+ ftest->code = BPF_LD | BPF_IMM;
+ ftest->k = sizeof(struct io_uring_bpf_ctx);
+ continue;
+ case BPF_LDX | BPF_W | BPF_LEN:
+ ftest->code = BPF_LDX | BPF_IMM;
+ ftest->k = sizeof(struct io_uring_bpf_ctx);
+ continue;
+ /* Explicitly include allowed calls. */
+ case BPF_RET | BPF_K:
+ case BPF_RET | BPF_A:
+ case BPF_ALU | BPF_ADD | BPF_K:
+ case BPF_ALU | BPF_ADD | BPF_X:
+ case BPF_ALU | BPF_SUB | BPF_K:
+ case BPF_ALU | BPF_SUB | BPF_X:
+ case BPF_ALU | BPF_MUL | BPF_K:
+ case BPF_ALU | BPF_MUL | BPF_X:
+ case BPF_ALU | BPF_DIV | BPF_K:
+ case BPF_ALU | BPF_DIV | BPF_X:
+ case BPF_ALU | BPF_AND | BPF_K:
+ case BPF_ALU | BPF_AND | BPF_X:
+ case BPF_ALU | BPF_OR | BPF_K:
+ case BPF_ALU | BPF_OR | BPF_X:
+ case BPF_ALU | BPF_XOR | BPF_K:
+ case BPF_ALU | BPF_XOR | BPF_X:
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_LSH | BPF_X:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_X:
+ case BPF_ALU | BPF_NEG:
+ case BPF_LD | BPF_IMM:
+ case BPF_LDX | BPF_IMM:
+ case BPF_MISC | BPF_TAX:
+ case BPF_MISC | BPF_TXA:
+ case BPF_LD | BPF_MEM:
+ case BPF_LDX | BPF_MEM:
+ case BPF_ST:
+ case BPF_STX:
+ case BPF_JMP | BPF_JA:
+ case BPF_JMP | BPF_JEQ | BPF_K:
+ case BPF_JMP | BPF_JEQ | BPF_X:
+ case BPF_JMP | BPF_JGE | BPF_K:
+ case BPF_JMP | BPF_JGE | BPF_X:
+ case BPF_JMP | BPF_JGT | BPF_K:
+ case BPF_JMP | BPF_JGT | BPF_X:
+ case BPF_JMP | BPF_JSET | BPF_K:
+ case BPF_JMP | BPF_JSET | BPF_X:
+ continue;
+ default:
+ return -EINVAL;
+ }
+ }
+ return 0;
+}
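+
+/*
+ * Illustrative example only: a userspace classic BPF filter that denies
+ * requests submitted with IOSQE_IO_HARDLINK set and allows everything else
+ * could look like the below, assuming the sqe_flags field is 32-bit aligned
+ * within struct io_uring_bpf_ctx and carries the IOSQE_* bits of the request:
+ *
+ *	struct sock_filter insns[] = {
+ *		BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
+ *			 offsetof(struct io_uring_bpf_ctx, sqe_flags)),
+ *		BPF_JUMP(BPF_JMP | BPF_JSET | BPF_K, IOSQE_IO_HARDLINK, 0, 1),
+ *		BPF_STMT(BPF_RET | BPF_K, 0),
+ *		BPF_STMT(BPF_RET | BPF_K, 1),
+ *	};
+ *
+ * The first return (0) denies the request, the second (1) allows it, and the
+ * program only uses instructions accepted by io_uring_check_cbpf_filter().
+ */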
+
+#define IO_URING_BPF_FILTER_FLAGS IO_URING_BPF_FILTER_DENY_REST
+
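+/*
+ * Register a classic BPF filter for a single opcode. The filter program is
+ * validated and converted via bpf_prog_create_from_user(), using
+ * io_uring_check_cbpf_filter() as the check callback, and is then inserted
+ * at the head of the opcode's filter list. If IO_URING_BPF_FILTER_DENY_REST
+ * is set, every opcode without a registered filter is denied outright.
+ */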
+int io_register_bpf_filter(struct io_restriction *res,
+ struct io_uring_bpf __user *arg)
+{
+ struct io_bpf_filter *filter, *old_filter;
+ struct io_bpf_filters *filters;
+ struct io_uring_bpf reg;
+ struct bpf_prog *prog;
+ struct sock_fprog fprog;
+ int ret;
+
+ if (copy_from_user(&reg, arg, sizeof(reg)))
+ return -EFAULT;
+ if (reg.cmd_type != IO_URING_BPF_CMD_FILTER)
+ return -EINVAL;
+ if (reg.cmd_flags || reg.resv)
+ return -EINVAL;
+
+ if (reg.filter.opcode >= IORING_OP_LAST)
+ return -EINVAL;
+ if (reg.filter.flags & ~IO_URING_BPF_FILTER_FLAGS)
+ return -EINVAL;
+ if (reg.filter.resv)
+ return -EINVAL;
+ if (!mem_is_zero(reg.filter.resv2, sizeof(reg.filter.resv2)))
+ return -EINVAL;
+ if (!reg.filter.filter_len || reg.filter.filter_len > BPF_MAXINSNS)
+ return -EINVAL;
+
+ fprog.len = reg.filter.filter_len;
+ fprog.filter = u64_to_user_ptr(reg.filter.filter_ptr);
+
+ ret = bpf_prog_create_from_user(&prog, &fprog,
+ io_uring_check_cbpf_filter, false);
+ if (ret)
+ return ret;
+
+ /*
+ * No existing filters, allocate set.
+ */
+ filters = res->bpf_filters;
+ if (!filters) {
+ filters = io_new_bpf_filters();
+ if (IS_ERR(filters)) {
+ ret = PTR_ERR(filters);
+ goto err_prog;
+ }
+ }
+
+ filter = kzalloc(sizeof(*filter), GFP_KERNEL_ACCOUNT);
+ if (!filter) {
+ ret = -ENOMEM;
+ goto err;
+ }
+ filter->prog = prog;
+ res->bpf_filters = filters;
+
+ /*
+ * Insert filter - if the opcode already has a filter attached, stack
+ * the new one in front of it.
+ */
+ rcu_read_lock();
+ spin_lock_bh(&filters->lock);
+ old_filter = rcu_dereference(filters->filters[reg.filter.opcode]);
+ if (old_filter)
+ filter->next = old_filter;
+ rcu_assign_pointer(filters->filters[reg.filter.opcode], filter);
+
+ /*
+ * If IO_URING_BPF_FILTER_DENY_REST is set, install the dummy filter
+ * for any opcode that doesn't have a filter registered. That causes
+ * those opcodes to be denied.
+ */
+ if (reg.filter.flags & IO_URING_BPF_FILTER_DENY_REST) {
+ for (int i = 0; i < IORING_OP_LAST; i++) {
+ if (i == reg.filter.opcode)
+ continue;
+ old_filter = rcu_dereference(filters->filters[i]);
+ if (old_filter)
+ continue;
+ rcu_assign_pointer(filters->filters[i], &dummy_filter);
+ }
+ }
+
+ spin_unlock_bh(&filters->lock);
+ rcu_read_unlock();
+ return 0;
+err:
+ if (filters != res->bpf_filters)
+ __io_put_bpf_filters(filters);
+err_prog:
+ bpf_prog_destroy(prog);
+ return ret;
+}