bpf: Implement mprog API on top of existing cgroup progs

author Yonghong Song <yonghong.song@linux.dev>

Fri, 6 Jun 2025 16:31:41 +0000 (09:31 -0700)

committer Andrii Nakryiko <andrii@kernel.org>

Mon, 9 Jun 2025 23:28:28 +0000 (16:28 -0700)
author Yonghong Song <yonghong.song@linux.dev>
Fri, 6 Jun 2025 16:31:41 +0000 (09:31 -0700)
committer Andrii Nakryiko <andrii@kernel.org>
Mon, 9 Jun 2025 23:28:28 +0000 (16:28 -0700)
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h

index f1160ebbf526581b38ca5284169099518208915d..25e9cf92ffaf357ed88284b19f6d59be90b9a651 100644 (file)
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -1794,6 +1794,13 @@ union bpf_attr {
                                 };
                                 __u64           expected_revision;
                         } netkit;
+                       struct {
+                               union {
+                                       __u32   relative_fd;
+                                       __u32   relative_id;
+                               };
+                               __u64           expected_revision;
+                       } cgroup;
                 };
         } link_create;
  
diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c

index 9122c39870bff858a08f0d03207c6be33a49bf7d..ffbafbef5010d0e4712cffd3149142da764dc2a8 100644 (file)
--- a/kernel/bpf/cgroup.c
+++ b/kernel/bpf/cgroup.c
@@ -658,6 +658,116 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
         return NULL;
  }
  
+static struct bpf_link *bpf_get_anchor_link(u32 flags, u32 id_or_fd)
+{
+       struct bpf_link *link = ERR_PTR(-EINVAL);
+
+       if (flags & BPF_F_ID)
+               link = bpf_link_by_id(id_or_fd);
+       else if (id_or_fd)
+               link = bpf_link_get_from_fd(id_or_fd);
+       return link;
+}
+
+static struct bpf_prog *bpf_get_anchor_prog(u32 flags, u32 id_or_fd)
+{
+       struct bpf_prog *prog = ERR_PTR(-EINVAL);
+
+       if (flags & BPF_F_ID)
+               prog = bpf_prog_by_id(id_or_fd);
+       else if (id_or_fd)
+               prog = bpf_prog_get(id_or_fd);
+       return prog;
+}
+
+static struct bpf_prog_list *get_prog_list(struct hlist_head *progs, struct bpf_prog *prog,
+                                          struct bpf_cgroup_link *link, u32 flags, u32 id_or_fd)
+{
+       bool is_link = flags & BPF_F_LINK, is_id = flags & BPF_F_ID;
+       struct bpf_prog_list *pltmp, *pl = ERR_PTR(-EINVAL);
+       bool preorder = flags & BPF_F_PREORDER;
+       struct bpf_link *anchor_link = NULL;
+       struct bpf_prog *anchor_prog = NULL;
+       bool is_before, is_after;
+
+       is_before = flags & BPF_F_BEFORE;
+       is_after = flags & BPF_F_AFTER;
+       if (is_link || is_id || id_or_fd) {
+               /* flags must have either BPF_F_BEFORE or BPF_F_AFTER */
+               if (is_before == is_after)
+                       return ERR_PTR(-EINVAL);
+               if ((is_link && !link) || (!is_link && !prog))
+                       return ERR_PTR(-EINVAL);
+       } else if (!hlist_empty(progs)) {
+               /* flags cannot have both BPF_F_BEFORE and BPF_F_AFTER */
+               if (is_before && is_after)
+                       return ERR_PTR(-EINVAL);
+       }
+
+       if (is_link) {
+               anchor_link = bpf_get_anchor_link(flags, id_or_fd);
+               if (IS_ERR(anchor_link))
+                       return ERR_PTR(PTR_ERR(anchor_link));
+       } else if (is_id || id_or_fd) {
+               anchor_prog = bpf_get_anchor_prog(flags, id_or_fd);
+               if (IS_ERR(anchor_prog))
+                       return ERR_PTR(PTR_ERR(anchor_prog));
+       }
+
+       if (!anchor_prog && !anchor_link) {
+               /* if there is no anchor_prog/anchor_link, then BPF_F_PREORDER
+                * doesn't matter since either prepend or append to a combined
+                * list of progs will end up with correct result.
+                */
+               hlist_for_each_entry(pltmp, progs, node) {
+                       if (is_before)
+                               return pltmp;
+                       if (pltmp->node.next)
+                               continue;
+                       return pltmp;
+               }
+               return NULL;
+       }
+
+       hlist_for_each_entry(pltmp, progs, node) {
+               if ((anchor_prog && anchor_prog == pltmp->prog) ||
+                   (anchor_link && anchor_link == &pltmp->link->link)) {
+                       if (!!(pltmp->flags & BPF_F_PREORDER) != preorder)
+                               goto out;
+                       pl = pltmp;
+                       goto out;
+               }
+       }
+
+       pl = ERR_PTR(-ENOENT);
+out:
+       if (anchor_link)
+               bpf_link_put(anchor_link);
+       else
+               bpf_prog_put(anchor_prog);
+       return pl;
+}
+
+static int insert_pl_to_hlist(struct bpf_prog_list *pl, struct hlist_head *progs,
+                             struct bpf_prog *prog, struct bpf_cgroup_link *link,
+                             u32 flags, u32 id_or_fd)
+{
+       struct bpf_prog_list *pltmp;
+
+       pltmp = get_prog_list(progs, prog, link, flags, id_or_fd);
+       if (IS_ERR(pltmp))
+               return PTR_ERR(pltmp);
+
+       if (!pltmp)
+               hlist_add_head(&pl->node, progs);
+       else if (flags & BPF_F_BEFORE)
+               hlist_add_before(&pl->node, &pltmp->node);
+       else
+               hlist_add_behind(&pl->node, &pltmp->node);
+
+       return 0;
+}
+
  /**
   * __cgroup_bpf_attach() - Attach the program or the link to a cgroup, and
   *                         propagate the change to descendants
@@ -667,6 +777,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
   * @replace_prog: Previously attached program to replace if BPF_F_REPLACE is set
   * @type: Type of attach operation
   * @flags: Option flags
+ * @id_or_fd: Relative prog id or fd
+ * @revision: bpf_prog_list revision
   *
   * Exactly one of @prog or @link can be non-null.
   * Must be called with cgroup_mutex held.
@@ -674,7 +786,8 @@ static struct bpf_prog_list *find_attach_entry(struct hlist_head *progs,
  static int __cgroup_bpf_attach(struct cgroup *cgrp,
                                struct bpf_prog *prog, struct bpf_prog *replace_prog,
                                struct bpf_cgroup_link *link,
-                              enum bpf_attach_type type, u32 flags)
+                              enum bpf_attach_type type, u32 flags, u32 id_or_fd,
+                              u64 revision)
  {
         u32 saved_flags = (flags & (BPF_F_ALLOW_OVERRIDE | BPF_F_ALLOW_MULTI));
         struct bpf_prog *old_prog = NULL;
@@ -690,6 +803,9 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
             ((flags & BPF_F_REPLACE) && !(flags & BPF_F_ALLOW_MULTI)))
                 /* invalid combination */
                 return -EINVAL;
+       if ((flags & BPF_F_REPLACE) && (flags & (BPF_F_BEFORE | BPF_F_AFTER)))
+               /* only either replace or insertion with before/after */
+               return -EINVAL;
         if (link && (prog || replace_prog))
                 /* only either link or prog/replace_prog can be specified */
                 return -EINVAL;
@@ -700,6 +816,8 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
         atype = bpf_cgroup_atype_find(type, new_prog->aux->attach_btf_id);
         if (atype < 0)
                 return -EINVAL;
+       if (revision && revision != cgrp->bpf.revisions[atype])
+               return -ESTALE;
  
         progs = &cgrp->bpf.progs[atype];
  
@@ -728,22 +846,18 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
         if (pl) {
                 old_prog = pl->prog;
         } else {
-               struct hlist_node *last = NULL;
-
                 pl = kmalloc(sizeof(*pl), GFP_KERNEL);
                 if (!pl) {
                         bpf_cgroup_storages_free(new_storage);
                         return -ENOMEM;
                 }
-               if (hlist_empty(progs))
-                       hlist_add_head(&pl->node, progs);
-               else
-                       hlist_for_each(last, progs) {
-                               if (last->next)
-                                       continue;
-                               hlist_add_behind(&pl->node, last);
-                               break;
-                       }
+
+               err = insert_pl_to_hlist(pl, progs, prog, link, flags, id_or_fd);
+               if (err) {
+                       kfree(pl);
+                       bpf_cgroup_storages_free(new_storage);
+                       return err;
+               }
         }
  
         pl->prog = prog;
@@ -762,6 +876,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
         if (err)
                 goto cleanup_trampoline;
  
+       cgrp->bpf.revisions[atype] += 1;
         if (old_prog) {
                 if (type == BPF_LSM_CGROUP)
                         bpf_trampoline_unlink_cgroup_shim(old_prog);
@@ -793,12 +908,13 @@ static int cgroup_bpf_attach(struct cgroup *cgrp,
                              struct bpf_prog *prog, struct bpf_prog *replace_prog,
                              struct bpf_cgroup_link *link,
                              enum bpf_attach_type type,
-                            u32 flags)
+                            u32 flags, u32 id_or_fd, u64 revision)
  {
         int ret;
  
         cgroup_lock();
-       ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags);
+       ret = __cgroup_bpf_attach(cgrp, prog, replace_prog, link, type, flags,
+                                 id_or_fd, revision);
         cgroup_unlock();
         return ret;
  }
@@ -886,6 +1002,7 @@ static int __cgroup_bpf_replace(struct cgroup *cgrp,
         if (!found)
                 return -ENOENT;
  
+       cgrp->bpf.revisions[atype] += 1;
         old_prog = xchg(&link->link.prog, new_prog);
         replace_effective_prog(cgrp, atype, link);
         bpf_prog_put(old_prog);
@@ -1011,12 +1128,14 @@ found:
   * @prog: A program to detach or NULL
   * @link: A link to detach or NULL
   * @type: Type of detach operation
+ * @revision: bpf_prog_list revision
   *
   * At most one of @prog or @link can be non-NULL.
   * Must be called with cgroup_mutex held.
   */
  static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
-                              struct bpf_cgroup_link *link, enum bpf_attach_type type)
+                              struct bpf_cgroup_link *link, enum bpf_attach_type type,
+                              u64 revision)
  {
         enum cgroup_bpf_attach_type atype;
         struct bpf_prog *old_prog;
@@ -1034,6 +1153,9 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
         if (atype < 0)
                 return -EINVAL;
  
+       if (revision && revision != cgrp->bpf.revisions[atype])
+               return -ESTALE;
+
         progs = &cgrp->bpf.progs[atype];
         flags = cgrp->bpf.flags[atype];
  
@@ -1059,6 +1181,7 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
  
         /* now can actually delete it from this cgroup list */
         hlist_del(&pl->node);
+       cgrp->bpf.revisions[atype] += 1;
  
         kfree(pl);
         if (hlist_empty(progs))
@@ -1074,12 +1197,12 @@ static int __cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
  }
  
  static int cgroup_bpf_detach(struct cgroup *cgrp, struct bpf_prog *prog,
-                            enum bpf_attach_type type)
+                            enum bpf_attach_type type, u64 revision)
  {
         int ret;
  
         cgroup_lock();
-       ret = __cgroup_bpf_detach(cgrp, prog, NULL, type);
+       ret = __cgroup_bpf_detach(cgrp, prog, NULL, type, revision);
         cgroup_unlock();
         return ret;
  }
@@ -1097,6 +1220,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
         struct bpf_prog_array *effective;
         int cnt, ret = 0, i;
         int total_cnt = 0;
+       u64 revision = 0;
         u32 flags;
  
         if (effective_query && prog_attach_flags)
@@ -1134,6 +1258,10 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
                 return -EFAULT;
         if (copy_to_user(&uattr->query.prog_cnt, &total_cnt, sizeof(total_cnt)))
                 return -EFAULT;
+       if (!effective_query && from_atype == to_atype)
+               revision = cgrp->bpf.revisions[from_atype];
+       if (copy_to_user(&uattr->query.revision, &revision, sizeof(revision)))
+               return -EFAULT;
         if (attr->query.prog_cnt == 0 || !prog_ids || !total_cnt)
                 /* return early if user requested only program count + flags */
                 return 0;
@@ -1216,7 +1344,8 @@ int cgroup_bpf_prog_attach(const union bpf_attr *attr,
         }
  
         ret = cgroup_bpf_attach(cgrp, prog, replace_prog, NULL,
-                               attr->attach_type, attr->attach_flags);
+                               attr->attach_type, attr->attach_flags,
+                               attr->relative_fd, attr->expected_revision);
  
         if (replace_prog)
                 bpf_prog_put(replace_prog);
@@ -1238,7 +1367,7 @@ int cgroup_bpf_prog_detach(const union bpf_attr *attr, enum bpf_prog_type ptype)
         if (IS_ERR(prog))
                 prog = NULL;
  
-       ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type);
+       ret = cgroup_bpf_detach(cgrp, prog, attr->attach_type, attr->expected_revision);
         if (prog)
                 bpf_prog_put(prog);
  
@@ -1267,7 +1396,7 @@ static void bpf_cgroup_link_release(struct bpf_link *link)
         }
  
         WARN_ON(__cgroup_bpf_detach(cg_link->cgroup, NULL, cg_link,
-                                   cg_link->type));
+                                   cg_link->type, 0));
         if (cg_link->type == BPF_LSM_CGROUP)
                 bpf_trampoline_unlink_cgroup_shim(cg_link->link.prog);
  
@@ -1339,6 +1468,13 @@ static const struct bpf_link_ops bpf_cgroup_link_lops = {
         .fill_link_info = bpf_cgroup_link_fill_link_info,
  };
  
+#define BPF_F_LINK_ATTACH_MASK \
+       (BPF_F_ID |             \
+        BPF_F_BEFORE |         \
+        BPF_F_AFTER |          \
+        BPF_F_PREORDER |       \
+        BPF_F_LINK)
+
  int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
  {
         struct bpf_link_primer link_primer;
@@ -1346,7 +1482,7 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
         struct cgroup *cgrp;
         int err;
  
-       if (attr->link_create.flags)
+       if (attr->link_create.flags & (~BPF_F_LINK_ATTACH_MASK))
                 return -EINVAL;
  
         cgrp = cgroup_get_from_fd(attr->link_create.target_fd);
@@ -1370,7 +1506,9 @@ int cgroup_bpf_link_attach(const union bpf_attr *attr, struct bpf_prog *prog)
         }
  
         err = cgroup_bpf_attach(cgrp, NULL, NULL, link,
-                               link->type, BPF_F_ALLOW_MULTI);
+                               link->type, BPF_F_ALLOW_MULTI | attr->link_create.flags,
+                               attr->link_create.cgroup.relative_fd,
+                               attr->link_create.cgroup.expected_revision);
         if (err) {
                 bpf_link_cleanup(&link_primer);
                 goto out_put_cgroup;
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c

index 89d027cd7ca037b8719ed200cdaf9cbe86ae2f38..97ad57ffc4041d22dd9485e9c0efcbd9bf530a6d 100644 (file)
--- a/kernel/bpf/syscall.c
+++ b/kernel/bpf/syscall.c
@@ -4186,6 +4186,25 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
         }
  }
  
+static bool is_cgroup_prog_type(enum bpf_prog_type ptype, enum bpf_attach_type atype,
+                               bool check_atype)
+{
+       switch (ptype) {
+       case BPF_PROG_TYPE_CGROUP_DEVICE:
+       case BPF_PROG_TYPE_CGROUP_SKB:
+       case BPF_PROG_TYPE_CGROUP_SOCK:
+       case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+       case BPF_PROG_TYPE_CGROUP_SOCKOPT:
+       case BPF_PROG_TYPE_CGROUP_SYSCTL:
+       case BPF_PROG_TYPE_SOCK_OPS:
+               return true;
+       case BPF_PROG_TYPE_LSM:
+               return check_atype ? atype == BPF_LSM_CGROUP : true;
+       default:
+               return false;
+       }
+}
+
  #define BPF_PROG_ATTACH_LAST_FIELD expected_revision
  
  #define BPF_F_ATTACH_MASK_BASE \
@@ -4216,6 +4235,9 @@ static int bpf_prog_attach(const union bpf_attr *attr)
         if (bpf_mprog_supported(ptype)) {
                 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_MPROG)
                         return -EINVAL;
+       } else if (is_cgroup_prog_type(ptype, 0, false)) {
+               if (attr->attach_flags & ~(BPF_F_ATTACH_MASK_BASE | BPF_F_ATTACH_MASK_MPROG))
+                       return -EINVAL;
         } else {
                 if (attr->attach_flags & ~BPF_F_ATTACH_MASK_BASE)
                         return -EINVAL;
@@ -4233,6 +4255,11 @@ static int bpf_prog_attach(const union bpf_attr *attr)
                 return -EINVAL;
         }
  
+       if (is_cgroup_prog_type(ptype, prog->expected_attach_type, true)) {
+               ret = cgroup_bpf_prog_attach(attr, ptype, prog);
+               goto out;
+       }
+
         switch (ptype) {
         case BPF_PROG_TYPE_SK_SKB:
         case BPF_PROG_TYPE_SK_MSG:
@@ -4244,20 +4271,6 @@ static int bpf_prog_attach(const union bpf_attr *attr)
         case BPF_PROG_TYPE_FLOW_DISSECTOR:
                 ret = netns_bpf_prog_attach(attr, prog);
                 break;
-       case BPF_PROG_TYPE_CGROUP_DEVICE:
-       case BPF_PROG_TYPE_CGROUP_SKB:
-       case BPF_PROG_TYPE_CGROUP_SOCK:
-       case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
-       case BPF_PROG_TYPE_CGROUP_SOCKOPT:
-       case BPF_PROG_TYPE_CGROUP_SYSCTL:
-       case BPF_PROG_TYPE_SOCK_OPS:
-       case BPF_PROG_TYPE_LSM:
-               if (ptype == BPF_PROG_TYPE_LSM &&
-                   prog->expected_attach_type != BPF_LSM_CGROUP)
-                       ret = -EINVAL;
-               else
-                       ret = cgroup_bpf_prog_attach(attr, ptype, prog);
-               break;
         case BPF_PROG_TYPE_SCHED_CLS:
                 if (attr->attach_type == BPF_TCX_INGRESS ||
                     attr->attach_type == BPF_TCX_EGRESS)
@@ -4268,7 +4281,7 @@ static int bpf_prog_attach(const union bpf_attr *attr)
         default:
                 ret = -EINVAL;
         }
-
+out:
         if (ret)
                 bpf_prog_put(prog);
         return ret;
@@ -4296,6 +4309,9 @@ static int bpf_prog_detach(const union bpf_attr *attr)
                         if (IS_ERR(prog))
                                 return PTR_ERR(prog);
                 }
+       } else if (is_cgroup_prog_type(ptype, 0, false)) {
+               if (attr->attach_flags || attr->relative_fd)
+                       return -EINVAL;
         } else if (attr->attach_flags ||
                    attr->relative_fd ||
                    attr->expected_revision) {
diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h

index f1160ebbf526581b38ca5284169099518208915d..25e9cf92ffaf357ed88284b19f6d59be90b9a651 100644 (file)
--- a/tools/include/uapi/linux/bpf.h
+++ b/tools/include/uapi/linux/bpf.h
@@ -1794,6 +1794,13 @@ union bpf_attr {
                                 };
                                 __u64           expected_revision;
                         } netkit;
+                       struct {
+                               union {
+                                       __u32   relative_fd;
+                                       __u32   relative_id;
+                               };
+                               __u64           expected_revision;
+                       } cgroup;
                 };
         } link_create;
author	Yonghong Song <yonghong.song@linux.dev>
	Fri, 6 Jun 2025 16:31:41 +0000 (09:31 -0700)
committer	Andrii Nakryiko <andrii@kernel.org>
	Mon, 9 Jun 2025 23:28:28 +0000 (16:28 -0700)
include/uapi/linux/bpf.h		patch \| blob \| blame \| history
kernel/bpf/cgroup.c		patch \| blob \| blame \| history
kernel/bpf/syscall.c		patch \| blob \| blame \| history
tools/include/uapi/linux/bpf.h		patch \| blob \| blame \| history