]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
bpf: Allow pre-ordering for bpf cgroup progs
authorYonghong Song <yonghong.song@linux.dev>
Mon, 24 Feb 2025 23:01:16 +0000 (15:01 -0800)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 4 Jun 2025 12:41:58 +0000 (14:41 +0200)
[ Upstream commit 4b82b181a26cff8bf7adc3a85a88d121d92edeaf ]

Currently for bpf progs in a cgroup hierarchy, the effective prog array
is computed from bottom cgroup to upper cgroups (post-ordering). For
example, the following cgroup hierarchy
    root cgroup: p1, p2
        subcgroup: p3, p4
have BPF_F_ALLOW_MULTI for both cgroup levels.
The effective cgroup array ordering looks like
    p3 p4 p1 p2
and at run time, progs will execute based on that order.

But in some cases, it is desirable to have root prog executes earlier than
children progs (pre-ordering). For example,
  - prog p1 intends to collect original pkt dest addresses.
  - prog p3 will modify original pkt dest addresses to a proxy address for
    security reason.
The end result is that prog p1 gets proxy address which is not what it
wants. Putting p1 to every child cgroup is not desirable either as it
will duplicate itself in many child cgroups. And this is exactly a use case
we are encountering in Meta.

To fix this issue, let us introduce a flag BPF_F_PREORDER. If the flag
is specified at attachment time, the prog has higher priority and the
ordering with that flag will be from top to bottom (pre-ordering).
For example, in the above example,
    root cgroup: p1, p2
        subcgroup: p3, p4
Let us say p2 and p4 are marked with BPF_F_PREORDER. The final
effective array ordering will be
    p2 p4 p3 p1

Suggested-by: Andrii Nakryiko <andrii@kernel.org>
Acked-by: Andrii Nakryiko <andrii@kernel.org>
Signed-off-by: Yonghong Song <yonghong.song@linux.dev>
Link: https://lore.kernel.org/r/20250224230116.283071-1-yonghong.song@linux.dev
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: Sasha Levin <sashal@kernel.org>
include/linux/bpf-cgroup.h
include/uapi/linux/bpf.h
kernel/bpf/cgroup.c
kernel/bpf/syscall.c
tools/include/uapi/linux/bpf.h

index d4f2c8706042cd2e079775a1fa643cfc7793bfba..2331cd8174fe3f6641d22d6edc666d73cf8c357f 100644 (file)
@@ -106,6 +106,7 @@ struct bpf_prog_list {
        struct bpf_prog *prog;
        struct bpf_cgroup_link *link;
        struct bpf_cgroup_storage *storage[MAX_BPF_CGROUP_STORAGE_TYPE];
+       u32 flags;
 };
 
 int cgroup_bpf_inherit(struct cgroup *cgrp);
index 431bc700bcfb93629c660043dcfcb05b7f7dd3c1..c7f904a72af2178343826f220b03610482bfa13f 100644 (file)
@@ -1140,6 +1140,7 @@ enum bpf_perf_event_type {
 #define BPF_F_BEFORE           (1U << 3)
 #define BPF_F_AFTER            (1U << 4)
 #define BPF_F_ID               (1U << 5)
+#define BPF_F_PREORDER         (1U << 6)
 #define BPF_F_LINK             BPF_F_LINK /* 1 << 13 */
 
 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the
index cf2eb0895d403c967d63236a0984f15a67ea8b4e..684fb450ad086f9f446daca29f3326f9b0a211a3 100644 (file)
@@ -369,7 +369,7 @@ static struct bpf_prog *prog_list_prog(struct bpf_prog_list *pl)
 /* count number of elements in the list.
  * it's slow but the list cannot be long
  */
-static u32 prog_list_length(struct hlist_head *head)
+static u32 prog_list_length(struct hlist_head *head, int *preorder_cnt)
 {
        struct bpf_prog_list *pl;
        u32 cnt = 0;
@@ -377,6 +377,8 @@ static u32 prog_list_length(struct hlist_head *head)
        hlist_for_each_entry(pl, head, node) {
                if (!prog_list_prog(pl))
                        continue;
+               if (preorder_cnt && (pl->flags & BPF_F_PREORDER))
+                       (*preorder_cnt)++;
                cnt++;
        }
        return cnt;
@@ -400,7 +402,7 @@ static bool hierarchy_allows_attach(struct cgroup *cgrp,
 
                if (flags & BPF_F_ALLOW_MULTI)
                        return true;
-               cnt = prog_list_length(&p->bpf.progs[atype]);
+               cnt = prog_list_length(&p->bpf.progs[atype], NULL);
                WARN_ON_ONCE(cnt > 1);
                if (cnt == 1)
                        return !!(flags & BPF_F_ALLOW_OVERRIDE);
@@ -423,12 +425,12 @@ static int compute_effective_progs(struct cgroup *cgrp,
        struct bpf_prog_array *progs;
        struct bpf_prog_list *pl;
        struct cgroup *p = cgrp;
-       int cnt = 0;
+       int i, j, cnt = 0, preorder_cnt = 0, fstart, bstart, init_bstart;
 
        /* count number of effective programs by walking parents */
        do {
                if (cnt == 0 || (p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
-                       cnt += prog_list_length(&p->bpf.progs[atype]);
+                       cnt += prog_list_length(&p->bpf.progs[atype], &preorder_cnt);
                p = cgroup_parent(p);
        } while (p);
 
@@ -439,20 +441,34 @@ static int compute_effective_progs(struct cgroup *cgrp,
        /* populate the array with effective progs */
        cnt = 0;
        p = cgrp;
+       fstart = preorder_cnt;
+       bstart = preorder_cnt - 1;
        do {
                if (cnt > 0 && !(p->bpf.flags[atype] & BPF_F_ALLOW_MULTI))
                        continue;
 
+               init_bstart = bstart;
                hlist_for_each_entry(pl, &p->bpf.progs[atype], node) {
                        if (!prog_list_prog(pl))
                                continue;
 
-                       item = &progs->items[cnt];
+                       if (pl->flags & BPF_F_PREORDER) {
+                               item = &progs->items[bstart];
+                               bstart--;
+                       } else {
+                               item = &progs->items[fstart];
+                               fstart++;
+                       }
                        item->prog = prog_list_prog(pl);
                        bpf_cgroup_storages_assign(item->cgroup_storage,
                                                   pl->storage);
                        cnt++;
                }
+
+               /* reverse pre-ordering progs at this cgroup level */
+               for (i = bstart + 1, j = init_bstart; i < j; i++, j--)
+                       swap(progs->items[i], progs->items[j]);
+
        } while ((p = cgroup_parent(p)));
 
        *array = progs;
@@ -663,7 +679,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
                 */
                return -EPERM;
 
-       if (prog_list_length(progs) >= BPF_CGROUP_MAX_PROGS)
+       if (prog_list_length(progs, NULL) >= BPF_CGROUP_MAX_PROGS)
                return -E2BIG;
 
        pl = find_attach_entry(progs, prog, link, replace_prog,
@@ -698,6 +714,7 @@ static int __cgroup_bpf_attach(struct cgroup *cgrp,
 
        pl->prog = prog;
        pl->link = link;
+       pl->flags = flags;
        bpf_cgroup_storages_assign(pl->storage, storage);
        cgrp->bpf.flags[atype] = saved_flags;
 
@@ -1073,7 +1090,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
                                                              lockdep_is_held(&cgroup_mutex));
                        total_cnt += bpf_prog_array_length(effective);
                } else {
-                       total_cnt += prog_list_length(&cgrp->bpf.progs[atype]);
+                       total_cnt += prog_list_length(&cgrp->bpf.progs[atype], NULL);
                }
        }
 
@@ -1105,7 +1122,7 @@ static int __cgroup_bpf_query(struct cgroup *cgrp, const union bpf_attr *attr,
                        u32 id;
 
                        progs = &cgrp->bpf.progs[atype];
-                       cnt = min_t(int, prog_list_length(progs), total_cnt);
+                       cnt = min_t(int, prog_list_length(progs, NULL), total_cnt);
                        i = 0;
                        hlist_for_each_entry(pl, progs, node) {
                                prog = prog_list_prog(pl);
index 5a8c5a4ef1134d7f49b59647b7d928556632eb02..b66349f892f25ea9096c8694c3390d0824e5ac3f 100644 (file)
@@ -3900,7 +3900,8 @@ static int bpf_prog_attach_check_attach_type(const struct bpf_prog *prog,
 #define BPF_F_ATTACH_MASK_BASE \
        (BPF_F_ALLOW_OVERRIDE | \
         BPF_F_ALLOW_MULTI |    \
-        BPF_F_REPLACE)
+        BPF_F_REPLACE |        \
+        BPF_F_PREORDER)
 
 #define BPF_F_ATTACH_MASK_MPROG        \
        (BPF_F_REPLACE |        \
index 977ec094bc2a6c75dd8d37d037ff89a39c8d3492..2a90f04a4160db9b6203b694c0f5ae1841636333 100644 (file)
@@ -1140,6 +1140,7 @@ enum bpf_perf_event_type {
 #define BPF_F_BEFORE           (1U << 3)
 #define BPF_F_AFTER            (1U << 4)
 #define BPF_F_ID               (1U << 5)
+#define BPF_F_PREORDER         (1U << 6)
 #define BPF_F_LINK             BPF_F_LINK /* 1 << 13 */
 
 /* If BPF_F_STRICT_ALIGNMENT is used in BPF_PROG_LOAD command, the