From acf7f253dedbca7ba5edada37d11221a47101ee3 Mon Sep 17 00:00:00 2001 From: Lennart Poettering Date: Fri, 16 Feb 2018 15:35:49 +0100 Subject: [PATCH] bpf: use BPF_F_ALLOW_MULTI flag if it is available This new kernel 4.15 flag permits that multiple BPF programs can be executed for each packet processed: multiple per cgroup plus all programs defined up the tree on all parent cgroups. We can use this for two features: 1. Finally provide per-slice IP accounting (which was previously unavailable) 2. Permit delegation of BPF programs to services (i.e. leaf nodes). This patch beefs up PID1's handling of BPF to enable both. Note two special items to keep in mind: a. Our inner-node BPF programs (i.e. the ones we attach to slices) do not enforce IP access lists, that's done exclsuively in the leaf-node BPF programs. That's a good thing, since that way rules in leaf nodes can cancel out rules further up (i.e. for example to implement a logic of "disallow everything except httpd.service"). Inner node BPF programs to accounting however if that's requested. This is beneficial for performance reasons: it means in order to provide per-slice IP accounting we don't have to add up all child unit's data. b. When this code is run on pre-4.15 kernel (i.e. where BPF_F_ALLOW_MULTI is not available) we'll make IP acocunting on slice units unavailable (i.e. revert to behaviour from before this commit). For leaf nodes we'll fallback to non-ALLOW_MULTI mode however, which means that BPF delegation is not available there at all, if IP fw/acct is turned on for the unit. This is a change from earlier behaviour, where we use the BPF_F_ALLOW_OVERRIDE flag, so that our fw/acct would lose its effect as soon as delegation was turned on and some client made use of that. I think the new behaviour is the safer choice in this case, as silent bypassing of our fw rules is not possible anymore. 
And if people want proper delegation then the way out is a more modern kernel or turning off IP firewalling/acct for the unit altogether.
That way the firewall in effect always maps to the actual @@ -515,13 +524,21 @@ int bpf_firewall_compile(Unit *u) { if (!cc) return -EINVAL; - r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd); - if (r < 0) - return log_error_errno(r, "Preparation of eBPF allow maps failed: %m"); + if (u->type != UNIT_SLICE) { + /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf + * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that + * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this + * means that all configure IP access rules *will* take effect on processes, even though we never + * compile them for inner nodes. */ - r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd); - if (r < 0) - return log_error_errno(r, "Preparation of eBPF deny maps failed: %m"); + r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd); + if (r < 0) + return log_error_errno(r, "Preparation of eBPF allow maps failed: %m"); + + r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd); + if (r < 0) + return log_error_errno(r, "Preparation of eBPF deny maps failed: %m"); + } r = bpf_firewall_prepare_accounting_maps(cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd); if (r < 0) @@ -541,7 +558,8 @@ int bpf_firewall_compile(Unit *u) { int bpf_firewall_install(Unit *u) { _cleanup_free_ char *path = NULL; CGroupContext *cc; - int r; + uint32_t flags; + int r, supported; assert(u); @@ -552,24 +570,31 @@ int bpf_firewall_install(Unit *u) { if (!cc) return -EINVAL; - r = bpf_firewall_supported(); - if (r < 0) - return r; - if (r == BPF_FIREWALL_UNSUPPORTED) { + supported = bpf_firewall_supported(); + if (supported < 0) + return supported; + if 
(supported == BPF_FIREWALL_UNSUPPORTED) { log_debug("BPF firewalling not supported on this manager, proceeding without."); return -EOPNOTSUPP; } + if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) { + log_debug("BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units."); + return -EOPNOTSUPP; + } r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path); if (r < 0) return log_error_errno(r, "Failed to determine cgroup path: %m"); + flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI && + (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0; + if (u->ip_bpf_egress) { r = bpf_program_load_kernel(u->ip_bpf_egress, NULL, 0); if (r < 0) return log_error_errno(r, "Kernel upload of egress BPF program failed: %m"); - r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, unit_cgroup_delegate(u) ? BPF_F_ALLOW_OVERRIDE : 0); + r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags); if (r < 0) return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path); } else { @@ -584,7 +609,7 @@ int bpf_firewall_install(Unit *u) { if (r < 0) return log_error_errno(r, "Kernel upload of ingress BPF program failed: %m"); - r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, unit_cgroup_delegate(u) ? 
BPF_F_ALLOW_OVERRIDE : 0); + r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags); if (r < 0) return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path); } else { diff --git a/src/core/cgroup.c b/src/core/cgroup.c index edb702ce48a..52431ec12fe 100644 --- a/src/core/cgroup.c +++ b/src/core/cgroup.c @@ -693,20 +693,14 @@ static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_ } static void cgroup_apply_firewall(Unit *u) { - int r; - assert(u); - if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is - * not recursive we don't ever touch the bpf on them */ - return; + /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */ - r = bpf_firewall_compile(u); - if (r < 0) + if (bpf_firewall_compile(u) < 0) return; (void) bpf_firewall_install(u); - return; } static void cgroup_context_apply( @@ -1227,11 +1221,6 @@ bool unit_get_needs_bpf(Unit *u) { Unit *p; assert(u); - /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the - * moment. */ - if (u->type == UNIT_SLICE) - return false; - c = unit_get_cgroup_context(u); if (!c) return false; @@ -2564,13 +2553,6 @@ int unit_get_ip_accounting( assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX); assert(ret); - /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are - * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero - * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup - * filters. */ - if (u->type == UNIT_SLICE) - return -ENODATA; - if (!UNIT_CGROUP_BOOL(u, ip_accounting)) return -ENODATA; -- 2.47.3