bpf: use BPF_F_ALLOW_MULTI flag if it is available

author Lennart Poettering <lennart@poettering.net>

Fri, 16 Feb 2018 14:35:49 +0000 (15:35 +0100)

committer Lennart Poettering <lennart@poettering.net>

Wed, 21 Feb 2018 15:43:36 +0000 (16:43 +0100)
author Lennart Poettering <lennart@poettering.net>
Fri, 16 Feb 2018 14:35:49 +0000 (15:35 +0100)
committer Lennart Poettering <lennart@poettering.net>
Wed, 21 Feb 2018 15:43:36 +0000 (16:43 +0100)
diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c

index bbc876ba39fe14267e0b4140a6961dd4bf4e381d..4091183a99b786842655ba533de309f78c0c73b7 100644 (file)
--- a/src/core/bpf-firewall.c
+++ b/src/core/bpf-firewall.c
@@ -486,17 +486,26 @@ static int bpf_firewall_prepare_accounting_maps(bool enabled, int *fd_ingress, i
  
  int bpf_firewall_compile(Unit *u) {
          CGroupContext *cc;
-        int r;
+        int r, supported;
  
          assert(u);
  
-        r = bpf_firewall_supported();
-        if (r < 0)
-                return r;
-        if (r == BPF_FIREWALL_UNSUPPORTED) {
+        supported = bpf_firewall_supported();
+        if (supported < 0)
+                return supported;
+        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                  log_debug("BPF firewalling not supported on this manager, proceeding without.");
                  return -EOPNOTSUPP;
          }
+        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
+                /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
+                 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
+                 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
+                 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
+                 * all, either. */
+                log_debug("BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
+                return -EOPNOTSUPP;
+        }
  
          /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
           * but we reuse the the accounting maps. That way the firewall in effect always maps to the actual
@@ -515,13 +524,21 @@ int bpf_firewall_compile(Unit *u) {
          if (!cc)
                  return -EINVAL;
  
-        r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd);
-        if (r < 0)
-                return log_error_errno(r, "Preparation of eBPF allow maps failed: %m");
+        if (u->type != UNIT_SLICE) {
+                /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
+                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
+                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
+                 * means that all configured IP access rules *will* take effect on processes, even though we never
+                 * compile them for inner nodes. */
  
-        r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd);
-        if (r < 0)
-                return log_error_errno(r, "Preparation of eBPF deny maps failed: %m");
+                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd);
+                if (r < 0)
+                        return log_error_errno(r, "Preparation of eBPF allow maps failed: %m");
+
+                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd);
+                if (r < 0)
+                        return log_error_errno(r, "Preparation of eBPF deny maps failed: %m");
+        }
  
          r = bpf_firewall_prepare_accounting_maps(cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
          if (r < 0)
@@ -541,7 +558,8 @@ int bpf_firewall_compile(Unit *u) {
  int bpf_firewall_install(Unit *u) {
          _cleanup_free_ char *path = NULL;
          CGroupContext *cc;
-        int r;
+        uint32_t flags;
+        int r, supported;
  
          assert(u);
  
@@ -552,24 +570,31 @@ int bpf_firewall_install(Unit *u) {
          if (!cc)
                  return -EINVAL;
  
-        r = bpf_firewall_supported();
-        if (r < 0)
-                return r;
-        if (r == BPF_FIREWALL_UNSUPPORTED) {
+        supported = bpf_firewall_supported();
+        if (supported < 0)
+                return supported;
+        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                  log_debug("BPF firewalling not supported on this manager, proceeding without.");
                  return -EOPNOTSUPP;
          }
+        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
+                log_debug("BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
+                return -EOPNOTSUPP;
+        }
  
          r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
          if (r < 0)
                  return log_error_errno(r, "Failed to determine cgroup path: %m");
  
+        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
+                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;
+
          if (u->ip_bpf_egress) {
                  r = bpf_program_load_kernel(u->ip_bpf_egress, NULL, 0);
                  if (r < 0)
                          return log_error_errno(r, "Kernel upload of egress BPF program failed: %m");
  
-                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, unit_cgroup_delegate(u) ? BPF_F_ALLOW_OVERRIDE : 0);
+                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
                  if (r < 0)
                          return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path);
          } else {
@@ -584,7 +609,7 @@ int bpf_firewall_install(Unit *u) {
                  if (r < 0)
                          return log_error_errno(r, "Kernel upload of ingress BPF program failed: %m");
  
-                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, unit_cgroup_delegate(u) ? BPF_F_ALLOW_OVERRIDE : 0);
+                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
                  if (r < 0)
                          return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path);
          } else {
diff --git a/src/core/cgroup.c b/src/core/cgroup.c

index edb702ce48a97701bb3b58a3236e32a75042c541..52431ec12fe4d69757ec9c8f10bdda84ce8d784f 100644 (file)
--- a/src/core/cgroup.c
+++ b/src/core/cgroup.c
@@ -693,20 +693,14 @@ static void cgroup_apply_unified_memory_limit(Unit *u, const char *file, uint64_
  }
  
  static void cgroup_apply_firewall(Unit *u) {
-        int r;
-
          assert(u);
  
-        if (u->type == UNIT_SLICE) /* Skip this for slice units, they are inner cgroup nodes, and since bpf/cgroup is
-                                    * not recursive we don't ever touch the bpf on them */
-                return;
+        /* Best-effort: let's apply IP firewalling and/or accounting if that's enabled */
  
-        r = bpf_firewall_compile(u);
-        if (r < 0)
+        if (bpf_firewall_compile(u) < 0)
                  return;
  
          (void) bpf_firewall_install(u);
-        return;
  }
  
  static void cgroup_context_apply(
@@ -1227,11 +1221,6 @@ bool unit_get_needs_bpf(Unit *u) {
          Unit *p;
          assert(u);
  
-        /* We never attach BPF to slice units, as they are inner cgroup nodes and cgroup/BPF is not recursive at the
-         * moment. */
-        if (u->type == UNIT_SLICE)
-                return false;
-
          c = unit_get_cgroup_context(u);
          if (!c)
                  return false;
@@ -2564,13 +2553,6 @@ int unit_get_ip_accounting(
          assert(metric < _CGROUP_IP_ACCOUNTING_METRIC_MAX);
          assert(ret);
  
-        /* IP accounting is currently not recursive, and hence we refuse to return any data for slice nodes. Slices are
-         * inner cgroup nodes and hence have no processes directly attached, hence their counters would be zero
-         * anyway. And if we block this now we can later open this up, if the kernel learns recursive BPF cgroup
-         * filters. */
-        if (u->type == UNIT_SLICE)
-                return -ENODATA;
-
          if (!UNIT_CGROUP_BOOL(u, ip_accounting))
                  return -ENODATA;
author	Lennart Poettering <lennart@poettering.net>
	Fri, 16 Feb 2018 14:35:49 +0000 (15:35 +0100)
committer	Lennart Poettering <lennart@poettering.net>
	Wed, 21 Feb 2018 15:43:36 +0000 (16:43 +0100)
src/core/bpf-firewall.c		patch \| blob \| blame \| history
src/core/cgroup.c		patch \| blob \| blame \| history