bpf: rework how we keep track and attach cgroup bpf programs

author Lennart Poettering <lennart@poettering.net>

Tue, 20 Feb 2018 18:28:24 +0000 (19:28 +0100)

committer Lennart Poettering <lennart@poettering.net>

Wed, 21 Feb 2018 15:43:36 +0000 (16:43 +0100)
author Lennart Poettering <lennart@poettering.net>
Tue, 20 Feb 2018 18:28:24 +0000 (19:28 +0100)
committer Lennart Poettering <lennart@poettering.net>
Wed, 21 Feb 2018 15:43:36 +0000 (16:43 +0100)
diff --git a/src/basic/bpf-program.c b/src/basic/bpf-program.c

index 4c128c6d6b915912ebb282ccfcd6013ccceb5e4e..a244742f917658d9952aa15ed7c17440f82116df 100644 (file)
--- a/src/basic/bpf-program.c
+++ b/src/basic/bpf-program.c
@@ -28,6 +28,7 @@
  #include "fd-util.h"
  #include "log.h"
  #include "missing.h"
+#include "path-util.h"
  #include "util.h"
  
  int bpf_program_new(uint32_t prog_type, BPFProgram **ret) {
@@ -37,6 +38,7 @@ int bpf_program_new(uint32_t prog_type, BPFProgram **ret) {
          if (!p)
                  return log_oom();
  
+        p->n_ref = 1;
          p->prog_type = prog_type;
          p->kernel_fd = -1;
  
@@ -45,12 +47,39 @@ int bpf_program_new(uint32_t prog_type, BPFProgram **ret) {
          return 0;
  }
  
+BPFProgram *bpf_program_ref(BPFProgram *p) {
+        if (!p)
+                return NULL;
+
+        assert(p->n_ref > 0);
+        p->n_ref++;
+
+        return p;
+}
+
  BPFProgram *bpf_program_unref(BPFProgram *p) {
          if (!p)
                  return NULL;
  
+        assert(p->n_ref > 0);
+        p->n_ref--;
+
+        if (p->n_ref > 0)
+                return NULL;
+
+        /* Unfortunately, the kernel currently doesn't implicitly detach BPF programs from their cgroups when the last
+         * fd to the BPF program is closed. This has nasty side-effects since this means that abnormally terminated
+         * programs that attached one of their BPF programs to a cgroup will leave this program pinned for good with
+         * zero chance of recovery, until the cgroup is removed. This is particularly problematic if the cgroup in
+         * question is the root cgroup (or any other cgroup belonging to a service that cannot be restarted during
+         * operation, such as dbus), as the memory for the BPF program can only be reclaimed through a reboot. To
+         * counter this, we closely track which cgroup a program was attached to and will detach it on our own
+         * whenever we close the BPF fd. */
+        (void) bpf_program_cgroup_detach(p);
+
          safe_close(p->kernel_fd);
          free(p->instructions);
+        free(p->attached_path);
  
          return mfree(p);
  }
@@ -99,13 +128,47 @@ int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size) {
  }
  
  int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags) {
+        _cleanup_free_ char *copy = NULL;
          _cleanup_close_ int fd = -1;
          union bpf_attr attr;
+        int r;
  
          assert(p);
          assert(type >= 0);
          assert(path);
  
+        if (!IN_SET(flags, 0, BPF_F_ALLOW_OVERRIDE, BPF_F_ALLOW_MULTI))
+                return -EINVAL;
+
+        /* We need to track which cgroup the program is attached to, and we can only track one attachment, hence let's
+         * refuse this early. */
+        if (p->attached_path) {
+                if (!path_equal(p->attached_path, path))
+                        return -EBUSY;
+                if (p->attached_type != type)
+                        return -EBUSY;
+                if (p->attached_flags != flags)
+                        return -EBUSY;
+
+                /* Here's a shortcut: if we previously attached this program already, then we don't have to do so
+                 * again. Well, with one exception: if we are in BPF_F_ALLOW_OVERRIDE mode then someone else might have
+                 * replaced our program since the last time, hence let's reattach it, just to be safe. In flags
+                 * == 0 mode this is not an issue since nobody else can replace our program in that case, and in flags
+                 * == BPF_F_ALLOW_MULTI mode anyone else's program would be installed in addition to ours, hence ours
+                 * would remain in effect. */
+                if (flags != BPF_F_ALLOW_OVERRIDE)
+                        return 0;
+        }
+
+        /* Ensure we have a kernel object for this. */
+        r = bpf_program_load_kernel(p, NULL, 0);
+        if (r < 0)
+                return r;
+
+        copy = strdup(path);
+        if (!copy)
+                return -ENOMEM;
+
          fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
          if (fd < 0)
                  return -errno;
@@ -120,31 +183,43 @@ int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_
          if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0)
                  return -errno;
  
+        free_and_replace(p->attached_path, copy);
+        p->attached_type = type;
+        p->attached_flags = flags;
+
          return 0;
  }
  
-int bpf_program_cgroup_detach(BPFProgram *p, int type, const char *path) {
+int bpf_program_cgroup_detach(BPFProgram *p) {
          _cleanup_close_ int fd = -1;
-        union bpf_attr attr;
  
-        assert(type >= 0);
-        assert(path);
+        assert(p);
  
-        /* Note that 'p' may be NULL, in which case any program is detached. However, note that if BPF_F_ALLOW_MULTI is
-         * used 'p' is not optional. */
+        if (!p->attached_path)
+                return -EUNATCH;
  
-        fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
-        if (fd < 0)
-                return -errno;
+        fd = open(p->attached_path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
+        if (fd < 0) {
+                if (errno != ENOENT)
+                        return -errno;
  
-        attr = (union bpf_attr) {
-                .attach_type = type,
-                .target_fd = fd,
-                .attach_bpf_fd = p ? p->kernel_fd : -1,
-        };
+                /* If the cgroup does not exist anymore, then we don't have to explicitly detach, it got detached
+                 * implicitly by the removal, hence don't complain */
  
-        if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0)
-                return -errno;
+        } else {
+                union bpf_attr attr;
+
+                attr = (union bpf_attr) {
+                        .attach_type = p->attached_type,
+                        .target_fd = fd,
+                        .attach_bpf_fd = p->kernel_fd,
+                };
+
+                if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0)
+                        return -errno;
+        }
+
+        p->attached_path = mfree(p->attached_path);
  
          return 0;
  }
diff --git a/src/basic/bpf-program.h b/src/basic/bpf-program.h

index 996c1c1ad151645089636af8aab2d87a57943046..3d6c5e50efd217409e09f682fd3632bc173a83a5 100644 (file)
--- a/src/basic/bpf-program.h
+++ b/src/basic/bpf-program.h
@@ -32,22 +32,29 @@
  typedef struct BPFProgram BPFProgram;
  
  struct BPFProgram {
+        unsigned n_ref;
+
          int kernel_fd;
          uint32_t prog_type;
  
          size_t n_instructions;
          size_t allocated;
          struct bpf_insn *instructions;
+
+        char *attached_path;
+        int attached_type;
+        uint32_t attached_flags;
  };
  
  int bpf_program_new(uint32_t prog_type, BPFProgram **ret);
  BPFProgram *bpf_program_unref(BPFProgram *p);
+BPFProgram *bpf_program_ref(BPFProgram *p);
  
  int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *insn, size_t count);
  int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size);
  
  int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags);
-int bpf_program_cgroup_detach(BPFProgram *p, int type, const char *path);
+int bpf_program_cgroup_detach(BPFProgram *p);
  
  int bpf_map_new(enum bpf_map_type type, size_t key_size, size_t value_size, size_t max_entries, uint32_t flags);
  int bpf_map_update_element(int fd, const void *key, void *value);
diff --git a/src/core/bpf-firewall.c b/src/core/bpf-firewall.c

index 4091183a99b786842655ba533de309f78c0c73b7..67cbbca734a91cfa964a8a197bb6223f0e15a924 100644 (file)
--- a/src/core/bpf-firewall.c
+++ b/src/core/bpf-firewall.c
@@ -520,10 +520,6 @@ int bpf_firewall_compile(Unit *u) {
          u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
          u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
  
-        cc = unit_get_cgroup_context(u);
-        if (!cc)
-                return -EINVAL;
-
          if (u->type != UNIT_SLICE) {
                  /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
                   * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
@@ -558,17 +554,18 @@ int bpf_firewall_compile(Unit *u) {
  int bpf_firewall_install(Unit *u) {
          _cleanup_free_ char *path = NULL;
          CGroupContext *cc;
-        uint32_t flags;
          int r, supported;
+        uint32_t flags;
  
          assert(u);
  
-        if (!u->cgroup_path)
-                return -EINVAL;
-
          cc = unit_get_cgroup_context(u);
          if (!cc)
                  return -EINVAL;
+        if (!u->cgroup_path)
+                return -EINVAL;
+        if (!u->cgroup_realized)
+                return -EINVAL;
  
          supported = bpf_firewall_supported();
          if (supported < 0)
@@ -589,34 +586,26 @@ int bpf_firewall_install(Unit *u) {
          flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
                   (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;
  
-        if (u->ip_bpf_egress) {
-                r = bpf_program_load_kernel(u->ip_bpf_egress, NULL, 0);
-                if (r < 0)
-                        return log_error_errno(r, "Kernel upload of egress BPF program failed: %m");
+        /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
+         * minimize the time window when we don't account for IP traffic. */
+        u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
+        u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);
  
+        if (u->ip_bpf_egress) {
                  r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
                  if (r < 0)
                          return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path);
-        } else {
-                r = bpf_program_cgroup_detach(NULL, BPF_CGROUP_INET_EGRESS, path);
-                if (r < 0)
-                        return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r,
-                                              "Detaching egress BPF program from cgroup failed: %m");
+
+                /* Remember that this BPF program is installed now. */
+                u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
          }
  
          if (u->ip_bpf_ingress) {
-                r = bpf_program_load_kernel(u->ip_bpf_ingress, NULL, 0);
-                if (r < 0)
-                        return log_error_errno(r, "Kernel upload of ingress BPF program failed: %m");
-
                  r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
                  if (r < 0)
                          return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path);
-        } else {
-                r = bpf_program_cgroup_detach(NULL, BPF_CGROUP_INET_INGRESS, path);
-                if (r < 0)
-                        return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r,
-                                              "Detaching ingress BPF program from cgroup failed: %m");
+
+                u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
          }
  
          return 0;
@@ -665,7 +654,6 @@ int bpf_firewall_reset_accounting(int map_fd) {
          return bpf_map_update_element(map_fd, &key, &value);
  }
  
-
  int bpf_firewall_supported(void) {
          struct bpf_insn trivial[] = {
                  BPF_MOV64_IMM(BPF_REG_0, 1),
diff --git a/src/core/unit.c b/src/core/unit.c

index 8c0e157a9003aa30f1044a216f02ad804b896d91..61a720374466dba473a69621c15cfda1b89fdeda 100644 (file)
--- a/src/core/unit.c
+++ b/src/core/unit.c
@@ -659,7 +659,9 @@ void unit_free(Unit *u) {
          safe_close(u->ipv6_deny_map_fd);
  
          bpf_program_unref(u->ip_bpf_ingress);
+        bpf_program_unref(u->ip_bpf_ingress_installed);
          bpf_program_unref(u->ip_bpf_egress);
+        bpf_program_unref(u->ip_bpf_egress_installed);
  
          condition_free_list(u->conditions);
          condition_free_list(u->asserts);
diff --git a/src/core/unit.h b/src/core/unit.h

index 5cab2211bb96b20b04ce8b312e3096ccbe1f7c48..e903bf8ad7ef461481894bd04cdc14a366efb3b7 100644 (file)
--- a/src/core/unit.h
+++ b/src/core/unit.h
@@ -287,8 +287,8 @@ struct Unit {
          int ipv4_deny_map_fd;
          int ipv6_deny_map_fd;
  
-        BPFProgram *ip_bpf_ingress;
-        BPFProgram *ip_bpf_egress;
+        BPFProgram *ip_bpf_ingress, *ip_bpf_ingress_installed;
+        BPFProgram *ip_bpf_egress, *ip_bpf_egress_installed;
  
          uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX];
author	Lennart Poettering <lennart@poettering.net>
	Tue, 20 Feb 2018 18:28:24 +0000 (19:28 +0100)
committer	Lennart Poettering <lennart@poettering.net>
	Wed, 21 Feb 2018 15:43:36 +0000 (16:43 +0100)
src/basic/bpf-program.c		patch \| blob \| blame \| history
src/basic/bpf-program.h		patch \| blob \| blame \| history
src/core/bpf-firewall.c		patch \| blob \| blame \| history
src/core/unit.c		patch \| blob \| blame \| history
src/core/unit.h		patch \| blob \| blame \| history