#include "fd-util.h"
#include "log.h"
#include "missing.h"
+#include "path-util.h"
#include "util.h"
int bpf_program_new(uint32_t prog_type, BPFProgram **ret) {
if (!p)
return log_oom();
+ p->n_ref = 1;
p->prog_type = prog_type;
p->kernel_fd = -1;
return 0;
}
+BPFProgram *bpf_program_ref(BPFProgram *p) {
+ if (!p)
+ return NULL;
+
+ assert(p->n_ref > 0);
+ p->n_ref++;
+
+ return p;
+}
+
BPFProgram *bpf_program_unref(BPFProgram *p) {
if (!p)
return NULL;
+ assert(p->n_ref > 0);
+ p->n_ref--;
+
+ if (p->n_ref > 0)
+ return NULL;
+
+ /* Unfortunately, the kernel currently doesn't implicitly detach BPF programs from their cgroups when the last
+ * fd to the BPF program is closed. This has nasty side-effects since this means that abnormally terminated
+ * programs that attached one of their BPF programs to a cgroup will leave this programs pinned for good with
+ * zero chance of recovery, until the cgroup is removed. This is particularly problematic if the cgroup in
+ * question is the root cgroup (or any other cgroup belonging to a service that cannot be restarted during
+ * operation, such as dbus), as the memory for the BPF program can only be reclaimed through a reboot. To
+ * counter this, we track closely to which cgroup a program was attached to and will detach it on our own
+ * whenever we close the BPF fd. */
+ (void) bpf_program_cgroup_detach(p);
+
safe_close(p->kernel_fd);
free(p->instructions);
+ free(p->attached_path);
return mfree(p);
}
}
int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags) {
+ _cleanup_free_ char *copy = NULL;
_cleanup_close_ int fd = -1;
union bpf_attr attr;
+ int r;
assert(p);
assert(type >= 0);
assert(path);
+ if (!IN_SET(flags, 0, BPF_F_ALLOW_OVERRIDE, BPF_F_ALLOW_MULTI))
+ return -EINVAL;
+
+ /* We need to track which cgroup the program is attached to, and we can only track one attachment, hence let's
+ * refuse this early. */
+ if (p->attached_path) {
+ if (!path_equal(p->attached_path, path))
+ return -EBUSY;
+ if (p->attached_type != type)
+ return -EBUSY;
+ if (p->attached_flags != flags)
+ return -EBUSY;
+
+ /* Here's a shortcut: if we previously attached this program already, then we don't have to do so
+ * again. Well, with one exception: if we are in BPF_F_ALLOW_OVERRIDE mode then someone else might have
+ * replaced our program since the last time, hence let's reattach it again, just to be safe. In flags
+ * == 0 mode this is not an issue since nobody else can replace our program in that case, and in flags
+ * == BPF_F_ALLOW_MULTI mode any other's program would be installed in addition to ours hence ours
+ * would remain in effect. */
+ if (flags != BPF_F_ALLOW_OVERRIDE)
+ return 0;
+ }
+
+ /* Ensure we have a kernel object for this. */
+ r = bpf_program_load_kernel(p, NULL, 0);
+ if (r < 0)
+ return r;
+
+ copy = strdup(path);
+ if (!copy)
+ return -ENOMEM;
+
fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
if (fd < 0)
return -errno;
if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0)
return -errno;
+ free_and_replace(p->attached_path, copy);
+ p->attached_type = type;
+ p->attached_flags = flags;
+
return 0;
}
-int bpf_program_cgroup_detach(BPFProgram *p, int type, const char *path) {
+int bpf_program_cgroup_detach(BPFProgram *p) {
_cleanup_close_ int fd = -1;
- union bpf_attr attr;
- assert(type >= 0);
- assert(path);
+ assert(p);
- /* Note that 'p' may be NULL, in which case any program is detached. However, note that if BPF_F_ALLOW_MULTI is
- * used 'p' is not optional. */
+ if (!p->attached_path)
+ return -EUNATCH;
- fd = open(path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
- if (fd < 0)
- return -errno;
+ fd = open(p->attached_path, O_DIRECTORY|O_RDONLY|O_CLOEXEC);
+ if (fd < 0) {
+ if (errno != ENOENT)
+ return -errno;
- attr = (union bpf_attr) {
- .attach_type = type,
- .target_fd = fd,
- .attach_bpf_fd = p ? p->kernel_fd : -1,
- };
+ /* If the cgroup does not exist anymore, then we don't have to explicitly detach, it got detached
+ * implicitly by the removal, hence don't complain */
- if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0)
- return -errno;
+ } else {
+ union bpf_attr attr;
+
+ attr = (union bpf_attr) {
+ .attach_type = p->attached_type,
+ .target_fd = fd,
+ .attach_bpf_fd = p->kernel_fd,
+ };
+
+ if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0)
+ return -errno;
+ }
+
+ p->attached_path = mfree(p->attached_path);
return 0;
}
typedef struct BPFProgram BPFProgram;
struct BPFProgram {
+ unsigned n_ref;
+
int kernel_fd;
uint32_t prog_type;
size_t n_instructions;
size_t allocated;
struct bpf_insn *instructions;
+
+ char *attached_path;
+ int attached_type;
+ uint32_t attached_flags;
};
int bpf_program_new(uint32_t prog_type, BPFProgram **ret);
BPFProgram *bpf_program_unref(BPFProgram *p);
+BPFProgram *bpf_program_ref(BPFProgram *p);
int bpf_program_add_instructions(BPFProgram *p, const struct bpf_insn *insn, size_t count);
int bpf_program_load_kernel(BPFProgram *p, char *log_buf, size_t log_size);
int bpf_program_cgroup_attach(BPFProgram *p, int type, const char *path, uint32_t flags);
-int bpf_program_cgroup_detach(BPFProgram *p, int type, const char *path);
+int bpf_program_cgroup_detach(BPFProgram *p);
int bpf_map_new(enum bpf_map_type type, size_t key_size, size_t value_size, size_t max_entries, uint32_t flags);
int bpf_map_update_element(int fd, const void *key, void *value);
u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
- cc = unit_get_cgroup_context(u);
- if (!cc)
- return -EINVAL;
-
if (u->type != UNIT_SLICE) {
/* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
* nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
int bpf_firewall_install(Unit *u) {
_cleanup_free_ char *path = NULL;
CGroupContext *cc;
- uint32_t flags;
int r, supported;
+ uint32_t flags;
assert(u);
- if (!u->cgroup_path)
- return -EINVAL;
-
cc = unit_get_cgroup_context(u);
if (!cc)
return -EINVAL;
+ if (!u->cgroup_path)
+ return -EINVAL;
+ if (!u->cgroup_realized)
+ return -EINVAL;
supported = bpf_firewall_supported();
if (supported < 0)
flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
(u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;
- if (u->ip_bpf_egress) {
- r = bpf_program_load_kernel(u->ip_bpf_egress, NULL, 0);
- if (r < 0)
- return log_error_errno(r, "Kernel upload of egress BPF program failed: %m");
+ /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
+ * minimize the time window when we don't account for IP traffic. */
+ u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
+ u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);
+ if (u->ip_bpf_egress) {
r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
if (r < 0)
return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path);
- } else {
- r = bpf_program_cgroup_detach(NULL, BPF_CGROUP_INET_EGRESS, path);
- if (r < 0)
- return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r,
- "Detaching egress BPF program from cgroup failed: %m");
+
+ /* Remember that this BPF program is installed now. */
+ u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
}
if (u->ip_bpf_ingress) {
- r = bpf_program_load_kernel(u->ip_bpf_ingress, NULL, 0);
- if (r < 0)
- return log_error_errno(r, "Kernel upload of ingress BPF program failed: %m");
-
r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
if (r < 0)
return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path);
- } else {
- r = bpf_program_cgroup_detach(NULL, BPF_CGROUP_INET_INGRESS, path);
- if (r < 0)
- return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r,
- "Detaching ingress BPF program from cgroup failed: %m");
+
+ u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
}
return 0;
return bpf_map_update_element(map_fd, &key, &value);
}
-
int bpf_firewall_supported(void) {
struct bpf_insn trivial[] = {
BPF_MOV64_IMM(BPF_REG_0, 1),
safe_close(u->ipv6_deny_map_fd);
bpf_program_unref(u->ip_bpf_ingress);
+ bpf_program_unref(u->ip_bpf_ingress_installed);
bpf_program_unref(u->ip_bpf_egress);
+ bpf_program_unref(u->ip_bpf_egress_installed);
condition_free_list(u->conditions);
condition_free_list(u->asserts);
int ipv4_deny_map_fd;
int ipv6_deny_map_fd;
- BPFProgram *ip_bpf_ingress;
- BPFProgram *ip_bpf_egress;
+ BPFProgram *ip_bpf_ingress, *ip_bpf_ingress_installed;
+ BPFProgram *ip_bpf_egress, *ip_bpf_egress_installed;
uint64_t ip_accounting_extra[_CGROUP_IP_ACCOUNTING_METRIC_MAX];