[thirdparty/systemd.git] / src / core / bpf-firewall.c

/***
  This file is part of systemd.

  Copyright 2016 Daniel Mack

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <arpa/inet.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/libbpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "fd-util.h"
#include "ip-address-access.h"
#include "unit.h"

/* Indices of the two counters kept in each accounting array map. */
enum {
        MAP_KEY_PACKETS = 0,    /* number of packets that were let through */
        MAP_KEY_BYTES   = 1,    /* sum of skb->len of the packets that were let through */
};

/* Verdict bits that the generated lookup code ORs into R8 when an address matches a list. */
enum {
        ACCESS_ALLOWED = 1 << 0,        /* address matched an allow list */
        ACCESS_DENIED  = 1 << 1,        /* address matched a deny list */
};

/* Appends to @p the instructions that check one address list for one direction.
 *
 * The generated code is skipped at runtime unless the packet's protocol (cached in R7) equals @protocol.
 * Otherwise it loads the source address (ingress) or destination address (egress) from the packet, looks
 * it up in the LPM trie map referenced by @map_fd, and ORs @verdict into R8 if the lookup found an entry.
 *
 * Returns 0 on success, -EAFNOSUPPORT if @protocol is neither ETH_P_IP nor ETH_P_IPV6, or the negative
 * error returned by bpf_program_add_instructions(). */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        /* Figure out how wide the address is and where it lives inside the IP header. */
        if (protocol == ETH_P_IP) {
                addr_size = sizeof(uint32_t);

                if (is_ingress)
                        addr_offset = offsetof(struct iphdr, saddr);
                else
                        addr_offset = offsetof(struct iphdr, daddr);

        } else if (protocol == ETH_P_IPV6) {
                addr_size = 4 * sizeof(uint32_t);

                if (is_ingress)
                        addr_offset = offsetof(struct ip6_hdr, ip6_src.s6_addr);
                else
                        addr_offset = offsetof(struct ip6_hdr, ip6_dst.s6_addr);

        } else
                return -EAFNOSUPPORT;

        {
                struct bpf_insn insn[] = {
                        /* If the cached protocol in R7 is not the one we handle, jump over this whole
                         * block. The jump distance is patched in below, once the block size is known. */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                        /*
                         * Call BPF_FUNC_skb_load_bytes to copy the src/dst address onto the stack.
                         *
                         * R1: Pointer to the skb (saved in R6)
                         * R2: Offset of the address within the packet data
                         * R3: Destination buffer on the stack (r10 - addr_size)
                         * R4: Number of bytes to read (addr_size)
                         */

                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                        /*
                         * Call BPF_FUNC_map_lookup_elem with a key that starts one 32bit word below the
                         * address just loaded: that word is filled with the full address width in bits
                         * (addr_size * 8), i.e. the maximum prefix length, and is directly followed by
                         * the address itself.
                         *
                         * R0 holds the lookup result afterwards. Only whether it is zero or not matters
                         * here: if it is non-zero the @verdict bit is set in R8.
                         */

                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),

                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
                };

                /* Patch the protocol check so that it jumps right behind the last instruction above. */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }

        return 0;
}

/* Assembles the eBPF program for one direction of unit @u and hands it out in @ret.
 *
 * The program consists of a prologue, one lookup section per existing access map (deny maps first, then
 * allow maps), a verdict computation, optional accounting code, and the final exit instruction.
 *
 * If the unit has neither an accounting map for this direction nor any access map, no program is built,
 * *ret is set to NULL and 0 is returned. Otherwise returns 0 on success (the caller then owns *ret) or a
 * negative error from the bpf_program_*() helpers or add_lookup_instructions(). */
static int bpf_firewall_compile_bpf(
                Unit *u,
                bool is_ingress,
                BPFProgram **ret) {

        struct bpf_insn pre_insn[] = {
                /*
                 * On entry R1 points to the skb. R1-R5 are scratch registers that do not survive
                 * calls into kernel helpers, hence everything we need later has to be parked in
                 * R6-R9. The skb pointer goes to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * The packet data itself is not directly accessible from this kind of program, but
                 * struct __sk_buff exposes a few fields. Read the protocol of the packet once and
                 * keep it in R7 for the lookup sections.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 collects the ACCESS_ALLOWED/ACCESS_DENIED bits set by the lookup sections.
                 * Start out with neither of them set.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * Turn the bits the lookup sections left in R8 into the verdict in R0:
         *
         * R0 = 1
         * if (R8 == ACCESS_DENIED)
         *     R0 = 0
         *
         * Only a deny match without an allow match drops the packet; if both bits are set the
         * packet passes. The accounting code that may follow is skipped for dropped packets.
         */
        struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };

        /*
         * Leave the program with the verdict in R0:
         * 0 means the packet is denied, 1 means the packet may pass.
         */
        struct bpf_insn exit_insn[] = {
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;

        assert(u);
        assert(ret);

        if (is_ingress)
                accounting_map_fd = u->ip_accounting_ingress_map_fd;
        else
                accounting_map_fd = u->ip_accounting_egress_map_fd;

        access_enabled = !(u->ipv4_allow_map_fd < 0 &&
                           u->ipv6_allow_map_fd < 0 &&
                           u->ipv4_deny_map_fd < 0 &&
                           u->ipv6_deny_map_fd < 0);

        /* Nothing to filter and nothing to count: no program needed. */
        if (!access_enabled && accounting_map_fd < 0) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The rule implemented by the lookup sections plus post_insn is:
                 *
                 * - An address that matches an allow list is let through
                 * - Otherwise, an address that matches a deny list is dropped
                 * - Otherwise, the packet is let through
                 *
                 * One section is emitted per existing map, in the order listed here.
                 */
                const struct {
                        int map_fd;
                        int protocol;
                        int verdict;
                } lookups[] = {
                        { u->ipv4_deny_map_fd,  ETH_P_IP,   ACCESS_DENIED  },
                        { u->ipv6_deny_map_fd,  ETH_P_IPV6, ACCESS_DENIED  },
                        { u->ipv4_allow_map_fd, ETH_P_IP,   ACCESS_ALLOWED },
                        { u->ipv6_allow_map_fd, ETH_P_IPV6, ACCESS_ALLOWED },
                };
                size_t i;

                for (i = 0; i < ELEMENTSOF(lookups); i++) {
                        if (lookups[i].map_fd < 0)
                                continue;

                        r = add_lookup_instructions(p, lookups[i].map_fd, lookups[i].protocol, is_ingress, lookups[i].verdict);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * R0 == 0 means the packet gets dropped: jump over all of the accounting code
                         * then. The jump distance is patched in below.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* R0 was clobbered by the lookups above; restore the "pass" verdict */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Patch the initial check so that it jumps right behind the last instruction above. */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }

        r = bpf_program_add_instructions(p, exit_insn, ELEMENTSOF(exit_insn));
        if (r < 0)
                return r;

        /* Pass ownership to the caller and keep the cleanup handler from dropping the program. */
        *ret = p;
        p = NULL;

        return 0;
}

/* Walks @list and adds the number of AF_INET entries to *n_ipv4 and the number of AF_INET6 entries to
 * *n_ipv6. The counters are incremented, not reset, so the function can be called for several lists in a
 * row. Returns 0 on success, or -EAFNOSUPPORT as soon as an entry of any other family is seen (entries
 * counted up to that point stay counted). */
static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
        IPAddressAccessItem *item;

        assert(n_ipv4);
        assert(n_ipv6);

        LIST_FOREACH(items, item, list) {
                if (item->family == AF_INET)
                        ++*n_ipv4;
                else if (item->family == AF_INET6)
                        ++*n_ipv6;
                else
                        return -EAFNOSUPPORT;
        }

        return 0;
}

/* Inserts every entry of @list into the LPM trie map matching its address family: AF_INET entries go to
 * @ipv4_map_fd, AF_INET6 entries to @ipv6_map_fd. The key is the entry's prefix length followed by its
 * address, the stored value is @verdict. Returns 0 on success, -EAFNOSUPPORT for an entry of another
 * family, or the negative error of bpf_map_update_element(); processing stops at the first failure. */
static int bpf_firewall_add_access_items(
                IPAddressAccessItem *list,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        uint64_t value = verdict;
        IPAddressAccessItem *a;
        int r;

        /* One zero-initialized key buffer per family, reused for all entries of that family. */
        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);

        LIST_FOREACH(items, a, list) {
                struct bpf_lpm_trie_key *key;
                size_t addr_len;
                int map_fd;

                /* Pick key buffer, address width and target map according to the family. */
                if (a->family == AF_INET) {
                        key = key_ipv4;
                        addr_len = sizeof(uint32_t);
                        map_fd = ipv4_map_fd;
                } else if (a->family == AF_INET6) {
                        key = key_ipv6;
                        addr_len = 4 * sizeof(uint32_t);
                        map_fd = ipv6_map_fd;
                } else
                        return -EAFNOSUPPORT;

                key->prefixlen = a->prefixlen;
                memcpy(key->data, &a->address, addr_len);

                r = bpf_map_update_element(map_fd, key, &value);
                if (r < 0)
                        return r;
        }

        return 0;
}

/* Creates and fills the IPv4 and IPv6 LPM trie maps for one verdict.
 *
 * The address lists (allow lists if @verdict is ACCESS_ALLOWED, deny lists otherwise) of @u and of all
 * units reached by following the slice references upwards are combined. A map is only created for a
 * family that has at least one entry; for a family without entries -1 is returned in the respective
 * output parameter.
 *
 * Returns 0 on success and passes ownership of the new fds to the caller through @ret_ipv4_map_fd and
 * @ret_ipv6_map_fd. On failure a negative error is returned, any map created so far is closed again, and
 * the output parameters are left untouched. */
static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd) {

        _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        Unit *p;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);

        /* First pass: count the entries per family so that the maps can be sized accordingly. */
        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                /* Fail early on entries of an unsupported family, instead of creating maps from an
                 * incomplete count and only tripping over the same entry in the second pass. */
                r = bpf_firewall_count_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, &n_ipv4, &n_ipv6);
                if (r < 0)
                        return r;
        }

        if (n_ipv4 > 0) {
                ipv4_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                ipv6_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

        /* Second pass: actually insert the entries. */
        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }

        *ret_ipv4_map_fd = ipv4_map_fd;
        *ret_ipv6_map_fd = ipv6_map_fd;

        /* The caller owns the fds now, keep the cleanup handler from closing them. */
        ipv4_map_fd = ipv6_map_fd = -1;
        return 0;
}

/* Brings the two accounting map fds in line with the @enabled setting.
 *
 * If accounting is enabled, a two-element array map is created for each of *fd_ingress and *fd_egress
 * that is currently negative; fds that are already set are kept as they are. If accounting is disabled,
 * both fds are passed to safe_close() and replaced by its return value.
 *
 * Returns 0 on success or the negative error of bpf_map_new(). */
static int bpf_firewall_prepare_accounting_maps(bool enabled, int *fd_ingress, int *fd_egress) {
        int *const slots[] = { fd_ingress, fd_egress };
        size_t i;
        int r;

        assert(fd_ingress);
        assert(fd_egress);

        if (!enabled) {
                *fd_ingress = safe_close(*fd_ingress);
                *fd_egress = safe_close(*fd_egress);
                return 0;
        }

        for (i = 0; i < ELEMENTSOF(slots); i++) {
                /* Keep an existing map, so that the counters collected so far survive. */
                if (*slots[i] >= 0)
                        continue;

                r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                if (r < 0)
                        return r;

                *slots[i] = r;
        }

        return 0;
}

/* Rebuilds the complete firewall state of unit @u: drops the existing ingress/egress programs and access
 * maps, creates fresh allow/deny maps, creates or releases the accounting maps depending on the unit's
 * ip_accounting setting, and finally compiles the ingress and egress programs.
 *
 * Returns 0 on success, a negative error from bpf_firewall_supported(), -EOPNOTSUPP if BPF firewalling is
 * not supported, -EINVAL if the unit has no cgroup context, or the (logged) error of the failing step. */
int bpf_firewall_compile(Unit *u) {
        CGroupContext *cc;
        int supported, r;

        assert(u);

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == 0) {
                log_debug("BPF firewalling not supported on this systemd, proceeding without.");
                return -EOPNOTSUPP;
        }

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs
         * themselves, but we reuse the accounting maps. That way the firewall in effect always maps to the
         * actual configuration, but we don't flush out the accounting unnecessarily. */

        /* Old programs */
        u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);

        /* Old access maps */
        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        /* New access maps, one pair per verdict */
        r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd);
        if (r < 0)
                return log_error_errno(r, "Preparation of eBPF allow maps failed: %m");

        r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd);
        if (r < 0)
                return log_error_errno(r, "Preparation of eBPF deny maps failed: %m");

        r = bpf_firewall_prepare_accounting_maps(cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_error_errno(r, "Preparation of eBPF accounting maps failed: %m");

        /* New programs, one per direction */
        r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress);
        if (r < 0)
                return log_error_errno(r, "Compilation for ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress);
        if (r < 0)
                return log_error_errno(r, "Compilation for egress BPF program failed: %m");

        return 0;
}

/* Applies the compiled firewall programs of unit @u to the unit's cgroup.
 *
 * For each direction (egress first, then ingress): if a program exists it is loaded into the kernel and
 * attached to the cgroup, with BPF_F_ALLOW_OVERRIDE set if the unit's cgroup context has delegation
 * turned on; if no program exists, a detach is requested for that attach point instead.
 *
 * Returns 0 on success, -EINVAL if the unit has no cgroup path or no cgroup context, a negative error
 * from bpf_firewall_supported(), -EOPNOTSUPP if BPF firewalling is not supported, or the (logged) error
 * of the step that failed. */
int bpf_firewall_install(Unit *u) {
        _cleanup_free_ char *path = NULL;
        uint32_t attach_flags;
        CGroupContext *cc;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return -EINVAL;

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        r = bpf_firewall_supported();
        if (r < 0)
                return r;
        if (r == 0) {
                log_debug("BPF firewalling not supported on this systemd, proceeding without.");
                return -EOPNOTSUPP;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Failed to determine cgroup path: %m");

        /* The same flags are used for both directions. */
        attach_flags = cc->delegate ? BPF_F_ALLOW_OVERRIDE : 0;

        /* Egress */
        if (!u->ip_bpf_egress) {
                r = bpf_program_cgroup_detach(BPF_CGROUP_INET_EGRESS, path);
                if (r < 0)
                        /* -ENOENT is only logged at debug level, but is still returned. */
                        return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r,
                                              "Detaching egress BPF program from cgroup failed: %m");
        } else {
                r = bpf_program_load_kernel(u->ip_bpf_egress, NULL, 0);
                if (r < 0)
                        return log_error_errno(r, "Kernel upload of egress BPF program failed: %m");

                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, attach_flags);
                if (r < 0)
                        return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path);
        }

        /* Ingress */
        if (!u->ip_bpf_ingress) {
                r = bpf_program_cgroup_detach(BPF_CGROUP_INET_INGRESS, path);
                if (r < 0)
                        /* -ENOENT is only logged at debug level, but is still returned. */
                        return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r,
                                              "Detaching ingress BPF program from cgroup failed: %m");
        } else {
                r = bpf_program_load_kernel(u->ip_bpf_ingress, NULL, 0);
                if (r < 0)
                        return log_error_errno(r, "Kernel upload of ingress BPF program failed: %m");

                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, attach_flags);
                if (r < 0)
                        return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path);
        }

        return 0;
}

/* Reads the counters stored in the accounting map @map_fd.
 *
 * @ret_bytes and @ret_packets may each be NULL, in which case the respective counter is not looked up.
 * *ret_packets is only written after all requested lookups succeeded.
 *
 * Returns 0 on success, -EBADF if @map_fd is negative, or the negative error returned by
 * bpf_map_lookup_element(). */
int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t packets;
        int key, r;     /* the key must be exactly as wide as the key size the accounting maps are
                         * created with (sizeof(int)): only that many bytes are read from the key
                         * buffer, so a wider key variable would yield index 0 for every lookup on
                         * big-endian machines */

        if (map_fd < 0)
                return -EBADF;

        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

        /* 'packets' is only initialized if ret_packets is set, and only consumed in that case. */
        if (ret_packets)
                *ret_packets = packets;

        return 0;
}

/* Sets both counters (packets and bytes) of the accounting map @map_fd back to zero.
 *
 * Returns -EBADF if @map_fd is negative, otherwise the result of bpf_map_update_element(): the error of
 * the first update if that failed, else the result of the second update. */
int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t value = 0;
        int key, r;     /* the key must be exactly as wide as the key size the accounting maps are
                         * created with (sizeof(int)): only that many bytes are read from the key
                         * buffer, so a wider key variable would address index 0 for both updates on
                         * big-endian machines */

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}


/* Determines whether BPF firewalling can be used.
 *
 * Returns > 0 if supported and 0 if not; both results are cached in a static variable, so the probing
 * is done at most once per outcome. Returns a negative error, which is not cached, if it cannot be
 * determined whether the unified cgroup hierarchy is in use. */
int bpf_firewall_supported(void) {
        /* The smallest possible program: return 1, i.e. let the packet pass. */
        struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
        static int supported = -1;
        int fd, r;

        /* Checks whether BPF firewalling is supported. For this, we check four things:
         *
         * a) whether we are privileged
         * b) whether the unified hierarchy is being used
         * c) the BPF implementation in the kernel supports BPF LPM TRIE maps, which we require
         * d) the kernel accepts a trivial BPF program of type CGROUP_SKB
         *
         */

        if (supported >= 0)
                return supported;

        if (geteuid() != 0) {
                log_debug("Not enough privileges, BPF firewalling is not supported.");
                return supported = false;
        }

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
        if (r == 0)
                return supported = false;

        fd = bpf_map_new(BPF_MAP_TYPE_LPM_TRIE,
                         offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint64_t),
                         sizeof(uint64_t),
                         1,
                         BPF_F_NO_PREALLOC);
        if (fd < 0) {
                /* The error is carried in fd; r still holds the positive cg_unified_controller() result. */
                log_debug_errno(fd, "Can't allocate BPF LPM TRIE map, BPF firewalling is not supported: %m");
                return supported = false;
        }

        /* The map was only needed as a probe. */
        safe_close(fd);

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
        if (r < 0) {
                log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = false;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = false;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = false;
        }

        return supported = true;
}
Commit	Line	Data
1988a9d1 DM	1	/***
	2	This file is part of systemd.
	3
	4	Copyright 2016 Daniel Mack
	5
	6	systemd is free software; you can redistribute it and/or modify it
	7	under the terms of the GNU Lesser General Public License as published by
	8	the Free Software Foundation; either version 2.1 of the License, or
	9	(at your option) any later version.
	10
	11	systemd is distributed in the hope that it will be useful, but
	12	WITHOUT ANY WARRANTY; without even the implied warranty of
	13	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
	14	Lesser General Public License for more details.
	15
	16	You should have received a copy of the GNU Lesser General Public License
	17	along with systemd; If not, see <http://www.gnu.org/licenses/>.
	18	***/
	19
	20	#include <arpa/inet.h>
	21	#include <assert.h>
	22	#include <errno.h>
	23	#include <fcntl.h>
	24	#include <linux/libbpf.h>
	25	#include <net/ethernet.h>
	26	#include <net/if.h>
	27	#include <netinet/ip.h>
	28	#include <netinet/ip6.h>
	29	#include <stddef.h>
	30	#include <stdio.h>
	31	#include <stdlib.h>
	32	#include <string.h>
	33	#include <unistd.h>
	34
	35	#include "alloc-util.h"
	36	#include "bpf-firewall.h"
	37	#include "bpf-program.h"
	38	#include "fd-util.h"
	39	#include "ip-address-access.h"
	40	#include "unit.h"
	41
	42	enum {
	43	MAP_KEY_PACKETS,
	44	MAP_KEY_BYTES,
	45	};
	46
	47	enum {
	48	ACCESS_ALLOWED = 1,
	49	ACCESS_DENIED = 2,
	50	};
	51
	52	/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */
	53
	54	static int add_lookup_instructions(
	55	BPFProgram *p,
	56	int map_fd,
	57	int protocol,
	58	bool is_ingress,
	59	int verdict) {
	60
	61	int r, addr_offset, addr_size;
	62
	63	assert(p);
	64	assert(map_fd >= 0);
65
66	switch (protocol) {
67
68	case ETH_P_IP:
69	addr_size = sizeof(uint32_t);
70	addr_offset = is_ingress ?
71	offsetof(struct iphdr, saddr) :
72	offsetof(struct iphdr, daddr);
73	break;
74
75	case ETH_P_IPV6:
76	addr_size = 4 * sizeof(uint32_t);
77	addr_offset = is_ingress ?
78	offsetof(struct ip6_hdr, ip6_src.s6_addr) :
79	offsetof(struct ip6_hdr, ip6_dst.s6_addr);
80	break;
81
82	default:
83	return -EAFNOSUPPORT;
84	}
85
86	do {
87	/* Compare IPv4 with one word instruction (32bit) */
88	struct bpf_insn insn[] = {
89	/* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */
90	BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),
91
92	/*
93	* Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
94	*
95	* R1: Pointer to the skb
96	* R2: Data offset
97	* R3: Destination buffer on the stack (r10 - 4)
98	* R4: Number of bytes to read (4)
99	*/
100
101	BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
102	BPF_MOV32_IMM(BPF_REG_2, addr_offset),
103
104	BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
105	BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),
106
107	BPF_MOV32_IMM(BPF_REG_4, addr_size),
108	BPF_RAW_INSN(BPF_JMP \| BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
109
110	/*
111	* Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
112	* LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
113	* has to be set to the maximum possible value.
114	*
115	* On success, the looked up value is stored in R0. For this application, the actual
116	* value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
117	* matching value.
118	*/
119
120	BPF_LD_MAP_FD(BPF_REG_1, map_fd),
121	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
122	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
123	BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),
124
125	BPF_RAW_INSN(BPF_JMP \| BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
126	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
127	BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
128	};
129
130	/* Jump label fixup */
131	insn[0].off = ELEMENTSOF(insn) - 1;
132
133	r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
134	if (r < 0)
135	return r;
136
137	} while (false);
138
139	return 0;
140	}
141
142	static int bpf_firewall_compile_bpf(
143	Unit *u,
144	bool is_ingress,
145	BPFProgram **ret) {
146
147	struct bpf_insn pre_insn[] = {
148	/*
149	* When the eBPF program is entered, R1 contains the address of the skb.
150	* However, R1-R5 are scratch registers that are not preserved when calling
151	* into kernel functions, so we need to save anything that's supposed to
152	* stay around to R6-R9. Save the skb to R6.
153	*/
154	BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
155
156	/*
157	* Although we cannot access the skb data directly from eBPF programs used in this
158	* scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
159	* Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
160	* for later use.
161	*/
162	BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),
163
164	/*
165	* R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
166	* through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
167	*/
168	BPF_MOV32_IMM(BPF_REG_8, 0),
169	};
170
171	/*
172	* The access checkers compiled for the configured allowance and denial lists
173	* write to R8 at runtime. The following code prepares for an early exit that
174	* skip the accounting if the packet is denied.
175	*
176	* R0 = 1
177	* if (R8 == ACCESS_DENIED)
178	* R0 = 0
179	*
180	* This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
181	* is allowed to pass.
182	*/
183	struct bpf_insn post_insn[] = {
184	BPF_MOV64_IMM(BPF_REG_0, 1),
185	BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
186	BPF_MOV64_IMM(BPF_REG_0, 0),
187	};
188
189	_cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
190	int accounting_map_fd, r;
191	bool access_enabled;
192
193	assert(u);
194	assert(ret);
195
196	accounting_map_fd = is_ingress ?
197	u->ip_accounting_ingress_map_fd :
198	u->ip_accounting_egress_map_fd;
199
200	access_enabled =
201	u->ipv4_allow_map_fd >= 0 \|\|
202	u->ipv6_allow_map_fd >= 0 \|\|
203	u->ipv4_deny_map_fd >= 0 \|\|
204	u->ipv6_deny_map_fd >= 0;
205
206	if (accounting_map_fd < 0 && !access_enabled) {
207	*ret = NULL;
208	return 0;
209	}
210
211	r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
212	if (r < 0)
213	return r;
214
215	r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
216	if (r < 0)
217	return r;
218
219	if (access_enabled) {
220	/*
221	* The simple rule this function translates into eBPF instructions is:
222	*
223	* - Access will be granted when an address matches an entry in @list_allow
224	* - Otherwise, access will be denied when an address matches an entry in @list_deny
225	* - Otherwise, access will be granted
226	*/
227
228	if (u->ipv4_deny_map_fd >= 0) {
229	r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
230	if (r < 0)
231	return r;
232	}
233
234	if (u->ipv6_deny_map_fd >= 0) {
235	r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
236	if (r < 0)
237	return r;
238	}
239
240	if (u->ipv4_allow_map_fd >= 0) {
241	r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
242	if (r < 0)
243	return r;
244	}
245
246	if (u->ipv6_allow_map_fd >= 0) {
247	r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
248	if (r < 0)
249	return r;
250	}
251	}
252
253	r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
254	if (r < 0)
255	return r;
256
257	if (accounting_map_fd >= 0) {
258	struct bpf_insn insn[] = {
259	/*
260	* If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
261	* The jump label will be fixed up later.
262	*/
263	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),
264
265	/* Count packets */
266	BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
267	BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* (u32 )(fp - 4) = r0 */
268	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
269	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
270	BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
271	BPF_RAW_INSN(BPF_JMP \| BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
272	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
273	BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
274	BPF_RAW_INSN(BPF_STX \| BPF_XADD \| BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
275
276	/* Count bytes */
277	BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
278	BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* (u32 )(fp - 4) = r0 */
279	BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
280	BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
281	BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
282	BPF_RAW_INSN(BPF_JMP \| BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
283	BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
284	BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
285	BPF_RAW_INSN(BPF_STX \| BPF_XADD \| BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
286
287	/* Allow the packet to pass */
288	BPF_MOV64_IMM(BPF_REG_0, 1),
289	};
290
291	/* Jump label fixup */
292	insn[0].off = ELEMENTSOF(insn) - 1;
293
294	r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
295	if (r < 0)
296	return r;
297	}
298
299	do {
300	/*
301	* Exit from the eBPF program, R0 contains the verdict.
302	* 0 means the packet is denied, 1 means the packet may pass.
303	*/
304	struct bpf_insn insn[] = {
305	BPF_EXIT_INSN()
306	};
307
308	r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
309	if (r < 0)
310	return r;
311	} while (false);
312
313	*ret = p;
314	p = NULL;
315
316	return 0;
317	}
318
319	static int bpf_firewall_count_access_items(IPAddressAccessItem list, size_t n_ipv4, size_t *n_ipv6) {
320	IPAddressAccessItem *a;
321
322	assert(n_ipv4);
323	assert(n_ipv6);
324
325	LIST_FOREACH(items, a, list) {
326	switch (a->family) {
327
328	case AF_INET:
329	(*n_ipv4)++;
330	break;
331
332	case AF_INET6:
333	(*n_ipv6)++;
334	break;
335
336	default:
337	return -EAFNOSUPPORT;
338	}
339	}
340
341	return 0;
342	}
343
344	static int bpf_firewall_add_access_items(
345	IPAddressAccessItem *list,
346	int ipv4_map_fd,
347	int ipv6_map_fd,
348	int verdict) {
349
350	struct bpf_lpm_trie_key key_ipv4, key_ipv6;
351	uint64_t value = verdict;
352	IPAddressAccessItem *a;
353	int r;
354
355	key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
356	key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);
357
358	LIST_FOREACH(items, a, list) {
359	switch (a->family) {
360
361	case AF_INET:
362	key_ipv4->prefixlen = a->prefixlen;
363	memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));
364
365	r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
366	if (r < 0)
367	return r;
368
369	break;
370
371	case AF_INET6:
372	key_ipv6->prefixlen = a->prefixlen;
373	memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));
374
375	r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
376	if (r < 0)
377	return r;
378
379	break;
380
381	default:
382	return -EAFNOSUPPORT;
383	}
384	}
385
386	return 0;
387	}
388
389	static int bpf_firewall_prepare_access_maps(
390	Unit *u,
391	int verdict,
392	int *ret_ipv4_map_fd,
393	int *ret_ipv6_map_fd) {
394
395	_cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
396	size_t n_ipv4 = 0, n_ipv6 = 0;
397	Unit *p;
398	int r;
399
400	assert(ret_ipv4_map_fd);
401	assert(ret_ipv6_map_fd);
402
403	for (p = u; p; p = UNIT_DEREF(p->slice)) {
404	CGroupContext *cc;
405
406	cc = unit_get_cgroup_context(p);
407	if (!cc)
408	continue;
409
410	bpf_firewall_count_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, &n_ipv4, &n_ipv6);
411	}
412
413	if (n_ipv4 > 0) {
414	ipv4_map_fd = bpf_map_new(
415	BPF_MAP_TYPE_LPM_TRIE,
416	offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
417	sizeof(uint64_t),
418	n_ipv4,
419	BPF_F_NO_PREALLOC);
420	if (ipv4_map_fd < 0)
421	return ipv4_map_fd;
422	}
423
424	if (n_ipv6 > 0) {
425	ipv6_map_fd = bpf_map_new(
426	BPF_MAP_TYPE_LPM_TRIE,
427	offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
428	sizeof(uint64_t),
429	n_ipv6,
430	BPF_F_NO_PREALLOC);
431	if (ipv6_map_fd < 0)
432	return ipv6_map_fd;
433	}
434
435	for (p = u; p; p = UNIT_DEREF(p->slice)) {
436	CGroupContext *cc;
437
438	cc = unit_get_cgroup_context(p);
439	if (!cc)
440	continue;
441
442	r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
443	ipv4_map_fd, ipv6_map_fd, verdict);
444	if (r < 0)
445	return r;
446	}
447
448	*ret_ipv4_map_fd = ipv4_map_fd;
449	*ret_ipv6_map_fd = ipv6_map_fd;
450
451	ipv4_map_fd = ipv6_map_fd = -1;
452	return 0;
453	}
454
455	static int bpf_firewall_prepare_accounting_maps(bool enabled, int fd_ingress, int fd_egress) {
456	int r;
457
458	assert(fd_ingress);
459	assert(fd_egress);
460
461	if (enabled) {
462	if (*fd_ingress < 0) {
463	r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
464	if (r < 0)
465	return r;
466
467	*fd_ingress = r;
468	}
469
470	if (*fd_egress < 0) {
471
472	r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
473	if (r < 0)
474	return r;
475
476	*fd_egress = r;
477	}
478	} else {
479	fd_ingress = safe_close(fd_ingress);
480	fd_egress = safe_close(fd_egress);
481	}
482
483	return 0;
484	}
485
486	int bpf_firewall_compile(Unit *u) {
487	CGroupContext *cc;
488	int r;
489
490	assert(u);
491
492	r = bpf_firewall_supported();
493	if (r < 0)
494	return r;
495	if (r == 0) {
496	log_debug("BPF firewalling not supported on this systemd, proceeding without.");
497	return -EOPNOTSUPP;
498	}
499
500	/* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
501	* but we reuse the the accounting maps. That way the firewall in effect always maps to the actual
502	* configuration, but we don't flush out the accounting unnecessarily */
503
504	u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
505	u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);
506
507	u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
508	u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
509
510	u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
511	u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
512
513	cc = unit_get_cgroup_context(u);
514	if (!cc)
515	return -EINVAL;
516
517	r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd);
518	if (r < 0)
519	return log_error_errno(r, "Preparation of eBPF allow maps failed: %m");
520
521	r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd);
522	if (r < 0)
523	return log_error_errno(r, "Preparation of eBPF deny maps failed: %m");
524
525	r = bpf_firewall_prepare_accounting_maps(cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
526	if (r < 0)
527	return log_error_errno(r, "Preparation of eBPF accounting maps failed: %m");
528
529	r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress);
530	if (r < 0)
531	return log_error_errno(r, "Compilation for ingress BPF program failed: %m");
532
533	r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress);
534	if (r < 0)
535	return log_error_errno(r, "Compilation for egress BPF program failed: %m");
536
537	return 0;
538	}
539
540	int bpf_firewall_install(Unit *u) {
541	_cleanup_free_ char *path = NULL;
9f2e6892	542	CGroupContext *cc;
1988a9d1 DM	543	int r;
	544
	545	assert(u);
	546
9f2e6892 LP	547	if (!u->cgroup_path)
	548	return -EINVAL;
	549
	550	cc = unit_get_cgroup_context(u);
	551	if (!cc)
	552	return -EINVAL;
	553
1988a9d1 DM	554	r = bpf_firewall_supported();
	555	if (r < 0)
	556	return r;
	557	if (r == 0) {
	558	log_debug("BPF firewalling not supported on this systemd, proceeding without.");
	559	return -EOPNOTSUPP;
	560	}
	561
	562	r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
	563	if (r < 0)
	564	return log_error_errno(r, "Failed to determine cgroup path: %m");
	565
	566	if (u->ip_bpf_egress) {
	567	r = bpf_program_load_kernel(u->ip_bpf_egress, NULL, 0);
	568	if (r < 0)
	569	return log_error_errno(r, "Kernel upload of egress BPF program failed: %m");
	570
9f2e6892	571	r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, cc->delegate ? BPF_F_ALLOW_OVERRIDE : 0);
1988a9d1 DM	572	if (r < 0)
	573	return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path);
	574	} else {
	575	r = bpf_program_cgroup_detach(BPF_CGROUP_INET_EGRESS, path);
	576	if (r < 0)
	577	return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r,
	578	"Detaching egress BPF program from cgroup failed: %m");
	579	}
	580
	581	if (u->ip_bpf_ingress) {
	582	r = bpf_program_load_kernel(u->ip_bpf_ingress, NULL, 0);
	583	if (r < 0)
	584	return log_error_errno(r, "Kernel upload of ingress BPF program failed: %m");
	585
9f2e6892	586	r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, cc->delegate ? BPF_F_ALLOW_OVERRIDE : 0);
1988a9d1 DM	587	if (r < 0)
	588	return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path);
	589	} else {
	590	r = bpf_program_cgroup_detach(BPF_CGROUP_INET_INGRESS, path);
	591	if (r < 0)
	592	return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r,
	593	"Detaching ingress BPF program from cgroup failed: %m");
	594	}
	595
	596	return 0;
	597	}
	598
	599	int bpf_firewall_read_accounting(int map_fd, uint64_t ret_bytes, uint64_t ret_packets) {
	600	uint64_t key, packets;
	601	int r;
	602
	603	if (map_fd < 0)
	604	return -EBADF;
	605
	606	if (ret_packets) {
	607	key = MAP_KEY_PACKETS;
	608	r = bpf_map_lookup_element(map_fd, &key, &packets);
	609	if (r < 0)
	610	return r;
	611	}
	612
	613	if (ret_bytes) {
	614	key = MAP_KEY_BYTES;
	615	r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
	616	if (r < 0)
	617	return r;
	618	}
	619
	620	if (ret_packets)
	621	*ret_packets = packets;
	622
	623	return 0;
	624	}
	625
	626	int bpf_firewall_reset_accounting(int map_fd) {
	627	uint64_t key, value = 0;
	628	int r;
	629
	630	if (map_fd < 0)
	631	return -EBADF;
	632
	633	key = MAP_KEY_PACKETS;
	634	r = bpf_map_update_element(map_fd, &key, &value);
	635	if (r < 0)
	636	return r;
	637
	638	key = MAP_KEY_BYTES;
	639	return bpf_map_update_element(map_fd, &key, &value);
	640	}
	641
	642
	643	int bpf_firewall_supported(void) {
93e93da5 LP	644	struct bpf_insn trivial[] = {
	645	BPF_MOV64_IMM(BPF_REG_0, 1),
	646	BPF_EXIT_INSN()
	647	};
	648
	649	_cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
1988a9d1 DM	650	static int supported = -1;
	651	int fd, r;
	652
	653	/* Checks whether BPF firewalling is supported. For this, we check three things:
	654	*
	655	* a) whether we are privileged
	656	* b) whether the unified hierarchy is being used
	657	* c) the BPF implementation in the kernel supports BPF LPM TRIE maps, which we require
	658	*
	659	*/
	660
	661	if (supported >= 0)
	662	return supported;
	663
93e93da5 LP	664	if (geteuid() != 0) {
93e93da5 LP	665	log_debug("Not enough privileges, BPF firewalling is not supported.");
1988a9d1	666	return supported = false;
93e93da5	667	}
1988a9d1 DM	668
	669	r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
	670	if (r < 0)
	671	return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
	672	if (r == 0)
	673	return supported = false;
	674
	675	fd = bpf_map_new(BPF_MAP_TYPE_LPM_TRIE,
	676	offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint64_t),
	677	sizeof(uint64_t),
	678	1,
	679	BPF_F_NO_PREALLOC);
	680	if (fd < 0) {
	681	log_debug_errno(r, "Can't allocate BPF LPM TRIE map, BPF firewalling is not supported: %m");
	682	return supported = false;
	683	}
	684
	685	safe_close(fd);
	686
93e93da5 LP	687	if (bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program) < 0) {
	688	log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
	689	return supported = false;
	690	}
	691
	692	r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
	693	if (r < 0) {
	694	log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
	695	return supported = false;
	696	}
	697
	698	r = bpf_program_load_kernel(program, NULL, 0);
	699	if (r < 0) {
	700	log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
	701	return supported = false;
	702	}
	703
1988a9d1 DM	704	return supported = true;
1988a9d1 DM	705	}