src/core/bpf-firewall.c
/* SPDX-License-Identifier: LGPL-2.1+ */

#include <arpa/inet.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/libbpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "fd-util.h"
#include "ip-address-access.h"
#include "missing_syscall.h"
#include "unit.h"

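/* Keys of the per-direction accounting array maps; each map stores one uint64_t counter per key. */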
enum {
        MAP_KEY_PACKETS,
        MAP_KEY_BYTES,
};

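/* Verdict bits which the generated lookup code ORs into R8 whenever an address matches an allow or deny map. */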
enum {
        ACCESS_ALLOWED = 1,
        ACCESS_DENIED = 2,
};

/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }

        do {
                /* Compare IPv4 with one word instruction (32bit) */
                struct bpf_insn insn[] = {
                        /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                        /*
                         * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                         *
                         * R1: Pointer to the skb
                         * R2: Data offset
                         * R3: Destination buffer on the stack (r10 - 4)
                         * R4: Number of bytes to read (4)
                         */

                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                        /*
                         * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                         * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                         * has to be set to the maximum possible value.
                         *
                         * On success, the looked up value is stored in R0. For this application, the actual
                         * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
                         * matching value.
                         */

                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),

                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;

        } while (false);

        return 0;
}

static int bpf_firewall_compile_bpf(
                Unit *u,
                bool is_ingress,
                BPFProgram **ret) {

        struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
                 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * The access checkers compiled for the configured allowance and denial lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied.
         *
         * R0 = 1
         * if (R8 == ACCESS_DENIED)
         *     R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;

        assert(u);
        assert(ret);

        accounting_map_fd = is_ingress ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        access_enabled =
                u->ipv4_allow_map_fd >= 0 ||
                u->ipv6_allow_map_fd >= 0 ||
                u->ipv4_deny_map_fd >= 0 ||
                u->ipv6_deny_map_fd >= 0;

        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in @list_allow
                 * - Otherwise, access will be denied when an address matches an entry in @list_deny
                 * - Otherwise, access will be granted
                 */

                if (u->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }

        do {
                /*
                 * Exit from the eBPF program, R0 contains the verdict.
                 * 0 means the packet is denied, 1 means the packet may pass.
                 */
                struct bpf_insn insn[] = {
                        BPF_EXIT_INSN()
                };

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        } while (false);

        *ret = TAKE_PTR(p);

        return 0;
}

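/* Count how many IPv4 and IPv6 entries an access list contains, so that the LPM trie maps can be sized accordingly. */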
static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
        IPAddressAccessItem *a;

        assert(n_ipv4);
        assert(n_ipv6);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        (*n_ipv4)++;
                        break;

                case AF_INET6:
                        (*n_ipv6)++;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

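/* Add each entry of an access list to the IPv4 or IPv6 LPM trie map, storing the verdict as the looked-up value. */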
static int bpf_firewall_add_access_items(
                IPAddressAccessItem *list,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        uint64_t value = verdict;
        IPAddressAccessItem *a;
        int r;

        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;

                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;

                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

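/* Allocate the allow or deny LPM trie maps for a unit and fill them with the IP access lists configured on the unit and on all of its parent slices. */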
static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd) {

        _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        Unit *p;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                bpf_firewall_count_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, &n_ipv4, &n_ipv6);
        }

        if (n_ipv4 > 0) {
                ipv4_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                ipv6_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }

        *ret_ipv4_map_fd = ipv4_map_fd;
        *ret_ipv6_map_fd = ipv6_map_fd;

        ipv4_map_fd = ipv6_map_fd = -1;
        return 0;
}

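/* Allocate the ingress and egress accounting maps if accounting is enabled, or close them (and reset the extra counters) if it is not. */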
static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
        int r;

        assert(u);
        assert(fd_ingress);
        assert(fd_egress);

        if (enabled) {
                if (*fd_ingress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_ingress = r;
                }

                if (*fd_egress < 0) {

                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_egress = r;
                }

        } else {
                *fd_ingress = safe_close(*fd_ingress);
                *fd_egress = safe_close(*fd_egress);

                zero(u->ip_accounting_extra);
        }

        return 0;
}

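/* Rebuild the ingress and egress eBPF programs and the access maps for a unit from its current IP firewall configuration. */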
int bpf_firewall_compile(Unit *u) {
        CGroupContext *cc;
        int r, supported;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_unit_debug(u, "BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
                 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
                 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
                 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
                 * all, either. */
                log_unit_debug(u, "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily. */

        u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);

        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting; we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes, this
                 * means that all configured IP access rules *will* take effect on processes, even though we never
                 * compile them for inner nodes. */

                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF allow maps failed: %m");

                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF deny maps failed: %m");
        }

        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_unit_error_errno(u, r, "Preparation of eBPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation for ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation for egress BPF program failed: %m");

        return 0;
}

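/* Attach the compiled ingress and egress programs to the unit's cgroup, replacing any previously installed programs. */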
int bpf_firewall_install(Unit *u) {
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        int r, supported;
        uint32_t flags;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;
        if (!u->cgroup_path)
                return -EINVAL;
        if (!u->cgroup_realized)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_unit_debug(u, "BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                log_unit_debug(u, "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m");

        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;

        /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
         * minimize the time window when we don't account for IP traffic. */
        u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
        u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);

        if (u->ip_bpf_egress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path);

                /* Remember that this BPF program is installed now. */
                u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
        }

        if (u->ip_bpf_ingress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path);

                u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
        }

        return 0;
}

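/* Read the packet and/or byte counter from an accounting map. */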
int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

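        /* Stage the packet counter in a local variable, so that *ret_packets is left untouched if the byte counter lookup fails afterwards. */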
        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

        if (ret_packets)
                *ret_packets = packets;

        return 0;
}

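/* Reset both the packet and the byte counter of an accounting map to zero. */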
int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}

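/* Probe whether the running kernel and cgroup setup support the BPF firewall; the result is determined once and then cached. */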
int bpf_firewall_supported(void) {
        struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
        static int supported = -1;
        union bpf_attr attr;
        int fd, r;

        /* Checks whether BPF firewalling is supported. For this, we check five things:
         *
         * a) whether we are privileged
         * b) whether the unified hierarchy is being used
         * c) the BPF implementation in the kernel supports BPF LPM TRIE maps, which we require
         * d) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
         * e) the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
         */

        if (supported >= 0)
                return supported;

        if (geteuid() != 0) {
                log_debug("Not enough privileges, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
        if (r == 0) {
                log_debug("Not running with unified cgroups, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        fd = bpf_map_new(BPF_MAP_TYPE_LPM_TRIE,
                         offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint64_t),
                         sizeof(uint64_t),
                         1,
                         BPF_F_NO_PREALLOC);
        if (fd < 0) {
                log_debug_errno(fd, "Can't allocate BPF LPM TRIE map, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        safe_close(fd);

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
        if (r < 0) {
                log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
         * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
         * program if we can't do a thing with it later?
         *
         * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
         * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
         * parameters are validated however, and that'll fail with EBADF then. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
        };

        if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
                if (errno != EBADF) {
                        log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
                        return supported = BPF_FIREWALL_UNSUPPORTED;
                }

                /* YAY! */
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_DETACH call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
         * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH
         * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
         * get EINVAL if it's not supported, and EBADF as before if it is available. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
                .attach_flags = BPF_F_ALLOW_MULTI,
        };

        if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
                if (errno == EBADF) {
                        log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
                        return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
                }

                if (errno == EINVAL)
                        log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
                else
                        log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");

                return supported = BPF_FIREWALL_SUPPORTED;
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }
}
767 }