src/core/bpf-firewall.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   Copyright 2016 Daniel Mack
   4 ***/
   5
   6 #include <arpa/inet.h>
   7 #include <assert.h>
   8 #include <errno.h>
   9 #include <fcntl.h>
  10 #include <linux/libbpf.h>
  11 #include <net/ethernet.h>
  12 #include <net/if.h>
  13 #include <netinet/ip.h>
  14 #include <netinet/ip6.h>
  15 #include <stddef.h>
  16 #include <stdio.h>
  17 #include <stdlib.h>
  18 #include <string.h>
  19 #include <unistd.h>
  20
  21 #include "alloc-util.h"
  22 #include "bpf-firewall.h"
  23 #include "bpf-program.h"
  24 #include "fd-util.h"
  25 #include "ip-address-access.h"
  26 #include "unit.h"
  27
  28 enum {
  29         MAP_KEY_PACKETS,
  30         MAP_KEY_BYTES,
  31 };
  32
  33 enum {
  34         ACCESS_ALLOWED = 1,
  35         ACCESS_DENIED  = 2,
  36 };
  37
  38 /* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */
  39
  40 static int add_lookup_instructions(
  41                 BPFProgram *p,
  42                 int map_fd,
  43                 int protocol,
  44                 bool is_ingress,
  45                 int verdict) {
  46
  47         int r, addr_offset, addr_size;
  48
  49         assert(p);
  50         assert(map_fd >= 0);
  51
  52         switch (protocol) {
  53
  54         case ETH_P_IP:
  55                 addr_size = sizeof(uint32_t);
  56                 addr_offset = is_ingress ?
  57                         offsetof(struct iphdr, saddr) :
  58                         offsetof(struct iphdr, daddr);
  59                 break;
  60
  61         case ETH_P_IPV6:
  62                 addr_size = 4 * sizeof(uint32_t);
  63                 addr_offset = is_ingress ?
  64                         offsetof(struct ip6_hdr, ip6_src.s6_addr) :
  65                         offsetof(struct ip6_hdr, ip6_dst.s6_addr);
  66                 break;
  67
  68         default:
  69                 return -EAFNOSUPPORT;
  70         }
  71
  72         do {
  73                 /* Compare IPv4 with one word instruction (32bit) */
  74                 struct bpf_insn insn[] = {
  75                         /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */
  76                         BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),
  77
  78                         /*
  79                          * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
  80                          *
  81                          * R1: Pointer to the skb
  82                          * R2: Data offset
  83                          * R3: Destination buffer on the stack (r10 - 4)
  84                          * R4: Number of bytes to read (4)
  85                          */
  86
  87                         BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
  88                         BPF_MOV32_IMM(BPF_REG_2, addr_offset),
  89
  90                         BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
  91                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),
  92
  93                         BPF_MOV32_IMM(BPF_REG_4, addr_size),
  94                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
  95
  96                         /*
  97                          * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
  98                          * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
  99                          * has to be set to the maximum possible value.
 100                          *
 101                          * On success, the looked up value is stored in R0. For this application, the actual
 102                          * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
 103                          * matching value.
 104                          */
 105
 106                         BPF_LD_MAP_FD(BPF_REG_1, map_fd),
 107                         BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 108                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
 109                         BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),
 110
 111                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 112                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
 113                         BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
 114                 };
 115
 116                 /* Jump label fixup */
 117                 insn[0].off = ELEMENTSOF(insn) - 1;
 118
 119                 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
 120                 if (r < 0)
 121                         return r;
 122
 123         } while (false);
 124
 125         return 0;
 126 }
 127
 128 static int bpf_firewall_compile_bpf(
 129                 Unit *u,
 130                 bool is_ingress,
 131                 BPFProgram **ret) {
 132
 133         struct bpf_insn pre_insn[] = {
 134                 /*
 135                  * When the eBPF program is entered, R1 contains the address of the skb.
 136                  * However, R1-R5 are scratch registers that are not preserved when calling
 137                  * into kernel functions, so we need to save anything that's supposed to
 138                  * stay around to R6-R9. Save the skb to R6.
 139                  */
 140                 BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
 141
 142                 /*
 143                  * Although we cannot access the skb data directly from eBPF programs used in this
 144                  * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
 145                  * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
 146                  * for later use.
 147                  */
 148                 BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),
 149
 150                 /*
 151                  * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
 152                  * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
 153                  */
 154                 BPF_MOV32_IMM(BPF_REG_8, 0),
 155         };
 156
 157         /*
 158          * The access checkers compiled for the configured allowance and denial lists
 159          * write to R8 at runtime. The following code prepares for an early exit that
 160          * skip the accounting if the packet is denied.
 161          *
 162          * R0 = 1
 163          * if (R8 == ACCESS_DENIED)
 164          *     R0 = 0
 165          *
 166          * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
 167          * is allowed to pass.
 168          */
 169         struct bpf_insn post_insn[] = {
 170                 BPF_MOV64_IMM(BPF_REG_0, 1),
 171                 BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
 172                 BPF_MOV64_IMM(BPF_REG_0, 0),
 173         };
 174
 175         _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
 176         int accounting_map_fd, r;
 177         bool access_enabled;
 178
 179         assert(u);
 180         assert(ret);
 181
 182         accounting_map_fd = is_ingress ?
 183                 u->ip_accounting_ingress_map_fd :
 184                 u->ip_accounting_egress_map_fd;
 185
 186         access_enabled =
 187                 u->ipv4_allow_map_fd >= 0 ||
 188                 u->ipv6_allow_map_fd >= 0 ||
 189                 u->ipv4_deny_map_fd >= 0 ||
 190                 u->ipv6_deny_map_fd >= 0;
 191
 192         if (accounting_map_fd < 0 && !access_enabled) {
 193                 *ret = NULL;
 194                 return 0;
 195         }
 196
 197         r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
 198         if (r < 0)
 199                 return r;
 200
 201         r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
 202         if (r < 0)
 203                 return r;
 204
 205         if (access_enabled) {
 206                 /*
 207                  * The simple rule this function translates into eBPF instructions is:
 208                  *
 209                  * - Access will be granted when an address matches an entry in @list_allow
 210                  * - Otherwise, access will be denied when an address matches an entry in @list_deny
 211                  * - Otherwise, access will be granted
 212                  */
 213
 214                 if (u->ipv4_deny_map_fd >= 0) {
 215                         r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
 216                         if (r < 0)
 217                                 return r;
 218                 }
 219
 220                 if (u->ipv6_deny_map_fd >= 0) {
 221                         r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
 222                         if (r < 0)
 223                                 return r;
 224                 }
 225
 226                 if (u->ipv4_allow_map_fd >= 0) {
 227                         r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
 228                         if (r < 0)
 229                                 return r;
 230                 }
 231
 232                 if (u->ipv6_allow_map_fd >= 0) {
 233                         r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
 234                         if (r < 0)
 235                                 return r;
 236                 }
 237         }
 238
 239         r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
 240         if (r < 0)
 241                 return r;
 242
 243         if (accounting_map_fd >= 0) {
 244                 struct bpf_insn insn[] = {
 245                         /*
 246                          * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
 247                          * The jump label will be fixed up later.
 248                          */
 249                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),
 250
 251                         /* Count packets */
 252                         BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
 253                         BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
 254                         BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 255                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
 256                         BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
 257                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 258                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
 259                         BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
 260                         BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
 261
 262                         /* Count bytes */
 263                         BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
 264                         BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
 265                         BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 266                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
 267                         BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
 268                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 269                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
 270                         BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
 271                         BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
 272
 273                         /* Allow the packet to pass */
 274                         BPF_MOV64_IMM(BPF_REG_0, 1),
 275                 };
 276
 277                 /* Jump label fixup */
 278                 insn[0].off = ELEMENTSOF(insn) - 1;
 279
 280                 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
 281                 if (r < 0)
 282                         return r;
 283         }
 284
 285         do {
 286                 /*
 287                  * Exit from the eBPF program, R0 contains the verdict.
 288                  * 0 means the packet is denied, 1 means the packet may pass.
 289                  */
 290                 struct bpf_insn insn[] = {
 291                         BPF_EXIT_INSN()
 292                 };
 293
 294                 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
 295                 if (r < 0)
 296                         return r;
 297         } while (false);
 298
 299         *ret = TAKE_PTR(p);
 300
 301         return 0;
 302 }
 303
 304 static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
 305         IPAddressAccessItem *a;
 306
 307         assert(n_ipv4);
 308         assert(n_ipv6);
 309
 310         LIST_FOREACH(items, a, list) {
 311                 switch (a->family) {
 312
 313                 case AF_INET:
 314                         (*n_ipv4)++;
 315                         break;
 316
 317                 case AF_INET6:
 318                         (*n_ipv6)++;
 319                         break;
 320
 321                 default:
 322                         return -EAFNOSUPPORT;
 323                 }
 324         }
 325
 326         return 0;
 327 }
 328
 329 static int bpf_firewall_add_access_items(
 330                 IPAddressAccessItem *list,
 331                 int ipv4_map_fd,
 332                 int ipv6_map_fd,
 333                 int verdict) {
 334
 335         struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
 336         uint64_t value = verdict;
 337         IPAddressAccessItem *a;
 338         int r;
 339
 340         key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
 341         key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);
 342
 343         LIST_FOREACH(items, a, list) {
 344                 switch (a->family) {
 345
 346                 case AF_INET:
 347                         key_ipv4->prefixlen = a->prefixlen;
 348                         memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));
 349
 350                         r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
 351                         if (r < 0)
 352                                 return r;
 353
 354                         break;
 355
 356                 case AF_INET6:
 357                         key_ipv6->prefixlen = a->prefixlen;
 358                         memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));
 359
 360                         r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
 361                         if (r < 0)
 362                                 return r;
 363
 364                         break;
 365
 366                 default:
 367                         return -EAFNOSUPPORT;
 368                 }
 369         }
 370
 371         return 0;
 372 }
 373
 374 static int bpf_firewall_prepare_access_maps(
 375                 Unit *u,
 376                 int verdict,
 377                 int *ret_ipv4_map_fd,
 378                 int *ret_ipv6_map_fd) {
 379
 380         _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
 381         size_t n_ipv4 = 0, n_ipv6 = 0;
 382         Unit *p;
 383         int r;
 384
 385         assert(ret_ipv4_map_fd);
 386         assert(ret_ipv6_map_fd);
 387
 388         for (p = u; p; p = UNIT_DEREF(p->slice)) {
 389                 CGroupContext *cc;
 390
 391                 cc = unit_get_cgroup_context(p);
 392                 if (!cc)
 393                         continue;
 394
 395                 bpf_firewall_count_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, &n_ipv4, &n_ipv6);
 396         }
 397
 398         if (n_ipv4 > 0) {
 399                 ipv4_map_fd = bpf_map_new(
 400                                 BPF_MAP_TYPE_LPM_TRIE,
 401                                 offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
 402                                 sizeof(uint64_t),
 403                                 n_ipv4,
 404                                 BPF_F_NO_PREALLOC);
 405                 if (ipv4_map_fd < 0)
 406                         return ipv4_map_fd;
 407         }
 408
 409         if (n_ipv6 > 0) {
 410                 ipv6_map_fd = bpf_map_new(
 411                                 BPF_MAP_TYPE_LPM_TRIE,
 412                                 offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
 413                                 sizeof(uint64_t),
 414                                 n_ipv6,
 415                                 BPF_F_NO_PREALLOC);
 416                 if (ipv6_map_fd < 0)
 417                         return ipv6_map_fd;
 418         }
 419
 420         for (p = u; p; p = UNIT_DEREF(p->slice)) {
 421                 CGroupContext *cc;
 422
 423                 cc = unit_get_cgroup_context(p);
 424                 if (!cc)
 425                         continue;
 426
 427                 r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
 428                                                   ipv4_map_fd, ipv6_map_fd, verdict);
 429                 if (r < 0)
 430                         return r;
 431         }
 432
 433         *ret_ipv4_map_fd = ipv4_map_fd;
 434         *ret_ipv6_map_fd = ipv6_map_fd;
 435
 436         ipv4_map_fd = ipv6_map_fd = -1;
 437         return 0;
 438 }
 439
 440 static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
 441         int r;
 442
 443         assert(u);
 444         assert(fd_ingress);
 445         assert(fd_egress);
 446
 447         if (enabled) {
 448                 if (*fd_ingress < 0) {
 449                         r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
 450                         if (r < 0)
 451                                 return r;
 452
 453                         *fd_ingress = r;
 454                 }
 455
 456                 if (*fd_egress < 0) {
 457
 458                         r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
 459                         if (r < 0)
 460                                 return r;
 461
 462                         *fd_egress = r;
 463                 }
 464
 465         } else {
 466                 *fd_ingress = safe_close(*fd_ingress);
 467                 *fd_egress = safe_close(*fd_egress);
 468
 469                 zero(u->ip_accounting_extra);
 470         }
 471
 472         return 0;
 473 }
 474
 475 int bpf_firewall_compile(Unit *u) {
 476         CGroupContext *cc;
 477         int r, supported;
 478
 479         assert(u);
 480
 481         cc = unit_get_cgroup_context(u);
 482         if (!cc)
 483                 return -EINVAL;
 484
 485         supported = bpf_firewall_supported();
 486         if (supported < 0)
 487                 return supported;
 488         if (supported == BPF_FIREWALL_UNSUPPORTED) {
 489                 log_debug("BPF firewalling not supported on this manager, proceeding without.");
 490                 return -EOPNOTSUPP;
 491         }
 492         if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
 493                 /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
 494                  * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
 495                  * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
 496                  * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
 497                  * all, either. */
 498                 log_debug("BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
 499                 return -EOPNOTSUPP;
 500         }
 501
 502         /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
 503          * but we reuse the the accounting maps. That way the firewall in effect always maps to the actual
 504          * configuration, but we don't flush out the accounting unnecessarily */
 505
 506         u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
 507         u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);
 508
 509         u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
 510         u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
 511
 512         u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
 513         u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
 514
 515         if (u->type != UNIT_SLICE) {
 516                 /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
 517                  * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
 518                  * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
 519                  * means that all configure IP access rules *will* take effect on processes, even though we never
 520                  * compile them for inner nodes. */
 521
 522                 r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd);
 523                 if (r < 0)
 524                         return log_error_errno(r, "Preparation of eBPF allow maps failed: %m");
 525
 526                 r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd);
 527                 if (r < 0)
 528                         return log_error_errno(r, "Preparation of eBPF deny maps failed: %m");
 529         }
 530
 531         r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
 532         if (r < 0)
 533                 return log_error_errno(r, "Preparation of eBPF accounting maps failed: %m");
 534
 535         r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress);
 536         if (r < 0)
 537                 return log_error_errno(r, "Compilation for ingress BPF program failed: %m");
 538
 539         r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress);
 540         if (r < 0)
 541                 return log_error_errno(r, "Compilation for egress BPF program failed: %m");
 542
 543         return 0;
 544 }
 545
 546 int bpf_firewall_install(Unit *u) {
 547         _cleanup_free_ char *path = NULL;
 548         CGroupContext *cc;
 549         int r, supported;
 550         uint32_t flags;
 551
 552         assert(u);
 553
 554         cc = unit_get_cgroup_context(u);
 555         if (!cc)
 556                 return -EINVAL;
 557         if (!u->cgroup_path)
 558                 return -EINVAL;
 559         if (!u->cgroup_realized)
 560                 return -EINVAL;
 561
 562         supported = bpf_firewall_supported();
 563         if (supported < 0)
 564                 return supported;
 565         if (supported == BPF_FIREWALL_UNSUPPORTED) {
 566                 log_debug("BPF firewalling not supported on this manager, proceeding without.");
 567                 return -EOPNOTSUPP;
 568         }
 569         if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
 570                 log_debug("BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
 571                 return -EOPNOTSUPP;
 572         }
 573
 574         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
 575         if (r < 0)
 576                 return log_error_errno(r, "Failed to determine cgroup path: %m");
 577
 578         flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
 579                  (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;
 580
 581         /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
 582          * minimize the time window when we don't account for IP traffic. */
 583         u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
 584         u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);
 585
 586         if (u->ip_bpf_egress) {
 587                 r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
 588                 if (r < 0)
 589                         return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path);
 590
 591                 /* Remember that this BPF program is installed now. */
 592                 u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
 593         }
 594
 595         if (u->ip_bpf_ingress) {
 596                 r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
 597                 if (r < 0)
 598                         return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path);
 599
 600                 u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
 601         }
 602
 603         return 0;
 604 }
 605
 606 int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
 607         uint64_t key, packets;
 608         int r;
 609
 610         if (map_fd < 0)
 611                 return -EBADF;
 612
 613         if (ret_packets) {
 614                 key = MAP_KEY_PACKETS;
 615                 r = bpf_map_lookup_element(map_fd, &key, &packets);
 616                 if (r < 0)
 617                         return r;
 618         }
 619
 620         if (ret_bytes) {
 621                 key = MAP_KEY_BYTES;
 622                 r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
 623                 if (r < 0)
 624                         return r;
 625         }
 626
 627         if (ret_packets)
 628                 *ret_packets = packets;
 629
 630         return 0;
 631 }
 632
 633 int bpf_firewall_reset_accounting(int map_fd) {
 634         uint64_t key, value = 0;
 635         int r;
 636
 637         if (map_fd < 0)
 638                 return -EBADF;
 639
 640         key = MAP_KEY_PACKETS;
 641         r = bpf_map_update_element(map_fd, &key, &value);
 642         if (r < 0)
 643                 return r;
 644
 645         key = MAP_KEY_BYTES;
 646         return bpf_map_update_element(map_fd, &key, &value);
 647 }
 648
 649 int bpf_firewall_supported(void) {
 650         struct bpf_insn trivial[] = {
 651                 BPF_MOV64_IMM(BPF_REG_0, 1),
 652                 BPF_EXIT_INSN()
 653         };
 654
 655         _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
 656         static int supported = -1;
 657         union bpf_attr attr;
 658         int fd, r;
 659
 660         /* Checks whether BPF firewalling is supported. For this, we check five things:
 661          *
 662          * a) whether we are privileged
 663          * b) whether the unified hierarchy is being used
 664          * c) the BPF implementation in the kernel supports BPF LPM TRIE maps, which we require
 665          * d) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
 666          * e) the BPF implementation in the kernel supports the BPF_PROG_ATTACH call, which we require
 667          *
 668          */
 669
 670         if (supported >= 0)
 671                 return supported;
 672
 673         if (geteuid() != 0) {
 674                 log_debug("Not enough privileges, BPF firewalling is not supported.");
 675                 return supported = BPF_FIREWALL_UNSUPPORTED;
 676         }
 677
 678         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
 679         if (r < 0)
 680                 return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
 681         if (r == 0) {
 682                 log_debug("Not running with unified cgroups, BPF firewalling is not supported.");
 683                 return supported = BPF_FIREWALL_UNSUPPORTED;
 684         }
 685
 686         fd = bpf_map_new(BPF_MAP_TYPE_LPM_TRIE,
 687                          offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint64_t),
 688                          sizeof(uint64_t),
 689                          1,
 690                          BPF_F_NO_PREALLOC);
 691         if (fd < 0) {
 692                 log_debug_errno(fd, "Can't allocate BPF LPM TRIE map, BPF firewalling is not supported: %m");
 693                 return supported = BPF_FIREWALL_UNSUPPORTED;
 694         }
 695
 696         safe_close(fd);
 697
 698         r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
 699         if (r < 0) {
 700                 log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
 701                 return supported = BPF_FIREWALL_UNSUPPORTED;
 702         }
 703
 704         r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
 705         if (r < 0) {
 706                 log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
 707                 return supported = BPF_FIREWALL_UNSUPPORTED;
 708         }
 709
 710         r = bpf_program_load_kernel(program, NULL, 0);
 711         if (r < 0) {
 712                 log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
 713                 return supported = BPF_FIREWALL_UNSUPPORTED;
 714         }
 715
 716         /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
 717          * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
 718          * program if we can't do a thing with it later?
 719          *
 720          * We detect this case by issuing the BPF_PROG_ATTACH bpf() call with invalid file descriptors: if
 721          * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
 722          * parameters are validated however, and that'll fail with EBADF then. */
 723
 724         attr = (union bpf_attr) {
 725                 .attach_type = BPF_CGROUP_INET_EGRESS,
 726                 .target_fd = -1,
 727                 .attach_bpf_fd = -1,
 728         };
 729
 730         if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
 731                 if (errno != EBADF) {
 732                         log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_ATTACH, BPF firewalling is not supported: %m");
 733                         return supported = BPF_FIREWALL_UNSUPPORTED;
 734                 }
 735
 736                 /* YAY! */
 737         } else {
 738                 log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
 739                 return supported = BPF_FIREWALL_UNSUPPORTED;
 740         }
 741
 742         /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
 743          * (which was added in kernel 4.15). We use a similar logic as before, but this time we use
 744          * BPF_F_ALLOW_MULTI. Since the flags are checked early in the system call we'll get EINVAL if it's not
 745          * supported, and EBADF as before if it is available. */
 746
 747         attr = (union bpf_attr) {
 748                 .attach_type = BPF_CGROUP_INET_EGRESS,
 749                 .target_fd = -1,
 750                 .attach_bpf_fd = -1,
 751                 .attach_flags = BPF_F_ALLOW_MULTI,
 752         };
 753
 754         if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
 755                 if (errno == EBADF) {
 756                         log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
 757                         return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
 758                 }
 759
 760                 if (errno == EINVAL)
 761                         log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
 762                 else
 763                         log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");
 764
 765                 return supported = BPF_FIREWALL_SUPPORTED;
 766         } else {
 767                 log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
 768                 return supported = BPF_FIREWALL_UNSUPPORTED;
 769         }
 770 }