src/core/bpf-firewall.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2
   3 #include <arpa/inet.h>
   4 #include <assert.h>
   5 #include <errno.h>
   6 #include <fcntl.h>
   7 #include <linux/bpf_insn.h>
   8 #include <net/ethernet.h>
   9 #include <net/if.h>
  10 #include <netinet/ip.h>
  11 #include <netinet/ip6.h>
  12 #include <stddef.h>
  13 #include <stdio.h>
  14 #include <stdlib.h>
  15 #include <unistd.h>
  16
  17 #include "alloc-util.h"
  18 #include "bpf-firewall.h"
  19 #include "bpf-program.h"
  20 #include "fd-util.h"
  21 #include "ip-address-access.h"
  22 #include "memory-util.h"
  23 #include "missing_syscall.h"
  24 #include "unit.h"
  25 #include "strv.h"
  26 #include "virt.h"
  27
  28 enum {
  29         MAP_KEY_PACKETS,
  30         MAP_KEY_BYTES,
  31 };
  32
  33 enum {
  34         ACCESS_ALLOWED = 1,
  35         ACCESS_DENIED  = 2,
  36 };
  37
  38 /* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */
  39
  40 static int add_lookup_instructions(
  41                 BPFProgram *p,
  42                 int map_fd,
  43                 int protocol,
  44                 bool is_ingress,
  45                 int verdict) {
  46
  47         int r, addr_offset, addr_size;
  48
  49         assert(p);
  50         assert(map_fd >= 0);
  51
  52         switch (protocol) {
  53
  54         case ETH_P_IP:
  55                 addr_size = sizeof(uint32_t);
  56                 addr_offset = is_ingress ?
  57                         offsetof(struct iphdr, saddr) :
  58                         offsetof(struct iphdr, daddr);
  59                 break;
  60
  61         case ETH_P_IPV6:
  62                 addr_size = 4 * sizeof(uint32_t);
  63                 addr_offset = is_ingress ?
  64                         offsetof(struct ip6_hdr, ip6_src.s6_addr) :
  65                         offsetof(struct ip6_hdr, ip6_dst.s6_addr);
  66                 break;
  67
  68         default:
  69                 return -EAFNOSUPPORT;
  70         }
  71
  72         do {
  73                 /* Compare IPv4 with one word instruction (32bit) */
  74                 struct bpf_insn insn[] = {
  75                         /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */
  76                         BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),
  77
  78                         /*
  79                          * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
  80                          *
  81                          * R1: Pointer to the skb
  82                          * R2: Data offset
  83                          * R3: Destination buffer on the stack (r10 - 4)
  84                          * R4: Number of bytes to read (4)
  85                          */
  86
  87                         BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
  88                         BPF_MOV32_IMM(BPF_REG_2, addr_offset),
  89
  90                         BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
  91                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),
  92
  93                         BPF_MOV32_IMM(BPF_REG_4, addr_size),
  94                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
  95
  96                         /*
  97                          * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
  98                          * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
  99                          * has to be set to the maximum possible value.
 100                          *
 101                          * On success, the looked up value is stored in R0. For this application, the actual
 102                          * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
 103                          * matching value.
 104                          */
 105
 106                         BPF_LD_MAP_FD(BPF_REG_1, map_fd),
 107                         BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 108                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
 109                         BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),
 110
 111                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 112                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
 113                         BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
 114                 };
 115
 116                 /* Jump label fixup */
 117                 insn[0].off = ELEMENTSOF(insn) - 1;
 118
 119                 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
 120                 if (r < 0)
 121                         return r;
 122
 123         } while (false);
 124
 125         return 0;
 126 }
 127
 128 static int add_instructions_for_ip_any(
 129                 BPFProgram *p,
 130                 int verdict) {
 131         int r;
 132
 133         assert(p);
 134
 135         const struct bpf_insn insn[] = {
 136                 BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
 137         };
 138
 139         r = bpf_program_add_instructions(p, insn, 1);
 140         if (r < 0)
 141                 return r;
 142
 143         return 0;
 144 }
 145
 146 static int bpf_firewall_compile_bpf(
 147                 Unit *u,
 148                 bool is_ingress,
 149                 BPFProgram **ret,
 150                 bool ip_allow_any,
 151                 bool ip_deny_any) {
 152
 153         const struct bpf_insn pre_insn[] = {
 154                 /*
 155                  * When the eBPF program is entered, R1 contains the address of the skb.
 156                  * However, R1-R5 are scratch registers that are not preserved when calling
 157                  * into kernel functions, so we need to save anything that's supposed to
 158                  * stay around to R6-R9. Save the skb to R6.
 159                  */
 160                 BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
 161
 162                 /*
 163                  * Although we cannot access the skb data directly from eBPF programs used in this
 164                  * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
 165                  * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
 166                  * for later use.
 167                  */
 168                 BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),
 169
 170                 /*
 171                  * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
 172                  * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
 173                  */
 174                 BPF_MOV32_IMM(BPF_REG_8, 0),
 175         };
 176
 177         /*
 178          * The access checkers compiled for the configured allowance and denial lists
 179          * write to R8 at runtime. The following code prepares for an early exit that
 180          * skip the accounting if the packet is denied.
 181          *
 182          * R0 = 1
 183          * if (R8 == ACCESS_DENIED)
 184          *     R0 = 0
 185          *
 186          * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
 187          * is allowed to pass.
 188          */
 189         const struct bpf_insn post_insn[] = {
 190                 BPF_MOV64_IMM(BPF_REG_0, 1),
 191                 BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
 192                 BPF_MOV64_IMM(BPF_REG_0, 0),
 193         };
 194
 195         _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
 196         int accounting_map_fd, r;
 197         bool access_enabled;
 198
 199         assert(u);
 200         assert(ret);
 201
 202         accounting_map_fd = is_ingress ?
 203                 u->ip_accounting_ingress_map_fd :
 204                 u->ip_accounting_egress_map_fd;
 205
 206         access_enabled =
 207                 u->ipv4_allow_map_fd >= 0 ||
 208                 u->ipv6_allow_map_fd >= 0 ||
 209                 u->ipv4_deny_map_fd >= 0 ||
 210                 u->ipv6_deny_map_fd >= 0 ||
 211                 ip_allow_any ||
 212                 ip_deny_any;
 213
 214         if (accounting_map_fd < 0 && !access_enabled) {
 215                 *ret = NULL;
 216                 return 0;
 217         }
 218
 219         r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
 220         if (r < 0)
 221                 return r;
 222
 223         r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
 224         if (r < 0)
 225                 return r;
 226
 227         if (access_enabled) {
 228                 /*
 229                  * The simple rule this function translates into eBPF instructions is:
 230                  *
 231                  * - Access will be granted when an address matches an entry in @list_allow
 232                  * - Otherwise, access will be denied when an address matches an entry in @list_deny
 233                  * - Otherwise, access will be granted
 234                  */
 235
 236                 if (u->ipv4_deny_map_fd >= 0) {
 237                         r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
 238                         if (r < 0)
 239                                 return r;
 240                 }
 241
 242                 if (u->ipv6_deny_map_fd >= 0) {
 243                         r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
 244                         if (r < 0)
 245                                 return r;
 246                 }
 247
 248                 if (u->ipv4_allow_map_fd >= 0) {
 249                         r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
 250                         if (r < 0)
 251                                 return r;
 252                 }
 253
 254                 if (u->ipv6_allow_map_fd >= 0) {
 255                         r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
 256                         if (r < 0)
 257                                 return r;
 258                 }
 259
 260                 if (ip_allow_any) {
 261                         r = add_instructions_for_ip_any(p, ACCESS_ALLOWED);
 262                         if (r < 0)
 263                                 return r;
 264                 }
 265
 266                 if (ip_deny_any) {
 267                         r = add_instructions_for_ip_any(p, ACCESS_DENIED);
 268                         if (r < 0)
 269                                 return r;
 270                 }
 271         }
 272
 273         r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
 274         if (r < 0)
 275                 return r;
 276
 277         if (accounting_map_fd >= 0) {
 278                 struct bpf_insn insn[] = {
 279                         /*
 280                          * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
 281                          * The jump label will be fixed up later.
 282                          */
 283                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),
 284
 285                         /* Count packets */
 286                         BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
 287                         BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
 288                         BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 289                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
 290                         BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
 291                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 292                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
 293                         BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
 294                         BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
 295
 296                         /* Count bytes */
 297                         BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
 298                         BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
 299                         BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
 300                         BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
 301                         BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
 302                         BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 303                         BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
 304                         BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
 305                         BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
 306
 307                         /* Allow the packet to pass */
 308                         BPF_MOV64_IMM(BPF_REG_0, 1),
 309                 };
 310
 311                 /* Jump label fixup */
 312                 insn[0].off = ELEMENTSOF(insn) - 1;
 313
 314                 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
 315                 if (r < 0)
 316                         return r;
 317         }
 318
 319         do {
 320                 /*
 321                  * Exit from the eBPF program, R0 contains the verdict.
 322                  * 0 means the packet is denied, 1 means the packet may pass.
 323                  */
 324                 const struct bpf_insn insn[] = {
 325                         BPF_EXIT_INSN()
 326                 };
 327
 328                 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
 329                 if (r < 0)
 330                         return r;
 331         } while (false);
 332
 333         *ret = TAKE_PTR(p);
 334
 335         return 0;
 336 }
 337
 338 static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
 339         IPAddressAccessItem *a;
 340
 341         assert(n_ipv4);
 342         assert(n_ipv6);
 343
 344         LIST_FOREACH(items, a, list) {
 345                 switch (a->family) {
 346
 347                 case AF_INET:
 348                         (*n_ipv4)++;
 349                         break;
 350
 351                 case AF_INET6:
 352                         (*n_ipv6)++;
 353                         break;
 354
 355                 default:
 356                         return -EAFNOSUPPORT;
 357                 }
 358         }
 359
 360         return 0;
 361 }
 362
 363 static int bpf_firewall_add_access_items(
 364                 IPAddressAccessItem *list,
 365                 int ipv4_map_fd,
 366                 int ipv6_map_fd,
 367                 int verdict) {
 368
 369         struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
 370         uint64_t value = verdict;
 371         IPAddressAccessItem *a;
 372         int r;
 373
 374         key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
 375         key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);
 376
 377         LIST_FOREACH(items, a, list) {
 378                 switch (a->family) {
 379
 380                 case AF_INET:
 381                         key_ipv4->prefixlen = a->prefixlen;
 382                         memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));
 383
 384                         r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
 385                         if (r < 0)
 386                                 return r;
 387
 388                         break;
 389
 390                 case AF_INET6:
 391                         key_ipv6->prefixlen = a->prefixlen;
 392                         memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));
 393
 394                         r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
 395                         if (r < 0)
 396                                 return r;
 397
 398                         break;
 399
 400                 default:
 401                         return -EAFNOSUPPORT;
 402                 }
 403         }
 404
 405         return 0;
 406 }
 407
 408 static int bpf_firewall_prepare_access_maps(
 409                 Unit *u,
 410                 int verdict,
 411                 int *ret_ipv4_map_fd,
 412                 int *ret_ipv6_map_fd,
 413                 bool *ret_has_any) {
 414
 415         _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
 416         size_t n_ipv4 = 0, n_ipv6 = 0;
 417         IPAddressAccessItem *list;
 418         Unit *p;
 419         int r;
 420
 421         assert(ret_ipv4_map_fd);
 422         assert(ret_ipv6_map_fd);
 423         assert(ret_has_any);
 424
 425         for (p = u; p; p = UNIT_DEREF(p->slice)) {
 426                 CGroupContext *cc;
 427
 428                 cc = unit_get_cgroup_context(p);
 429                 if (!cc)
 430                         continue;
 431
 432                 list = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny;
 433
 434                 bpf_firewall_count_access_items(list, &n_ipv4, &n_ipv6);
 435
 436                 /* Skip making the LPM trie map in cases where we are using "any" in order to hack around
 437                  * needing CAP_SYS_ADMIN for allocating LPM trie map. */
 438                 if (ip_address_access_item_is_any(list)) {
 439                         *ret_has_any = true;
 440                         return 0;
 441                 }
 442         }
 443
 444         if (n_ipv4 > 0) {
 445                 ipv4_map_fd = bpf_map_new(
 446                                 BPF_MAP_TYPE_LPM_TRIE,
 447                                 offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
 448                                 sizeof(uint64_t),
 449                                 n_ipv4,
 450                                 BPF_F_NO_PREALLOC);
 451                 if (ipv4_map_fd < 0)
 452                         return ipv4_map_fd;
 453         }
 454
 455         if (n_ipv6 > 0) {
 456                 ipv6_map_fd = bpf_map_new(
 457                                 BPF_MAP_TYPE_LPM_TRIE,
 458                                 offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
 459                                 sizeof(uint64_t),
 460                                 n_ipv6,
 461                                 BPF_F_NO_PREALLOC);
 462                 if (ipv6_map_fd < 0)
 463                         return ipv6_map_fd;
 464         }
 465
 466         for (p = u; p; p = UNIT_DEREF(p->slice)) {
 467                 CGroupContext *cc;
 468
 469                 cc = unit_get_cgroup_context(p);
 470                 if (!cc)
 471                         continue;
 472
 473                 r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
 474                                                   ipv4_map_fd, ipv6_map_fd, verdict);
 475                 if (r < 0)
 476                         return r;
 477         }
 478
 479         *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
 480         *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
 481         *ret_has_any = false;
 482         return 0;
 483 }
 484
 485 static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
 486         int r;
 487
 488         assert(u);
 489         assert(fd_ingress);
 490         assert(fd_egress);
 491
 492         if (enabled) {
 493                 if (*fd_ingress < 0) {
 494                         r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
 495                         if (r < 0)
 496                                 return r;
 497
 498                         *fd_ingress = r;
 499                 }
 500
 501                 if (*fd_egress < 0) {
 502
 503                         r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
 504                         if (r < 0)
 505                                 return r;
 506
 507                         *fd_egress = r;
 508                 }
 509
 510         } else {
 511                 *fd_ingress = safe_close(*fd_ingress);
 512                 *fd_egress = safe_close(*fd_egress);
 513
 514                 zero(u->ip_accounting_extra);
 515         }
 516
 517         return 0;
 518 }
 519
 520 int bpf_firewall_compile(Unit *u) {
 521         CGroupContext *cc;
 522         int r, supported;
 523         bool ip_allow_any = false, ip_deny_any = false;
 524
 525         assert(u);
 526
 527         cc = unit_get_cgroup_context(u);
 528         if (!cc)
 529                 return -EINVAL;
 530
 531         supported = bpf_firewall_supported();
 532         if (supported < 0)
 533                 return supported;
 534         if (supported == BPF_FIREWALL_UNSUPPORTED)
 535                 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
 536                                             "BPF firewalling not supported on this manager, proceeding without.");
 537         if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
 538                 /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
 539                  * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
 540                  * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
 541                  * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
 542                  * all, either. */
 543                 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
 544                                             "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
 545
 546         /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
 547          * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
 548          * configuration, but we don't flush out the accounting unnecessarily */
 549
 550         u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
 551         u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);
 552
 553         u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
 554         u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
 555
 556         u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
 557         u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
 558
 559         if (u->type != UNIT_SLICE) {
 560                 /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
 561                  * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
 562                  * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
 563                  * means that all configure IP access rules *will* take effect on processes, even though we never
 564                  * compile them for inner nodes. */
 565
 566                 r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any);
 567                 if (r < 0)
 568                         return log_unit_error_errno(u, r, "Preparation of eBPF allow maps failed: %m");
 569
 570                 r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any);
 571                 if (r < 0)
 572                         return log_unit_error_errno(u, r, "Preparation of eBPF deny maps failed: %m");
 573         }
 574
 575         r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
 576         if (r < 0)
 577                 return log_unit_error_errno(u, r, "Preparation of eBPF accounting maps failed: %m");
 578
 579         r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any);
 580         if (r < 0)
 581                 return log_unit_error_errno(u, r, "Compilation for ingress BPF program failed: %m");
 582
 583         r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any);
 584         if (r < 0)
 585                 return log_unit_error_errno(u, r, "Compilation for egress BPF program failed: %m");
 586
 587         return 0;
 588 }
 589
 590 DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(filter_prog_hash_ops, void, trivial_hash_func, trivial_compare_func, BPFProgram, bpf_program_unref);
 591
 592 static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) {
 593         char **bpf_fs_path;
 594
 595         set_clear(*set);
 596
 597         STRV_FOREACH(bpf_fs_path, filter_paths) {
 598                 _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
 599                 int r;
 600
 601                 r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &prog);
 602                 if (r < 0)
 603                         return log_unit_error_errno(u, r, "Can't allocate CGROUP SKB BPF program: %m");
 604
 605                 r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path);
 606                 if (r < 0)
 607                         return log_unit_error_errno(u, r, "Loading of ingress BPF program %s failed: %m", *bpf_fs_path);
 608
 609                 r = set_ensure_consume(set, &filter_prog_hash_ops, TAKE_PTR(prog));
 610                 if (r < 0)
 611                         return log_unit_error_errno(u, r, "Can't add program to BPF program set: %m");
 612         }
 613
 614         return 0;
 615 }
 616
 617 int bpf_firewall_load_custom(Unit *u) {
 618         CGroupContext *cc;
 619         int r, supported;
 620
 621         assert(u);
 622
 623         cc = unit_get_cgroup_context(u);
 624         if (!cc)
 625                 return 0;
 626
 627         if (!(cc->ip_filters_ingress || cc->ip_filters_egress))
 628                 return 0;
 629
 630         supported = bpf_firewall_supported();
 631         if (supported < 0)
 632                 return supported;
 633
 634         if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI)
 635                 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs.");
 636
 637         r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress);
 638         if (r < 0)
 639                 return r;
 640         r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress);
 641         if (r < 0)
 642                 return r;
 643
 644         return 0;
 645 }
 646
 647 static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) {
 648         BPFProgram *prog;
 649         int r;
 650
 651         assert(u);
 652
 653         set_clear(*set_installed);
 654
 655         SET_FOREACH(prog, *set) {
 656                 r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI);
 657                 if (r < 0)
 658                         return log_unit_error_errno(u, r, "Attaching custom egress BPF program to cgroup %s failed: %m", path);
 659
 660                 /* Remember that these BPF programs are installed now. */
 661                 r = set_ensure_put(set_installed, &filter_prog_hash_ops, prog);
 662                 if (r < 0)
 663                         return log_unit_error_errno(u, r, "Can't add program to BPF program set: %m");
 664                 bpf_program_ref(prog);
 665         }
 666
 667         return 0;
 668 }
 669
 670 int bpf_firewall_install(Unit *u) {
 671         _cleanup_free_ char *path = NULL;
 672         CGroupContext *cc;
 673         int r, supported;
 674         uint32_t flags;
 675
 676         assert(u);
 677
 678         cc = unit_get_cgroup_context(u);
 679         if (!cc)
 680                 return -EINVAL;
 681         if (!u->cgroup_path)
 682                 return -EINVAL;
 683         if (!u->cgroup_realized)
 684                 return -EINVAL;
 685
 686         supported = bpf_firewall_supported();
 687         if (supported < 0)
 688                 return supported;
 689         if (supported == BPF_FIREWALL_UNSUPPORTED) {
 690                 log_unit_debug(u, "BPF firewalling not supported on this manager, proceeding without.");
 691                 return -EOPNOTSUPP;
 692         }
 693         if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
 694                 log_unit_debug(u, "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
 695                 return -EOPNOTSUPP;
 696         }
 697         if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
 698             (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress)))
 699                 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs.");
 700
 701         r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
 702         if (r < 0)
 703                 return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m");
 704
 705         flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
 706                  (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;
 707
 708         /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
 709          * minimize the time window when we don't account for IP traffic. */
 710         u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
 711         u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);
 712
 713         if (u->ip_bpf_egress) {
 714                 r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path,
 715                                               flags | (set_isempty(u->ip_bpf_custom_egress) ? 0 : BPF_F_ALLOW_MULTI));
 716                 if (r < 0)
 717                         return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path);
 718
 719                 /* Remember that this BPF program is installed now. */
 720                 u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
 721         }
 722
 723         if (u->ip_bpf_ingress) {
 724                 r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path,
 725                                               flags | (set_isempty(u->ip_bpf_custom_ingress) ? 0 : BPF_F_ALLOW_MULTI));
 726                 if (r < 0)
 727                         return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path);
 728
 729                 u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
 730         }
 731
 732         r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed);
 733         if (r < 0)
 734                 return r;
 735
 736         r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed);
 737         if (r < 0)
 738                 return r;
 739
 740         return 0;
 741 }
 742
 743 int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
 744         uint64_t key, packets;
 745         int r;
 746
 747         if (map_fd < 0)
 748                 return -EBADF;
 749
 750         if (ret_packets) {
 751                 key = MAP_KEY_PACKETS;
 752                 r = bpf_map_lookup_element(map_fd, &key, &packets);
 753                 if (r < 0)
 754                         return r;
 755         }
 756
 757         if (ret_bytes) {
 758                 key = MAP_KEY_BYTES;
 759                 r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
 760                 if (r < 0)
 761                         return r;
 762         }
 763
 764         if (ret_packets)
 765                 *ret_packets = packets;
 766
 767         return 0;
 768 }
 769
 770 int bpf_firewall_reset_accounting(int map_fd) {
 771         uint64_t key, value = 0;
 772         int r;
 773
 774         if (map_fd < 0)
 775                 return -EBADF;
 776
 777         key = MAP_KEY_PACKETS;
 778         r = bpf_map_update_element(map_fd, &key, &value);
 779         if (r < 0)
 780                 return r;
 781
 782         key = MAP_KEY_BYTES;
 783         return bpf_map_update_element(map_fd, &key, &value);
 784 }
 785
 786 static int bpf_firewall_unsupported_reason = 0;
 787
 788 int bpf_firewall_supported(void) {
 789         const struct bpf_insn trivial[] = {
 790                 BPF_MOV64_IMM(BPF_REG_0, 1),
 791                 BPF_EXIT_INSN()
 792         };
 793
 794         _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
 795         static int supported = -1;
 796         union bpf_attr attr;
 797         int r;
 798
 799         /* Checks whether BPF firewalling is supported. For this, we check the following things:
 800          *
 801          * - whether the unified hierarchy is being used
 802          * - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
 803          * - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
 804          */
 805         if (supported >= 0)
 806                 return supported;
 807
 808         r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
 809         if (r < 0)
 810                 return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
 811         if (r == 0) {
 812                 bpf_firewall_unsupported_reason =
 813                         log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN),
 814                                         "Not running with unified cgroups, BPF firewalling is not supported.");
 815                 return supported = BPF_FIREWALL_UNSUPPORTED;
 816         }
 817
 818         r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
 819         if (r < 0) {
 820                 bpf_firewall_unsupported_reason =
 821                         log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
 822                 return supported = BPF_FIREWALL_UNSUPPORTED;
 823         }
 824
 825         r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
 826         if (r < 0) {
 827                 bpf_firewall_unsupported_reason =
 828                         log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
 829                 return supported = BPF_FIREWALL_UNSUPPORTED;
 830         }
 831
 832         r = bpf_program_load_kernel(program, NULL, 0);
 833         if (r < 0) {
 834                 bpf_firewall_unsupported_reason =
 835                         log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
 836                 return supported = BPF_FIREWALL_UNSUPPORTED;
 837         }
 838
 839         /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
 840          * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
 841          * program if we can't do a thing with it later?
 842          *
 843          * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
 844          * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
 845          * parameters are validated however, and that'll fail with EBADF then. */
 846
 847         attr = (union bpf_attr) {
 848                 .attach_type = BPF_CGROUP_INET_EGRESS,
 849                 .target_fd = -1,
 850                 .attach_bpf_fd = -1,
 851         };
 852
 853         if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
 854                 if (errno != EBADF) {
 855                         bpf_firewall_unsupported_reason =
 856                                 log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
 857                         return supported = BPF_FIREWALL_UNSUPPORTED;
 858                 }
 859
 860                 /* YAY! */
 861         } else {
 862                 log_debug("Wut? Kernel accepted our invalid BPF_PROG_DETACH call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
 863                 return supported = BPF_FIREWALL_UNSUPPORTED;
 864         }
 865
 866         /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
 867          * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH
 868          * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
 869          * get EINVAL if it's not supported, and EBADF as before if it is available. */
 870
 871         attr = (union bpf_attr) {
 872                 .attach_type = BPF_CGROUP_INET_EGRESS,
 873                 .target_fd = -1,
 874                 .attach_bpf_fd = -1,
 875                 .attach_flags = BPF_F_ALLOW_MULTI,
 876         };
 877
 878         if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
 879                 if (errno == EBADF) {
 880                         log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
 881                         return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
 882                 }
 883
 884                 if (errno == EINVAL)
 885                         log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
 886                 else
 887                         log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");
 888
 889                 return supported = BPF_FIREWALL_SUPPORTED;
 890         } else {
 891                 log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
 892                 return supported = BPF_FIREWALL_UNSUPPORTED;
 893         }
 894 }
 895
 896 void emit_bpf_firewall_warning(Unit *u) {
 897         static bool warned = false;
 898
 899         if (!warned) {
 900                 bool quiet = bpf_firewall_unsupported_reason == -EPERM && detect_container();
 901
 902                 log_unit_full(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason,
 903                               "unit configures an IP firewall, but %s.\n"
 904                               "(This warning is only shown for the first unit using IP firewalling.)",
 905                               getuid() != 0 ? "not running as root" :
 906                                               "the local system does not support BPF/cgroup firewalling");
 907                 warned = true;
 908         }
 909 }