/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <linux/bpf_insn.h>
#include <linux/if_ether.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "errno-util.h"
#include "in-addr-prefix-util.h"
#include "memory-util.h"
#include "string-util.h"
/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);
        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }
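
        /* Note on direction: for ingress traffic we match on the source address, for egress traffic on
         * the destination address, i.e. in both cases on the remote peer's address. */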
        /* Compare IPv4 with one word instruction (32-bit) */
        struct bpf_insn insn[] = {
                /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */
                BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                /*
                 * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                 *
                 * R1: Pointer to the skb
                 * R2: Offset to read from
                 * R3: Destination buffer on the stack (r10 - 4)
                 * R4: Number of bytes to read (4)
                 */

                BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                BPF_MOV32_IMM(BPF_REG_4, addr_size),
                BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                /*
                 * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                 * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                 * has to be set to the maximum possible value.
                 *
                 * On success, the looked up value is stored in R0. For this application, the actual
                 * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
                 * matching value.
                 */

                BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),

                BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
        };
        /* Jump label fixup */
        insn[0].off = ELEMENTSOF(insn) - 1;

        r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
        if (r < 0)
                return r;

        return 0;
}
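
/* Illustrative sketch of the LPM lookup key that add_lookup_instructions() builds on the BPF stack
 * (assuming the usual 'struct bpf_lpm_trie_key' layout of a u32 prefixlen followed by raw address
 * bytes):
 *
 *         fp - addr_size - 4 : u32 prefixlen = addr_size * 8   (full-length, i.e. exact match)
 *         fp - addr_size     : u8  data[addr_size]             (address copied out of the packet)
 *
 * R2 is pointed at fp - addr_size - 4 before the map_lookup_elem call, so the kernel sees the
 * prefixlen and the address as one contiguous key. */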
static int add_instructions_for_ip_any(
                BPFProgram *p,
                int verdict) {

        int r;

        assert(p);

        const struct bpf_insn insn[] = {
                BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
        };

        r = bpf_program_add_instructions(p, insn, 1);
        if (r < 0)
                return r;

        return 0;
}
static int bpf_firewall_compile_bpf(
                Unit *u,
                const char *prog_name,
                bool is_ingress,
                BPFProgram **ret,
                bool ip_allow_any,
                bool ip_deny_any) {

        const struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
                 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };
        /*
         * The access checkers compiled for the configured allowance and denial lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied.
         *
         * R0 = 1
         * if (R8 == ACCESS_DENIED)
         *         R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        const struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };
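
        /* Worked example (assuming ACCESS_ALLOWED and ACCESS_DENIED are distinct single-bit values, as
         * the OR instructions above suggest): a packet that matched both an allow and a deny list leaves
         * R8 == ACCESS_ALLOWED|ACCESS_DENIED, which compares unequal to ACCESS_DENIED alone, so the jump
         * is taken and R0 stays 1 (allow). Only a packet that matched deny lists exclusively leaves
         * R8 == ACCESS_DENIED and gets R0 = 0. */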
        _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;
        CGroupRuntime *crt;

        assert(u);
        assert(ret);

        crt = unit_get_cgroup_runtime(u);
        if (!crt) {
                *ret = NULL;
                return 0;
        }

        accounting_map_fd = is_ingress ?
                crt->ip_accounting_ingress_map_fd :
                crt->ip_accounting_egress_map_fd;

        access_enabled =
                crt->ipv4_allow_map_fd >= 0 ||
                crt->ipv6_allow_map_fd >= 0 ||
                crt->ipv4_deny_map_fd >= 0 ||
                crt->ipv6_deny_map_fd >= 0 ||
                ip_allow_any ||
                ip_deny_any;

        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }
        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, prog_name, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;
        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in @list_allow
                 * - Otherwise, access will be denied when an address matches an entry in @list_deny
                 * - Otherwise, access will be granted
                 */
                if (crt->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, crt->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (crt->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, crt->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (crt->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, crt->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (crt->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, crt->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }
                if (ip_allow_any) {
                        r = add_instructions_for_ip_any(p, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (ip_deny_any) {
                        r = add_instructions_for_ip_any(p, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }
        }
        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;
        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }
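
        /* Roughly equivalent C for the accounting sequence above (a readability sketch only; 'acct'
         * stands for the array map behind accounting_map_fd):
         *
         *         if (r0 != 0) {                                  // not denied so far
         *                 uint64_t *v = map_lookup_elem(acct, MAP_KEY_PACKETS);
         *                 if (v) __sync_fetch_and_add(v, 1);
         *                 v = map_lookup_elem(acct, MAP_KEY_BYTES);
         *                 if (v) __sync_fetch_and_add(v, skb->len);
         *                 r0 = 1;                                 // allow
         *         }
         */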
        /*
         * Exit from the eBPF program, R0 contains the verdict.
         * 0 means the packet is denied, 1 means the packet may pass.
         */
        const struct bpf_insn insn[] = {
                BPF_EXIT_INSN(),
        };

        r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
        if (r < 0)
                return r;

        *ret = TAKE_PTR(p);
        return 0;
}
static int bpf_firewall_count_access_items(Set *prefixes, size_t *n_ipv4, size_t *n_ipv6) {
        struct in_addr_prefix *a;

        assert(n_ipv4);
        assert(n_ipv6);

        SET_FOREACH(a, prefixes)
                switch (a->family) {
                case AF_INET:
                        (*n_ipv4)++;
                        break;
                case AF_INET6:
                        (*n_ipv6)++;
                        break;
                default:
                        return -EAFNOSUPPORT;
                }

        return 0;
}
static int bpf_firewall_add_access_items(
                Set *prefixes,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        struct in_addr_prefix *a;
        uint64_t value = verdict;
        int r;

        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);

        SET_FOREACH(a, prefixes)
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;
                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }

        return 0;
}
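
/* Example of the key layout built above (assuming the usual 'struct bpf_lpm_trie_key' with a u32
 * prefixlen followed by the address bytes): the IPv4 prefix 192.168.0.0/16 becomes
 *
 *         key_ipv4 = { .prefixlen = 16, .data = { 0xc0, 0xa8, 0x00, 0x00 } }
 *
 * and the LPM trie then matches any address whose first 16 bits equal 0xc0a8. */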
static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd,
                bool *ret_has_any) {

        _cleanup_close_ int ipv4_map_fd = -EBADF, ipv6_map_fd = -EBADF;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);
        assert(ret_has_any);
        for (Unit *p = u; p; p = UNIT_GET_SLICE(p)) {
                CGroupContext *cc;
                Set *prefixes;
                bool *reduced;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                prefixes = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny;
                reduced = verdict == ACCESS_ALLOWED ? &cc->ip_address_allow_reduced : &cc->ip_address_deny_reduced;

                if (!*reduced) {
                        r = in_addr_prefixes_reduce(prefixes);
                        if (r < 0)
                                return r;

                        *reduced = true;
                }

                bpf_firewall_count_access_items(prefixes, &n_ipv4, &n_ipv6);

                /* Skip making the LPM trie map in cases where we are using "any" in order to hack around
                 * needing CAP_SYS_ADMIN for allocating an LPM trie map. */
                if (in_addr_prefixes_is_any(prefixes)) {
                        *ret_has_any = true;
                        return 0;
                }
        }
        if (n_ipv4 > 0) {
                const char *name = strjoina("4_", u->id);
                ipv4_map_fd = bpf_map_new(
                                name,
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                const char *name = strjoina("6_", u->id);
                ipv6_map_fd = bpf_map_new(
                                name,
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }
        for (Unit *p = u; p; p = UNIT_GET_SLICE(p)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }
        *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
        *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
        *ret_has_any = false;
        return 0;
}
static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, CGroupRuntime *crt) {
        int r;

        assert(u);
        assert(crt);

        if (enabled) {
                if (crt->ip_accounting_ingress_map_fd < 0) {
                        const char *name = strjoina("I_", u->id);
                        r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        crt->ip_accounting_ingress_map_fd = r;
                }

                if (crt->ip_accounting_egress_map_fd < 0) {
                        const char *name = strjoina("E_", u->id);
                        r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        crt->ip_accounting_egress_map_fd = r;
                }
        } else {
                crt->ip_accounting_ingress_map_fd = safe_close(crt->ip_accounting_ingress_map_fd);
                crt->ip_accounting_egress_map_fd = safe_close(crt->ip_accounting_egress_map_fd);

                zero(crt->ip_accounting_extra);
        }

        return 0;
}
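
/* The accounting maps created above are plain arrays with two uint64_t slots, one per counter
 * (MAP_KEY_PACKETS and MAP_KEY_BYTES are assumed here to be defined as 0 and 1 in the corresponding
 * header). Conceptually:
 *
 *         uint64_t counters[2];   // counters[MAP_KEY_PACKETS], counters[MAP_KEY_BYTES]
 *
 * The eBPF programs bump these via atomic adds; userspace reads them back with
 * bpf_firewall_read_accounting() below. */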
int bpf_firewall_compile(Unit *u) {
        const char *ingress_name = NULL, *egress_name = NULL;
        bool ip_allow_any = false, ip_deny_any = false;
        CGroupContext *cc;
        CGroupRuntime *crt;
        int r;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        crt = unit_setup_cgroup_runtime(u);
        if (!crt)
                return -ENOMEM;

        if (bpf_program_supported() <= 0)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "bpf-firewall: BPF firewalling not supported, proceeding without.");
        ingress_name = "sd_fw_ingress";
        egress_name = "sd_fw_egress";

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily. */
        crt->ip_bpf_ingress = bpf_program_free(crt->ip_bpf_ingress);
        crt->ip_bpf_egress = bpf_program_free(crt->ip_bpf_egress);

        crt->ipv4_allow_map_fd = safe_close(crt->ipv4_allow_map_fd);
        crt->ipv4_deny_map_fd = safe_close(crt->ipv4_deny_map_fd);
        crt->ipv6_allow_map_fd = safe_close(crt->ipv6_allow_map_fd);
        crt->ipv6_deny_map_fd = safe_close(crt->ipv6_deny_map_fd);
        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
                 * means that all configured IP access rules *will* take effect on processes, even though we never
                 * compile them for inner nodes. */
                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &crt->ipv4_allow_map_fd, &crt->ipv6_allow_map_fd, &ip_allow_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF allow maps failed: %m");

                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &crt->ipv4_deny_map_fd, &crt->ipv6_deny_map_fd, &ip_deny_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF deny maps failed: %m");
        }
        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, crt);
        if (r < 0)
                return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, ingress_name, true, &crt->ip_bpf_ingress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "bpf-firewall: Compilation of ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, egress_name, false, &crt->ip_bpf_egress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "bpf-firewall: Compilation of egress BPF program failed: %m");

        return 0;
}
static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) {
        int r;

        STRV_FOREACH(bpf_fs_path, filter_paths) {
                _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;

                r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &prog);
                if (r < 0)
                        return log_unit_error_errno(u, r, "bpf-firewall: Allocation of SKB BPF program failed: %m");

                r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "bpf-firewall: Loading of custom BPF program %s failed: %m", *bpf_fs_path);

                r = set_ensure_consume(set, &bpf_program_hash_ops, TAKE_PTR(prog));
                if (r < 0)
                        return r;
        }

        return 0;
}
int bpf_firewall_load_custom(Unit *u) {
        CGroupContext *cc;
        CGroupRuntime *crt;
        int r;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;
        crt = unit_get_cgroup_runtime(u);
        if (!crt)
                return 0;

        if (!(cc->ip_filters_ingress || cc->ip_filters_egress))
                return 0;

        if (bpf_program_supported() <= 0)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "bpf-firewall: BPF firewalling not supported, cannot attach custom BPF programs.");

        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &crt->ip_bpf_custom_ingress);
        if (r < 0)
                return r;

        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &crt->ip_bpf_custom_egress);
        if (r < 0)
                return r;

        return 0;
}
static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) {
        BPFProgram *prog;
        int r;

        assert(u);

        set_clear(*set_installed);
        r = set_ensure_allocated(set_installed, &bpf_program_hash_ops);
        if (r < 0)
                return log_oom();

        /* Move the programs from *set to *set_installed as they get attached. */
        SET_FOREACH_MOVE(prog, *set_installed, *set) {
                r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI);
                if (r < 0)
                        return log_unit_error_errno(u, r, "bpf-firewall: Attaching custom BPF program to cgroup %s failed: %m", path);
        }

        return 0;
}
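
/* BPF_F_ALLOW_MULTI is what lets the generated firewall program and any number of custom programs
 * coexist on the same cgroup: the kernel runs all attached programs and the packet passes only if
 * none of them returns a deny verdict. */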
int bpf_firewall_install(Unit *u) {
        _cleanup_(bpf_program_freep) BPFProgram *ip_bpf_ingress_uninstall = NULL, *ip_bpf_egress_uninstall = NULL;
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        CGroupRuntime *crt;
        int r;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;
        crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return -EINVAL;

        if (bpf_program_supported() <= 0)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "bpf-firewall: BPF firewalling not supported, proceeding without.");

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &path);
        if (r < 0)
                return log_unit_error_errno(u, r, "bpf-firewall: Failed to determine cgroup path: %m");

        /* Let's clear the fields, but destroy the programs only after attaching the new programs, so that
         * there's no time window where neither program is attached. (There will be a time window where both
         * are attached, but that's OK, since this is a security feature where we rather want to lock down
         * too much than too little.) */
        ip_bpf_egress_uninstall = TAKE_PTR(crt->ip_bpf_egress_installed);
        ip_bpf_ingress_uninstall = TAKE_PTR(crt->ip_bpf_ingress_installed);

        if (crt->ip_bpf_egress) {
                r = bpf_program_cgroup_attach(crt->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, BPF_F_ALLOW_MULTI);
                if (r < 0)
                        return log_unit_error_errno(u, r,
                                                    "bpf-firewall: Attaching egress BPF program to cgroup %s failed: %m", path);

                /* Remember that this BPF program is installed now. */
                crt->ip_bpf_egress_installed = TAKE_PTR(crt->ip_bpf_egress);
        }

        if (crt->ip_bpf_ingress) {
                r = bpf_program_cgroup_attach(crt->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, BPF_F_ALLOW_MULTI);
                if (r < 0)
                        return log_unit_error_errno(u, r,
                                                    "bpf-firewall: Attaching ingress BPF program to cgroup %s failed: %m", path);

                crt->ip_bpf_ingress_installed = TAKE_PTR(crt->ip_bpf_ingress);
        }

        /* And now, definitely get rid of the old programs, and detach them */
        ip_bpf_egress_uninstall = bpf_program_free(ip_bpf_egress_uninstall);
        ip_bpf_ingress_uninstall = bpf_program_free(ip_bpf_ingress_uninstall);
        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &crt->ip_bpf_custom_egress, &crt->ip_bpf_custom_egress_installed);
        if (r < 0)
                return r;

        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &crt->ip_bpf_custom_ingress, &crt->ip_bpf_custom_ingress_installed);
        if (r < 0)
                return r;

        return 0;
}
int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

        if (ret_packets)
                *ret_packets = packets;

        return 0;
}
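
/* Usage sketch (the runtime fields are the ones set up by bpf_firewall_prepare_accounting_maps()):
 *
 *         uint64_t bytes, packets;
 *         r = bpf_firewall_read_accounting(crt->ip_accounting_ingress_map_fd, &bytes, &packets);
 *         if (r >= 0)
 *                 log_debug("ingress: %" PRIu64 " bytes in %" PRIu64 " packets", bytes, packets);
 */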
int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}
void emit_bpf_firewall_warning(Unit *u) {
        static bool warned = false;
        int r;

        assert(u);
        assert(u->manager);

        if (warned || MANAGER_IS_TEST_RUN(u->manager))
                return;

        r = bpf_program_supported();

        bool quiet = ERRNO_IS_NEG_PRIVILEGE(r) && detect_container() > 0;

        log_unit_full_errno(u, quiet ? LOG_DEBUG : LOG_WARNING, r,
                            "unit configures an IP firewall, but %s.\n"
                            "(This warning is only shown for the first unit using IP firewalling.)",
                            getuid() != 0 ? "not running as root" :
                                            "the local system does not support BPF/cgroup firewalling");

        warned = true;
}
void bpf_firewall_close(CGroupRuntime *crt) {
        assert(crt);

        crt->ip_accounting_ingress_map_fd = safe_close(crt->ip_accounting_ingress_map_fd);
        crt->ip_accounting_egress_map_fd = safe_close(crt->ip_accounting_egress_map_fd);

        crt->ipv4_allow_map_fd = safe_close(crt->ipv4_allow_map_fd);
        crt->ipv6_allow_map_fd = safe_close(crt->ipv6_allow_map_fd);
        crt->ipv4_deny_map_fd = safe_close(crt->ipv4_deny_map_fd);
        crt->ipv6_deny_map_fd = safe_close(crt->ipv6_deny_map_fd);

        crt->ip_bpf_ingress = bpf_program_free(crt->ip_bpf_ingress);
        crt->ip_bpf_ingress_installed = bpf_program_free(crt->ip_bpf_ingress_installed);
        crt->ip_bpf_egress = bpf_program_free(crt->ip_bpf_egress);
        crt->ip_bpf_egress_installed = bpf_program_free(crt->ip_bpf_egress_installed);

        crt->ip_bpf_custom_ingress = set_free(crt->ip_bpf_custom_ingress);
        crt->ip_bpf_custom_egress = set_free(crt->ip_bpf_custom_egress);
        crt->ip_bpf_custom_ingress_installed = set_free(crt->ip_bpf_custom_ingress_installed);
        crt->ip_bpf_custom_egress_installed = set_free(crt->ip_bpf_custom_egress_installed);
}