/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2016 Daniel Mack

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <arpa/inet.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/libbpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "fd-util.h"
#include "ip-address-access.h"
#include "unit.h"

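/* Keys into the per-unit accounting maps: one slot counts packets, the other counts bytes. */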
enum {
        MAP_KEY_PACKETS,
        MAP_KEY_BYTES,
};

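/* Verdict bits that the generated lookup code ORs into R8 whenever an address matches an entry in the
 * respective allow or deny map. */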
enum {
        ACCESS_ALLOWED = 1,
        ACCESS_DENIED = 2,
};

/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }

        do {
                /* Load the source or destination address of the given protocol (one 32-bit word for IPv4,
                 * four words for IPv6) and look it up in the corresponding LPM trie map. */
                struct bpf_insn insn[] = {
                        /* If skb->protocol doesn't match the protocol we are looking for, skip this whole block.
                         * The jump offset will be set later. */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                        /*
                         * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                         *
                         * R1: Pointer to the skb
                         * R2: Data offset
                         * R3: Destination buffer on the stack (r10 - addr_size)
                         * R4: Number of bytes to read (addr_size)
                         */

                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                        /*
                         * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                         * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                         * has to be set to the maximum possible value.
                         *
                         * On success, the looked up value is stored in R0. For this application, the actual
                         * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
                         * matching value.
                         */

                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),
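                        /* The stack buffer at R2 now lays out a 'struct bpf_lpm_trie_key': the 32-bit prefixlen
                         * written above (set to the full address width in bits), immediately followed by the
                         * address that skb_load_bytes stored at r10 - addr_size. */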

                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;

        } while (false);

        return 0;
}

static int bpf_firewall_compile_bpf(
                Unit *u,
                bool is_ingress,
                BPFProgram **ret) {

        struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
                 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * The access checkers compiled for the configured allowance and denial lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied.
         *
         *   R0 = 1
         *   if (R8 == ACCESS_DENIED)
         *           R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;

        assert(u);
        assert(ret);

        accounting_map_fd = is_ingress ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        access_enabled =
                u->ipv4_allow_map_fd >= 0 ||
                u->ipv6_allow_map_fd >= 0 ||
                u->ipv4_deny_map_fd >= 0 ||
                u->ipv6_deny_map_fd >= 0;

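        /* Neither accounting nor any access lists are configured for this unit and direction, hence no
         * program needs to be generated at all. */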
        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in @list_allow
                 * - Otherwise, access will be denied when an address matches an entry in @list_deny
                 * - Otherwise, access will be granted
                 */

                if (u->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }

        do {
                /*
                 * Exit from the eBPF program, R0 contains the verdict.
                 * 0 means the packet is denied, 1 means the packet may pass.
                 */
                struct bpf_insn insn[] = {
                        BPF_EXIT_INSN()
                };

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        } while (false);

        *ret = TAKE_PTR(p);

        return 0;
}

static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
        IPAddressAccessItem *a;

        assert(n_ipv4);
        assert(n_ipv6);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        (*n_ipv4)++;
                        break;

                case AF_INET6:
                        (*n_ipv6)++;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_add_access_items(
                IPAddressAccessItem *list,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        uint64_t value = verdict;
        IPAddressAccessItem *a;
        int r;

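        /* Scratch keys, allocated once on the stack (zero-initialized) and reused for every entry in the list. */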
        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;

                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;

                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd) {

        _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        Unit *p;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                bpf_firewall_count_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, &n_ipv4, &n_ipv6);
        }

        if (n_ipv4 > 0) {
                ipv4_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                ipv6_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

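        /* Second pass over the slice hierarchy: fill the freshly created maps with the entries counted above. */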
        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }

        *ret_ipv4_map_fd = ipv4_map_fd;
        *ret_ipv6_map_fd = ipv6_map_fd;

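        /* The fds are owned by the caller now, hence reset the locals so the cleanup handlers don't close them. */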
        ipv4_map_fd = ipv6_map_fd = -1;
        return 0;
}

static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
        int r;

        assert(u);
        assert(fd_ingress);
        assert(fd_egress);

        if (enabled) {
                if (*fd_ingress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_ingress = r;
                }

                if (*fd_egress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_egress = r;
                }

        } else {
                *fd_ingress = safe_close(*fd_ingress);
                *fd_egress = safe_close(*fd_egress);

                zero(u->ip_accounting_extra);
        }

        return 0;
}

int bpf_firewall_compile(Unit *u) {
        CGroupContext *cc;
        int r, supported;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_debug("BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
                 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
                 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
                 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
                 * all, either. */
                log_debug("BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily. */

        u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);

        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
                 * means that all configured IP access rules *will* take effect on processes, even though we never
                 * compile them for inner nodes. */

                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd);
                if (r < 0)
                        return log_error_errno(r, "Preparation of eBPF allow maps failed: %m");

                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd);
                if (r < 0)
                        return log_error_errno(r, "Preparation of eBPF deny maps failed: %m");
        }

        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_error_errno(r, "Preparation of eBPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress);
        if (r < 0)
                return log_error_errno(r, "Compilation for ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress);
        if (r < 0)
                return log_error_errno(r, "Compilation for egress BPF program failed: %m");

        return 0;
}

int bpf_firewall_install(Unit *u) {
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        int r, supported;
        uint32_t flags;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;
        if (!u->cgroup_path)
                return -EINVAL;
        if (!u->cgroup_realized)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_debug("BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                log_debug("BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Failed to determine cgroup path: %m");

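        /* Use BPF_F_ALLOW_MULTI (if supported) on slice units and delegated cgroups, where our program has to
         * coexist with BPF programs attached elsewhere in the cgroup tree. */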
        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;

        /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
         * minimize the time window when we don't account for IP traffic. */
        u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
        u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);

        if (u->ip_bpf_egress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
                if (r < 0)
                        return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path);

                /* Remember that this BPF program is installed now. */
                u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
        }

        if (u->ip_bpf_ingress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
                if (r < 0)
                        return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path);

                u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
        }

        return 0;
}

int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

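        /* Copy the packet counter out only once both lookups have succeeded, so that *ret_packets stays
         * untouched on failure. */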
        if (ret_packets)
                *ret_packets = packets;

        return 0;
}

int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}

int bpf_firewall_supported(void) {
        struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
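        /* The detection result is cached in this static, hence the probing below runs at most once per process. */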
        static int supported = -1;
        union bpf_attr attr;
        int fd, r;

        /* Checks whether BPF firewalling is supported. For this, we check five things:
         *
         * a) whether we are privileged
         * b) whether the unified hierarchy is being used
         * c) the BPF implementation in the kernel supports BPF LPM TRIE maps, which we require
         * d) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
         * e) the BPF implementation in the kernel supports the BPF_PROG_ATTACH call, which we require
         *
         */

        if (supported >= 0)
                return supported;

        if (geteuid() != 0) {
                log_debug("Not enough privileges, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
        if (r == 0) {
                log_debug("Not running with unified cgroups, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        fd = bpf_map_new(BPF_MAP_TYPE_LPM_TRIE,
                         offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint64_t),
                         sizeof(uint64_t),
                         1,
                         BPF_F_NO_PREALLOC);
        if (fd < 0) {
                log_debug_errno(fd, "Can't allocate BPF LPM TRIE map, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        safe_close(fd);

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
        if (r < 0) {
                log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
         * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
         * program if we can't do a thing with it later?
         *
         * We detect this case by issuing the BPF_PROG_ATTACH bpf() call with invalid file descriptors: if
         * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
         * parameters are validated however, and that'll fail with EBADF then. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
        };

        r = bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
        if (r < 0) {
                if (errno != EBADF) {
                        log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_ATTACH, BPF firewalling is not supported: %m");
                        return supported = BPF_FIREWALL_UNSUPPORTED;
                }

                /* YAY! */
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
         * (which was added in kernel 4.15). We use a similar logic as before, but this time we use
         * BPF_F_ALLOW_MULTI. Since the flags are checked early in the system call we'll get EINVAL if it's not
         * supported, and EBADF as before if it is available. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
                .attach_flags = BPF_F_ALLOW_MULTI,
        };

        r = bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
        if (r < 0) {
                if (errno == EBADF) {
                        log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
                        return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
                }

                if (errno == EINVAL)
                        log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
                else
                        log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");

                return supported = BPF_FIREWALL_SUPPORTED;
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }
}