/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2016 Daniel Mack

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <arpa/inet.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/libbpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "fd-util.h"
#include "ip-address-access.h"
#include "unit.h"

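/* Keys into the per-unit accounting maps: one slot counts packets, the other counts bytes. */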
enum {
        MAP_KEY_PACKETS,
        MAP_KEY_BYTES,
};

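/* Verdict bits that the generated lookup code ORs into R8 whenever an address matches an entry in the
 * respective allow or deny map. */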
enum {
        ACCESS_ALLOWED = 1,
        ACCESS_DENIED = 2,
};

/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }

        do {
                /* Load the source or destination address of the given protocol (one 32-bit word for IPv4,
                 * four words for IPv6) and look it up in the corresponding LPM trie map. */
                struct bpf_insn insn[] = {
                        /* If skb->protocol doesn't match the protocol we are looking for, skip this whole block.
                         * The jump offset will be set later. */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                        /*
                         * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                         *
                         * R1: Pointer to the skb
                         * R2: Data offset
                         * R3: Destination buffer on the stack (r10 - addr_size)
                         * R4: Number of bytes to read (addr_size)
                         */

                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                        /*
                         * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                         * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                         * has to be set to the maximum possible value.
                         *
                         * On success, the looked up value is stored in R0. For this application, the actual
                         * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
                         * matching value.
                         */

                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),
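                        /* The stack buffer at R2 now lays out a 'struct bpf_lpm_trie_key': the 32-bit prefixlen
                         * written above (set to the full address width in bits), immediately followed by the
                         * address that skb_load_bytes stored at r10 - addr_size. */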

                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;

        } while (false);

        return 0;
}

static int bpf_firewall_compile_bpf(
                Unit *u,
                bool is_ingress,
                BPFProgram **ret) {

        struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
                 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * The access checkers compiled for the configured allowance and denial lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied.
         *
         *   R0 = 1
         *   if (R8 == ACCESS_DENIED)
         *           R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;

        assert(u);
        assert(ret);

        accounting_map_fd = is_ingress ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        access_enabled =
                u->ipv4_allow_map_fd >= 0 ||
                u->ipv6_allow_map_fd >= 0 ||
                u->ipv4_deny_map_fd >= 0 ||
                u->ipv6_deny_map_fd >= 0;

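        /* Neither accounting nor any access lists are configured for this unit and direction, hence no
         * program needs to be generated at all. */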
        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in @list_allow
                 * - Otherwise, access will be denied when an address matches an entry in @list_deny
                 * - Otherwise, access will be granted
                 */

                if (u->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }

        do {
                /*
                 * Exit from the eBPF program, R0 contains the verdict.
                 * 0 means the packet is denied, 1 means the packet may pass.
                 */
                struct bpf_insn insn[] = {
                        BPF_EXIT_INSN()
                };

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        } while (false);

        *ret = TAKE_PTR(p);

        return 0;
}

static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
        IPAddressAccessItem *a;

        assert(n_ipv4);
        assert(n_ipv6);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        (*n_ipv4)++;
                        break;

                case AF_INET6:
                        (*n_ipv6)++;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_add_access_items(
                IPAddressAccessItem *list,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        uint64_t value = verdict;
        IPAddressAccessItem *a;
        int r;

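        /* Scratch keys, allocated once on the stack (zero-initialized) and reused for every entry in the list. */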
        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;

                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;

                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd) {

        _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        Unit *p;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                bpf_firewall_count_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, &n_ipv4, &n_ipv6);
        }

        if (n_ipv4 > 0) {
                ipv4_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                ipv6_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

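        /* Second pass over the slice hierarchy: fill the freshly created maps with the entries counted above. */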
        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }

        *ret_ipv4_map_fd = ipv4_map_fd;
        *ret_ipv6_map_fd = ipv6_map_fd;

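        /* The fds are owned by the caller now, hence reset the locals so the cleanup handlers don't close them. */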
        ipv4_map_fd = ipv6_map_fd = -1;
        return 0;
}

static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
        int r;

        assert(u);
        assert(fd_ingress);
        assert(fd_egress);

        if (enabled) {
                if (*fd_ingress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_ingress = r;
                }

                if (*fd_egress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_egress = r;
                }

        } else {
                *fd_ingress = safe_close(*fd_ingress);
                *fd_egress = safe_close(*fd_egress);

                zero(u->ip_accounting_extra);
        }

        return 0;
}

int bpf_firewall_compile(Unit *u) {
        CGroupContext *cc;
        int r, supported;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_debug("BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
                 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
                 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
                 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
                 * all, either. */
                log_debug("BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily. */

        u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);

        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
                 * means that all configured IP access rules *will* take effect on processes, even though we never
                 * compile them for inner nodes. */

                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd);
                if (r < 0)
                        return log_error_errno(r, "Preparation of eBPF allow maps failed: %m");

                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd);
                if (r < 0)
                        return log_error_errno(r, "Preparation of eBPF deny maps failed: %m");
        }

        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_error_errno(r, "Preparation of eBPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress);
        if (r < 0)
                return log_error_errno(r, "Compilation for ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress);
        if (r < 0)
                return log_error_errno(r, "Compilation for egress BPF program failed: %m");

        return 0;
}

int bpf_firewall_install(Unit *u) {
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        int r, supported;
        uint32_t flags;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;
        if (!u->cgroup_path)
                return -EINVAL;
        if (!u->cgroup_realized)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_debug("BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                log_debug("BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Failed to determine cgroup path: %m");

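        /* Use BPF_F_ALLOW_MULTI (if supported) on slice units and delegated cgroups, where our program has to
         * coexist with BPF programs attached elsewhere in the cgroup tree. */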
        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;

        /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
         * minimize the time window when we don't account for IP traffic. */
        u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
        u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);

        if (u->ip_bpf_egress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
                if (r < 0)
                        return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path);

                /* Remember that this BPF program is installed now. */
                u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
        }

        if (u->ip_bpf_ingress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
                if (r < 0)
                        return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path);

                u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
        }

        return 0;
}

int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

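        /* Copy the packet counter out only once both lookups have succeeded, so that *ret_packets stays
         * untouched on failure. */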
        if (ret_packets)
                *ret_packets = packets;

        return 0;
}

int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}

int bpf_firewall_supported(void) {
        struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
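        /* The detection result is cached in this static, hence the probing below runs at most once per process. */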
        static int supported = -1;
        union bpf_attr attr;
        int fd, r;

        /* Checks whether BPF firewalling is supported. For this, we check five things:
         *
         * a) whether we are privileged
         * b) whether the unified hierarchy is being used
         * c) the BPF implementation in the kernel supports BPF LPM TRIE maps, which we require
         * d) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
         * e) the BPF implementation in the kernel supports the BPF_PROG_ATTACH call, which we require
         *
         */

        if (supported >= 0)
                return supported;

        if (geteuid() != 0) {
                log_debug("Not enough privileges, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
        if (r == 0) {
                log_debug("Not running with unified cgroups, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        fd = bpf_map_new(BPF_MAP_TYPE_LPM_TRIE,
                         offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint64_t),
                         sizeof(uint64_t),
                         1,
                         BPF_F_NO_PREALLOC);
        if (fd < 0) {
                log_debug_errno(fd, "Can't allocate BPF LPM TRIE map, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        safe_close(fd);

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
        if (r < 0) {
                log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
         * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
         * program if we can't do a thing with it later?
         *
         * We detect this case by issuing the BPF_PROG_ATTACH bpf() call with invalid file descriptors: if
         * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
         * parameters are validated however, and that'll fail with EBADF then. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
        };

        r = bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
        if (r < 0) {
                if (errno != EBADF) {
                        log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_ATTACH, BPF firewalling is not supported: %m");
                        return supported = BPF_FIREWALL_UNSUPPORTED;
                }

                /* YAY! */
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
         * (which was added in kernel 4.15). We use a similar logic as before, but this time we use
         * BPF_F_ALLOW_MULTI. Since the flags are checked early in the system call we'll get EINVAL if it's not
         * supported, and EBADF as before if it is available. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
                .attach_flags = BPF_F_ALLOW_MULTI,
        };

        r = bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
        if (r < 0) {
                if (errno == EBADF) {
                        log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
                        return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
                }

                if (errno == EINVAL)
                        log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
                else
                        log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");

                return supported = BPF_FIREWALL_SUPPORTED;
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }
}