src/core/bpf-firewall.c
/* SPDX-License-Identifier: LGPL-2.1+ */

#include <arpa/inet.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/libbpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "fd-util.h"
#include "ip-address-access.h"
#include "missing_syscall.h"
#include "unit.h"

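/* Keys of the per-direction accounting array maps; each map stores one uint64_t counter per key. */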
enum {
        MAP_KEY_PACKETS,
        MAP_KEY_BYTES,
};

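/* Verdict bits which the generated lookup code ORs into R8 whenever an address matches an allow or deny map. */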
enum {
        ACCESS_ALLOWED = 1,
        ACCESS_DENIED = 2,
};

/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }

        do {
                /* Compare IPv4 with one word instruction (32bit) */
                struct bpf_insn insn[] = {
                        /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                        /*
                         * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                         *
                         * R1: Pointer to the skb
                         * R2: Data offset
                         * R3: Destination buffer on the stack (r10 - 4)
                         * R4: Number of bytes to read (4)
                         */

                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                        /*
                         * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                         * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                         * has to be set to the maximum possible value.
                         *
                         * On success, the looked up value is stored in R0. For this application, the actual
                         * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
                         * matching value.
                         */

                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),

                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;

        } while (false);

        return 0;
}

static int bpf_firewall_compile_bpf(
                Unit *u,
                bool is_ingress,
                BPFProgram **ret) {

        struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
                 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * The access checkers compiled for the configured allowance and denial lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied.
         *
         * R0 = 1
         * if (R8 == ACCESS_DENIED)
         *     R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;

        assert(u);
        assert(ret);

        accounting_map_fd = is_ingress ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        access_enabled =
                u->ipv4_allow_map_fd >= 0 ||
                u->ipv6_allow_map_fd >= 0 ||
                u->ipv4_deny_map_fd >= 0 ||
                u->ipv6_deny_map_fd >= 0;

        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in @list_allow
                 * - Otherwise, access will be denied when an address matches an entry in @list_deny
                 * - Otherwise, access will be granted
                 */

                if (u->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }

        do {
                /*
                 * Exit from the eBPF program, R0 contains the verdict.
                 * 0 means the packet is denied, 1 means the packet may pass.
                 */
                struct bpf_insn insn[] = {
                        BPF_EXIT_INSN()
                };

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        } while (false);

        *ret = TAKE_PTR(p);

        return 0;
}

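/* Count how many IPv4 and IPv6 entries an access list contains, so that the LPM trie maps can be sized accordingly. */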
static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
        IPAddressAccessItem *a;

        assert(n_ipv4);
        assert(n_ipv6);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        (*n_ipv4)++;
                        break;

                case AF_INET6:
                        (*n_ipv6)++;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

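/* Add each entry of an access list to the IPv4 or IPv6 LPM trie map, storing the verdict as the looked-up value. */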
static int bpf_firewall_add_access_items(
                IPAddressAccessItem *list,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        uint64_t value = verdict;
        IPAddressAccessItem *a;
        int r;

        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;

                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;

                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

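/* Allocate the allow or deny LPM trie maps for a unit and fill them with the IP access lists configured on the unit and on all of its parent slices. */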
static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd) {

        _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        Unit *p;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                bpf_firewall_count_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, &n_ipv4, &n_ipv6);
        }

        if (n_ipv4 > 0) {
                ipv4_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                ipv6_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }

        *ret_ipv4_map_fd = ipv4_map_fd;
        *ret_ipv6_map_fd = ipv6_map_fd;

        ipv4_map_fd = ipv6_map_fd = -1;
        return 0;
}

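/* Allocate the ingress and egress accounting maps if accounting is enabled, or close them (and reset the extra counters) if it is not. */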
static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
        int r;

        assert(u);
        assert(fd_ingress);
        assert(fd_egress);

        if (enabled) {
                if (*fd_ingress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_ingress = r;
                }

                if (*fd_egress < 0) {

                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_egress = r;
                }

        } else {
                *fd_ingress = safe_close(*fd_ingress);
                *fd_egress = safe_close(*fd_egress);

                zero(u->ip_accounting_extra);
        }

        return 0;
}

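/* Rebuild the ingress and egress eBPF programs and the access maps for a unit from its current IP firewall configuration. */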
int bpf_firewall_compile(Unit *u) {
        CGroupContext *cc;
        int r, supported;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_unit_debug(u, "BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
                 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
                 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
                 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
                 * all, either. */
                log_unit_debug(u, "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily. */

        u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);

        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting; we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes, this
                 * means that all configured IP access rules *will* take effect on processes, even though we never
                 * compile them for inner nodes. */

                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF allow maps failed: %m");

                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF deny maps failed: %m");
        }

        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_unit_error_errno(u, r, "Preparation of eBPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation for ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation for egress BPF program failed: %m");

        return 0;
}

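/* Attach the compiled ingress and egress programs to the unit's cgroup, replacing any previously installed programs. */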
int bpf_firewall_install(Unit *u) {
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        int r, supported;
        uint32_t flags;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;
        if (!u->cgroup_path)
                return -EINVAL;
        if (!u->cgroup_realized)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_unit_debug(u, "BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                log_unit_debug(u, "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m");

        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;

        /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
         * minimize the time window when we don't account for IP traffic. */
        u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
        u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);

        if (u->ip_bpf_egress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path);

                /* Remember that this BPF program is installed now. */
                u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
        }

        if (u->ip_bpf_ingress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path);

                u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
        }

        return 0;
}

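/* Read the packet and/or byte counter from an accounting map. */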
int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

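        /* Stage the packet counter in a local variable, so that *ret_packets is left untouched if the byte counter lookup fails afterwards. */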
        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

        if (ret_packets)
                *ret_packets = packets;

        return 0;
}

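/* Reset both the packet and the byte counter of an accounting map to zero. */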
int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}

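/* Probe whether the running kernel and cgroup setup support the BPF firewall; the result is determined once and then cached. */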
int bpf_firewall_supported(void) {
        struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
        static int supported = -1;
        union bpf_attr attr;
        int fd, r;

        /* Checks whether BPF firewalling is supported. For this, we check five things:
         *
         * a) whether we are privileged
         * b) whether the unified hierarchy is being used
         * c) the BPF implementation in the kernel supports BPF LPM TRIE maps, which we require
         * d) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
         * e) the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
         */

        if (supported >= 0)
                return supported;

        if (geteuid() != 0) {
                log_debug("Not enough privileges, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
        if (r == 0) {
                log_debug("Not running with unified cgroups, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        fd = bpf_map_new(BPF_MAP_TYPE_LPM_TRIE,
                         offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint64_t),
                         sizeof(uint64_t),
                         1,
                         BPF_F_NO_PREALLOC);
        if (fd < 0) {
                log_debug_errno(fd, "Can't allocate BPF LPM TRIE map, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        safe_close(fd);

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
        if (r < 0) {
                log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
         * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
         * program if we can't do a thing with it later?
         *
         * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
         * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
         * parameters are validated however, and that'll fail with EBADF then. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
        };

        if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
                if (errno != EBADF) {
                        log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
                        return supported = BPF_FIREWALL_UNSUPPORTED;
                }

                /* YAY! */
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_DETACH call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
         * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH
         * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
         * get EINVAL if it's not supported, and EBADF as before if it is available. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
                .attach_flags = BPF_F_ALLOW_MULTI,
        };

        if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
                if (errno == EBADF) {
                        log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
                        return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
                }

                if (errno == EINVAL)
                        log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
                else
                        log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");

                return supported = BPF_FIREWALL_SUPPORTED;
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }
}
767 }