/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <linux/bpf.h>
#include <linux/bpf_insn.h>
#include <linux/if_ether.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <stdio.h>
#include <unistd.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "errno-util.h"
#include "fd-util.h"
#include "in-addr-prefix-util.h"
#include "manager.h"
#include "memory-util.h"
#include "set.h"
#include "string-util.h"
#include "strv.h"
#include "unit.h"
#include "virt.h"

enum {
        MAP_KEY_PACKETS,
        MAP_KEY_BYTES,
};

enum {
        ACCESS_ALLOWED = 1,
        ACCESS_DENIED = 2,
};

/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }
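
        /* As a worked example (offsets per the standard netinet/UAPI headers included above): for
         * ETH_P_IP ingress this selects offsetof(struct iphdr, saddr) == 12 with a 4-byte address,
         * while for ETH_P_IPV6 egress it selects offsetof(struct ip6_hdr, ip6_dst.s6_addr) == 24
         * with a 16-byte address. */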

        do {
                /* Compare the cached protocol field with one 32-bit instruction */
                struct bpf_insn insn[] = {
                        /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                        /*
                         * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                         *
                         * R1: Pointer to the skb
                         * R2: Data offset
                         * R3: Destination buffer on the stack (r10 - addr_size)
                         * R4: Number of bytes to read (addr_size)
                         */

                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                        /*
                         * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                         * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                         * has to be set to the maximum possible value.
                         *
                         * On success, the looked up value is stored in R0. For this application, the actual
                         * value doesn't matter; we just set the @verdict bit in R8 if any matching entry
                         * was found.
                         */

                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),

                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;

        } while (false);

        return 0;
}
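
/* A sketch of the stack layout the instructions above produce, matching 'struct bpf_lpm_trie_key'
 * (prefixlen is expressed in bits, hence addr_size * 8):
 *
 *     fp - addr_size - 4:  u32 prefixlen
 *     fp - addr_size:      address bytes, as filled in by skb_load_bytes()
 *
 * R2 then points at fp - addr_size - 4, i.e. at a complete lookup key. */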

static int add_instructions_for_ip_any(
                BPFProgram *p,
                int verdict) {
        int r;

        assert(p);

        const struct bpf_insn insn[] = {
                BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
        };

        r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
        if (r < 0)
                return r;

        return 0;
}

static int bpf_firewall_compile_bpf(
                Unit *u,
                const char *prog_name,
                bool is_ingress,
                BPFProgram **ret,
                bool ip_allow_any,
                bool ip_deny_any) {

        const struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
                 * through the ACCESS_DENIED or ACCESS_ALLOWED bits. Reset both to 0 in the beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * The access checkers compiled for the configured allow and deny lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied.
         *
         *   R0 = 1
         *   if (R8 == ACCESS_DENIED)
         *           R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        const struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };
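
        /* Spelled out as a table (R0 == 1 lets the packet pass); the comparison above only fires
         * when R8 is exactly ACCESS_DENIED:
         *
         *     R8 bits set                       verdict
         *     (none)                            pass
         *     ACCESS_ALLOWED                    pass
         *     ACCESS_DENIED                     drop
         *     ACCESS_ALLOWED | ACCESS_DENIED    pass
         */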

        _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;
        CGroupRuntime *crt;

        assert(u);
        assert(ret);

        crt = unit_get_cgroup_runtime(u);
        if (!crt) {
                *ret = NULL;
                return 0;
        }

        accounting_map_fd = is_ingress ?
                crt->ip_accounting_ingress_map_fd :
                crt->ip_accounting_egress_map_fd;

        access_enabled =
                crt->ipv4_allow_map_fd >= 0 ||
                crt->ipv6_allow_map_fd >= 0 ||
                crt->ipv4_deny_map_fd >= 0 ||
                crt->ipv6_deny_map_fd >= 0 ||
                ip_allow_any ||
                ip_deny_any;

        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, prog_name, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in the allow list
                 * - Otherwise, access will be denied when an address matches an entry in the deny list
                 * - Otherwise, access will be granted
                 */

                if (crt->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, crt->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (crt->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, crt->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (crt->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, crt->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (crt->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, crt->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (ip_allow_any) {
                        r = add_instructions_for_ip_any(p, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (ip_deny_any) {
                        r = add_instructions_for_ip_any(p, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* atomic: *(u64 *)(r0 + 0) += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* atomic: *(u64 *)(r0 + 0) += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }

        do {
                /*
                 * Exit from the eBPF program, R0 contains the verdict.
                 * 0 means the packet is denied, 1 means the packet may pass.
                 */
                const struct bpf_insn insn[] = {
                        BPF_EXIT_INSN()
                };

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        } while (false);

        *ret = TAKE_PTR(p);

        return 0;
}
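
/* A sketch of the overall instruction stream this function emits (each section appears only when
 * the corresponding map fd or "any" flag is set):
 *
 *     pre_insn:    save skb to R6, cache protocol in R7, zero R8
 *     lookups:     IPv4 deny, IPv6 deny, IPv4 allow, IPv6 allow, then the "any" verdicts
 *     post_insn:   R0 = (R8 == ACCESS_DENIED) ? 0 : 1
 *     accounting:  if R0 != 0, bump the packet and byte counters
 *     exit:        return R0
 */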

static int bpf_firewall_count_access_items(Set *prefixes, size_t *n_ipv4, size_t *n_ipv6) {
        struct in_addr_prefix *a;

        assert(n_ipv4);
        assert(n_ipv6);

        SET_FOREACH(a, prefixes)
                switch (a->family) {

                case AF_INET:
                        (*n_ipv4)++;
                        break;

                case AF_INET6:
                        (*n_ipv6)++;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }

        return 0;
}

static int bpf_firewall_add_access_items(
                Set *prefixes,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        struct in_addr_prefix *a;
        uint64_t value = verdict;
        int r;

        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + 4 * sizeof(uint32_t));

        SET_FOREACH(a, prefixes)
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;

                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;

                        break;

                default:
                        return -EAFNOSUPPORT;
                }

        return 0;
}
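
/* As a worked example: an IPAddressDeny=192.168.1.0/24 entry ends up as an LPM trie key with
 * key->prefixlen = 24 and key->data[0..3] = { 192, 168, 1, 0 }, mapped to the verdict value.
 * At runtime a lookup for, say, 192.168.1.42 walks the trie and matches this /24 entry. */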

static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd,
                bool *ret_has_any) {

        _cleanup_close_ int ipv4_map_fd = -EBADF, ipv6_map_fd = -EBADF;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);
        assert(ret_has_any);

        for (Unit *p = u; p; p = UNIT_GET_SLICE(p)) {
                CGroupContext *cc;
                Set *prefixes;
                bool *reduced;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                prefixes = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny;
                reduced = verdict == ACCESS_ALLOWED ? &cc->ip_address_allow_reduced : &cc->ip_address_deny_reduced;

                if (!*reduced) {
                        r = in_addr_prefixes_reduce(prefixes);
                        if (r < 0)
                                return r;

                        *reduced = true;
                }

                r = bpf_firewall_count_access_items(prefixes, &n_ipv4, &n_ipv6);
                if (r < 0)
                        return r;

                /* Skip creating the LPM trie map when an "any" entry is present, to work around
                 * needing CAP_SYS_ADMIN for allocating LPM trie maps. */
                if (in_addr_prefixes_is_any(prefixes)) {
                        *ret_has_any = true;
                        return 0;
                }
        }

        if (n_ipv4 > 0) {
                const char *name = strjoina("4_", u->id);
                ipv4_map_fd = bpf_map_new(
                                name,
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                const char *name = strjoina("6_", u->id);
                ipv6_map_fd = bpf_map_new(
                                name,
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + 4 * sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

        for (Unit *p = u; p; p = UNIT_GET_SLICE(p)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }

        *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
        *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
        *ret_has_any = false;
        return 0;
}

static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, CGroupRuntime *crt) {
        int r;

        assert(u);
        assert(crt);

        if (enabled) {
                if (crt->ip_accounting_ingress_map_fd < 0) {
                        const char *name = strjoina("I_", u->id);
                        r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        crt->ip_accounting_ingress_map_fd = r;
                }

                if (crt->ip_accounting_egress_map_fd < 0) {
                        const char *name = strjoina("E_", u->id);
                        r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        crt->ip_accounting_egress_map_fd = r;
                }

        } else {
                crt->ip_accounting_ingress_map_fd = safe_close(crt->ip_accounting_ingress_map_fd);
                crt->ip_accounting_egress_map_fd = safe_close(crt->ip_accounting_egress_map_fd);

                zero(crt->ip_accounting_extra);
        }

        return 0;
}
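
/* The accounting maps are plain two-element arrays, i.e. conceptually:
 *
 *     uint64_t counters[2];  // counters[MAP_KEY_PACKETS] = packets, counters[MAP_KEY_BYTES] = bytes
 *
 * keyed by a 32-bit index, which is why sizeof(int), sizeof(uint64_t) and 2 are passed to
 * bpf_map_new() above. */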

int bpf_firewall_compile(Unit *u) {
        const char *ingress_name = NULL, *egress_name = NULL;
        bool ip_allow_any = false, ip_deny_any = false;
        CGroupContext *cc;
        CGroupRuntime *crt;
        int r;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        crt = unit_setup_cgroup_runtime(u);
        if (!crt)
                return -ENOMEM;

        if (bpf_program_supported() <= 0)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "bpf-firewall: BPF firewalling not supported, proceeding without.");

        ingress_name = "sd_fw_ingress";
        egress_name = "sd_fw_egress";

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily. */

        crt->ip_bpf_ingress = bpf_program_free(crt->ip_bpf_ingress);
        crt->ip_bpf_egress = bpf_program_free(crt->ip_bpf_egress);

        crt->ipv4_allow_map_fd = safe_close(crt->ipv4_allow_map_fd);
        crt->ipv4_deny_map_fd = safe_close(crt->ipv4_deny_map_fd);

        crt->ipv6_allow_map_fd = safe_close(crt->ipv6_allow_map_fd);
        crt->ipv6_deny_map_fd = safe_close(crt->ipv6_deny_map_fd);

        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes, this
                 * means that all configured IP access rules *will* take effect on processes, even though we never
                 * compile them for inner nodes. */

                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &crt->ipv4_allow_map_fd, &crt->ipv6_allow_map_fd, &ip_allow_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF allow maps failed: %m");

                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &crt->ipv4_deny_map_fd, &crt->ipv6_deny_map_fd, &ip_deny_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF deny maps failed: %m");
        }

        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, crt);
        if (r < 0)
                return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, ingress_name, /* is_ingress = */ true, &crt->ip_bpf_ingress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "bpf-firewall: Compilation of ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, egress_name, /* is_ingress = */ false, &crt->ip_bpf_egress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "bpf-firewall: Compilation of egress BPF program failed: %m");

        return 0;
}
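
/* A minimal sketch of the expected call sequence, as assumed from this file's API (the actual call
 * sites live elsewhere in the core, during cgroup realization):
 *
 *     r = bpf_firewall_compile(u);          // build programs + maps from the unit's CGroupContext
 *     if (r >= 0)
 *             r = bpf_firewall_install(u);  // attach them to the unit's cgroup
 */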

static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) {
        set_clear(*set);

        STRV_FOREACH(bpf_fs_path, filter_paths) {
                _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
                int r;

                r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &prog);
                if (r < 0)
                        return log_unit_error_errno(u, r, "bpf-firewall: Allocation of SKB BPF program failed: %m");

                r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "bpf-firewall: Loading of BPF program %s failed: %m", *bpf_fs_path);

                r = set_ensure_consume(set, &bpf_program_hash_ops, TAKE_PTR(prog));
                if (r < 0)
                        return log_oom();
        }

        return 0;
}

int bpf_firewall_load_custom(Unit *u) {
        CGroupContext *cc;
        CGroupRuntime *crt;
        int r;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;
        crt = unit_get_cgroup_runtime(u);
        if (!crt)
                return 0;

        if (!(cc->ip_filters_ingress || cc->ip_filters_egress))
                return 0;

        if (bpf_program_supported() <= 0)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "bpf-firewall: BPF firewalling not supported, cannot attach custom BPF programs.");

        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &crt->ip_bpf_custom_ingress);
        if (r < 0)
                return r;
        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &crt->ip_bpf_custom_egress);
        if (r < 0)
                return r;

        return 0;
}

static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) {
        BPFProgram *prog;
        int r;

        assert(u);

        set_clear(*set_installed);
        r = set_ensure_allocated(set_installed, &bpf_program_hash_ops);
        if (r < 0)
                return log_oom();

        SET_FOREACH_MOVE(prog, *set_installed, *set) {
                r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI);
                if (r < 0)
                        return log_unit_error_errno(u, r, "bpf-firewall: Attaching custom BPF program to cgroup %s failed: %m", path);
        }
        return 0;
}

int bpf_firewall_install(Unit *u) {
        _cleanup_(bpf_program_freep) BPFProgram *ip_bpf_ingress_uninstall = NULL, *ip_bpf_egress_uninstall = NULL;
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        CGroupRuntime *crt;
        int r;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        crt = unit_get_cgroup_runtime(u);
        if (!crt || !crt->cgroup_path)
                return -EOWNERDEAD;

        if (bpf_program_supported() <= 0)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "bpf-firewall: BPF firewalling not supported, proceeding without.");

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, crt->cgroup_path, NULL, &path);
        if (r < 0)
                return log_unit_error_errno(u, r, "bpf-firewall: Failed to determine cgroup path: %m");

        /* Let's clear the fields, but destroy the programs only after attaching the new programs, so that
         * there's no time window where neither program is attached. (There will be a time window where both
         * are attached, but that's OK, since this is a security feature where we'd rather lock down too much
         * than too little.) */
        ip_bpf_egress_uninstall = TAKE_PTR(crt->ip_bpf_egress_installed);
        ip_bpf_ingress_uninstall = TAKE_PTR(crt->ip_bpf_ingress_installed);

        if (crt->ip_bpf_egress) {
                r = bpf_program_cgroup_attach(crt->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, BPF_F_ALLOW_MULTI);
                if (r < 0)
                        return log_unit_error_errno(u, r,
                                                    "bpf-firewall: Attaching egress BPF program to cgroup %s failed: %m", path);

                /* Remember that this BPF program is installed now. */
                crt->ip_bpf_egress_installed = TAKE_PTR(crt->ip_bpf_egress);
        }

        if (crt->ip_bpf_ingress) {
                r = bpf_program_cgroup_attach(crt->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, BPF_F_ALLOW_MULTI);
                if (r < 0)
                        return log_unit_error_errno(u, r,
                                                    "bpf-firewall: Attaching ingress BPF program to cgroup %s failed: %m", path);

                crt->ip_bpf_ingress_installed = TAKE_PTR(crt->ip_bpf_ingress);
        }

        /* And now, definitely get rid of the old programs, and detach them */
        ip_bpf_egress_uninstall = bpf_program_free(ip_bpf_egress_uninstall);
        ip_bpf_ingress_uninstall = bpf_program_free(ip_bpf_ingress_uninstall);

        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &crt->ip_bpf_custom_egress, &crt->ip_bpf_custom_egress_installed);
        if (r < 0)
                return r;

        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &crt->ip_bpf_custom_ingress, &crt->ip_bpf_custom_ingress_installed);
        if (r < 0)
                return r;

        return 0;
}

int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

        if (ret_packets)
                *ret_packets = packets;

        return 0;
}
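
/* A minimal usage sketch (hypothetical caller; the real consumers live in the IP accounting code):
 *
 *     uint64_t bytes, packets;
 *     r = bpf_firewall_read_accounting(crt->ip_accounting_ingress_map_fd, &bytes, &packets);
 *     if (r >= 0)
 *             log_debug("ingress: %" PRIu64 " packets / %" PRIu64 " bytes", packets, bytes);
 *
 * Note that the packet counter is read into a local first, so that *ret_packets is only written
 * once both lookups have succeeded. */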

int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}

void emit_bpf_firewall_warning(Unit *u) {
        static bool warned = false;
        int r;

        assert(u);
        assert(u->manager);

        if (warned || MANAGER_IS_TEST_RUN(u->manager))
                return;

        r = bpf_program_supported();
        assert(r < 0);

        bool quiet = ERRNO_IS_NEG_PRIVILEGE(r) && detect_container() > 0;

        log_unit_full_errno(u, quiet ? LOG_DEBUG : LOG_WARNING, r,
                            "unit configures an IP firewall, but %s.\n"
                            "(This warning is only shown for the first unit using IP firewalling.)",
                            getuid() != 0 ? "not running as root" :
                                            "the local system does not support BPF/cgroup firewalling");
        warned = true;
}

void bpf_firewall_close(CGroupRuntime *crt) {
        assert(crt);

        crt->ip_accounting_ingress_map_fd = safe_close(crt->ip_accounting_ingress_map_fd);
        crt->ip_accounting_egress_map_fd = safe_close(crt->ip_accounting_egress_map_fd);

        crt->ipv4_allow_map_fd = safe_close(crt->ipv4_allow_map_fd);
        crt->ipv6_allow_map_fd = safe_close(crt->ipv6_allow_map_fd);
        crt->ipv4_deny_map_fd = safe_close(crt->ipv4_deny_map_fd);
        crt->ipv6_deny_map_fd = safe_close(crt->ipv6_deny_map_fd);

        crt->ip_bpf_ingress = bpf_program_free(crt->ip_bpf_ingress);
        crt->ip_bpf_ingress_installed = bpf_program_free(crt->ip_bpf_ingress_installed);
        crt->ip_bpf_egress = bpf_program_free(crt->ip_bpf_egress);
        crt->ip_bpf_egress_installed = bpf_program_free(crt->ip_bpf_egress_installed);

        crt->ip_bpf_custom_ingress = set_free(crt->ip_bpf_custom_ingress);
        crt->ip_bpf_custom_egress = set_free(crt->ip_bpf_custom_egress);
        crt->ip_bpf_custom_ingress_installed = set_free(crt->ip_bpf_custom_ingress_installed);
        crt->ip_bpf_custom_egress_installed = set_free(crt->ip_bpf_custom_egress_installed);
}