1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #include <arpa/inet.h>
4 #include <assert.h>
5 #include <errno.h>
6 #include <fcntl.h>
7 #include <linux/bpf_insn.h>
8 #include <net/ethernet.h>
9 #include <net/if.h>
10 #include <netinet/ip.h>
11 #include <netinet/ip6.h>
12 #include <stddef.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <unistd.h>
17
18 #include "alloc-util.h"
19 #include "bpf-firewall.h"
20 #include "bpf-program.h"
21 #include "fd-util.h"
22 #include "ip-address-access.h"
23 #include "memory-util.h"
24 #include "missing_syscall.h"
25 #include "strv.h"
26 #include "unit.h"
27 #include "virt.h"
28
29 enum {
30 MAP_KEY_PACKETS,
31 MAP_KEY_BYTES,
32 };
33
34 enum {
35 ACCESS_ALLOWED = 1,
36 ACCESS_DENIED = 2,
37 };
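/* These two values are used as bit flags: the compiled lookup code below ORs them into scratch register
 * R8 as matches are found, so at the end R8 is 0 (no match), ACCESS_ALLOWED, ACCESS_DENIED, or both.
 * The verdict logic in bpf_firewall_compile_bpf() only denies the packet when R8 equals ACCESS_DENIED
 * exactly, which is why an address matching both an allow and a deny entry is still let through. */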
38
39 /* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */
40
41 static int add_lookup_instructions(
42 BPFProgram *p,
43 int map_fd,
44 int protocol,
45 bool is_ingress,
46 int verdict) {
47
48 int r, addr_offset, addr_size;
49
50 assert(p);
51 assert(map_fd >= 0);
52
53 switch (protocol) {
54
55 case ETH_P_IP:
56 addr_size = sizeof(uint32_t);
57 addr_offset = is_ingress ?
58 offsetof(struct iphdr, saddr) :
59 offsetof(struct iphdr, daddr);
60 break;
61
62 case ETH_P_IPV6:
63 addr_size = 4 * sizeof(uint32_t);
64 addr_offset = is_ingress ?
65 offsetof(struct ip6_hdr, ip6_src.s6_addr) :
66 offsetof(struct ip6_hdr, ip6_dst.s6_addr);
67 break;
68
69 default:
70 return -EAFNOSUPPORT;
71 }
72
73 do {
74 /* Load the address at the configured offset and look it up in the LPM trie map for this protocol. */
75 struct bpf_insn insn[] = {
76 /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */
77 BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),
78
79 /*
80 * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
81 *
82 * R1: Pointer to the skb
83 * R2: Data offset
84 * R3: Destination buffer on the stack (r10 - addr_size)
85 * R4: Number of bytes to read (addr_size)
86 */
87
88 BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
89 BPF_MOV32_IMM(BPF_REG_2, addr_offset),
90
91 BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
92 BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),
93
94 BPF_MOV32_IMM(BPF_REG_4, addr_size),
95 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
96
97 /*
98 * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
99 * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
100 * has to be set to the maximum possible value.
101 *
102 * On success, the looked up value is stored in R0. For this application, the actual
103 * value doesn't matter, however; we just OR the @verdict bit into R8 if any matching
104 * entry was found.
105 */
106
107 BPF_LD_MAP_FD(BPF_REG_1, map_fd),
108 BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
109 BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
110 BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),
111
112 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
113 BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
114 BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
115 };
116
117 /* Jump label fixup */
118 insn[0].off = ELEMENTSOF(insn) - 1;
119
120 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
121 if (r < 0)
122 return r;
123
124 } while (false);
125
126 return 0;
127 }
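/*
 * For reference, the key the block above assembles on the stack before calling
 * BPF_FUNC_map_lookup_elem is the in-memory form of struct bpf_lpm_trie_key:
 *
 *     fp - addr_size - 4:  u32 prefixlen = addr_size * 8   (maximum possible prefix length)
 *     fp - addr_size:      u8  data[addr_size]             (saddr/daddr copied via skb_load_bytes)
 *
 * R2 points at fp - addr_size - 4 when the lookup helper is invoked, so the LPM trie sees a full-length
 * key and performs the longest-prefix match against the configured entries itself.
 */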
128
129 static int add_instructions_for_ip_any(
130 BPFProgram *p,
131 int verdict) {
132 int r;
133
134 assert(p);
135
136 struct bpf_insn insn[] = {
137 BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
138 };
139
140 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
141 if (r < 0)
142 return r;
143
144 return 0;
145 }
146
147 static int bpf_firewall_compile_bpf(
148 Unit *u,
149 bool is_ingress,
150 BPFProgram **ret,
151 bool ip_allow_any,
152 bool ip_deny_any) {
153
154 struct bpf_insn pre_insn[] = {
155 /*
156 * When the eBPF program is entered, R1 contains the address of the skb.
157 * However, R1-R5 are scratch registers that are not preserved when calling
158 * into kernel functions, so we need to save anything that's supposed to
159 * stay around to R6-R9. Save the skb to R6.
160 */
161 BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
162
163 /*
164 * Although we cannot access the skb data directly from eBPF programs used in this
165 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
166 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
167 * for later use.
168 */
169 BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),
170
171 /*
172 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
173 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
174 */
175 BPF_MOV32_IMM(BPF_REG_8, 0),
176 };
177
178 /*
179 * The access checkers compiled for the configured allowance and denial lists
180 * write to R8 at runtime. The following code prepares for an early exit that
181 * skips the accounting if the packet is denied.
182 *
183 * R0 = 1
184 * if (R8 == ACCESS_DENIED)
185 * R0 = 0
186 *
187 * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
188 * is allowed to pass.
189 */
190 struct bpf_insn post_insn[] = {
191 BPF_MOV64_IMM(BPF_REG_0, 1),
192 BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
193 BPF_MOV64_IMM(BPF_REG_0, 0),
194 };
195
196 _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
197 int accounting_map_fd, r;
198 bool access_enabled;
199
200 assert(u);
201 assert(ret);
202
203 accounting_map_fd = is_ingress ?
204 u->ip_accounting_ingress_map_fd :
205 u->ip_accounting_egress_map_fd;
206
207 access_enabled =
208 u->ipv4_allow_map_fd >= 0 ||
209 u->ipv6_allow_map_fd >= 0 ||
210 u->ipv4_deny_map_fd >= 0 ||
211 u->ipv6_deny_map_fd >= 0 ||
212 ip_allow_any ||
213 ip_deny_any;
214
215 if (accounting_map_fd < 0 && !access_enabled) {
216 *ret = NULL;
217 return 0;
218 }
219
220 r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
221 if (r < 0)
222 return r;
223
224 r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
225 if (r < 0)
226 return r;
227
228 if (access_enabled) {
229 /*
230 * The simple rule this function translates into eBPF instructions is:
231 *
232 * - Access will be granted when an address matches an entry in @list_allow
233 * - Otherwise, access will be denied when an address matches an entry in @list_deny
234 * - Otherwise, access will be granted
235 */
236
237 if (u->ipv4_deny_map_fd >= 0) {
238 r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
239 if (r < 0)
240 return r;
241 }
242
243 if (u->ipv6_deny_map_fd >= 0) {
244 r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
245 if (r < 0)
246 return r;
247 }
248
249 if (u->ipv4_allow_map_fd >= 0) {
250 r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
251 if (r < 0)
252 return r;
253 }
254
255 if (u->ipv6_allow_map_fd >= 0) {
256 r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
257 if (r < 0)
258 return r;
259 }
260
261 if (ip_allow_any) {
262 r = add_instructions_for_ip_any(p, ACCESS_ALLOWED);
263 if (r < 0)
264 return r;
265 }
266
267 if (ip_deny_any) {
268 r = add_instructions_for_ip_any(p, ACCESS_DENIED);
269 if (r < 0)
270 return r;
271 }
272 }
273
274 r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
275 if (r < 0)
276 return r;
277
278 if (accounting_map_fd >= 0) {
279 struct bpf_insn insn[] = {
280 /*
281 * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
282 * The jump label will be fixed up later.
283 */
284 BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),
285
286 /* Count packets */
287 BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
288 BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
289 BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
290 BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
291 BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
292 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
293 BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
294 BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
295 BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* atomic: *(u64 *)(r0 + 0) += r1 */
296
297 /* Count bytes */
298 BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
299 BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
300 BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
301 BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
302 BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
303 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
304 BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
305 BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
306 BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* atomic: *(u64 *)(r0 + 0) += r1 */
307
308 /* Allow the packet to pass */
309 BPF_MOV64_IMM(BPF_REG_0, 1),
310 };
311
312 /* Jump label fixup */
313 insn[0].off = ELEMENTSOF(insn) - 1;
314
315 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
316 if (r < 0)
317 return r;
318 }
319
320 do {
321 /*
322 * Exit from the eBPF program, R0 contains the verdict.
323 * 0 means the packet is denied, 1 means the packet may pass.
324 */
325 struct bpf_insn insn[] = {
326 BPF_EXIT_INSN()
327 };
328
329 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
330 if (r < 0)
331 return r;
332 } while (false);
333
334 *ret = TAKE_PTR(p);
335
336 return 0;
337 }
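/*
 * Putting the pieces together, the generated program has roughly this shape (a sketch, not a literal
 * disassembly):
 *
 *     r6 = skb; r7 = skb->protocol; r8 = 0                  // pre_insn
 *     [LPM lookups for the deny/allow maps, ORing ACCESS_DENIED/ACCESS_ALLOWED into r8]
 *     r0 = (r8 == ACCESS_DENIED) ? 0 : 1                    // post_insn
 *     if (r0 != 0 && accounting enabled) bump packet and byte counters
 *     exit                                                  // r0: 0 = drop, 1 = pass
 */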
338
339 static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
340 IPAddressAccessItem *a;
341
342 assert(n_ipv4);
343 assert(n_ipv6);
344
345 LIST_FOREACH(items, a, list) {
346 switch (a->family) {
347
348 case AF_INET:
349 (*n_ipv4)++;
350 break;
351
352 case AF_INET6:
353 (*n_ipv6)++;
354 break;
355
356 default:
357 return -EAFNOSUPPORT;
358 }
359 }
360
361 return 0;
362 }
363
364 static int bpf_firewall_add_access_items(
365 IPAddressAccessItem *list,
366 int ipv4_map_fd,
367 int ipv6_map_fd,
368 int verdict) {
369
370 struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
371 uint64_t value = verdict;
372 IPAddressAccessItem *a;
373 int r;
374
375 key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
376 key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);
377
378 LIST_FOREACH(items, a, list) {
379 switch (a->family) {
380
381 case AF_INET:
382 key_ipv4->prefixlen = a->prefixlen;
383 memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));
384
385 r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
386 if (r < 0)
387 return r;
388
389 break;
390
391 case AF_INET6:
392 key_ipv6->prefixlen = a->prefixlen;
393 memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));
394
395 r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
396 if (r < 0)
397 return r;
398
399 break;
400
401 default:
402 return -EAFNOSUPPORT;
403 }
404 }
405
406 return 0;
407 }
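/*
 * As a purely illustrative example (hypothetical configuration, not taken from this file): a deny-list
 * entry covering 10.0.0.0/8 would end up in the IPv4 map as
 *
 *     key_ipv4->prefixlen = 8;
 *     memcpy(key_ipv4->data, (uint8_t[]) { 10, 0, 0, 0 }, sizeof(uint32_t));
 *     value = ACCESS_DENIED;
 *
 * so that the LPM trie lookup in the generated program matches any address within 10.0.0.0/8.
 */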
408
409 static int bpf_firewall_prepare_access_maps(
410 Unit *u,
411 int verdict,
412 int *ret_ipv4_map_fd,
413 int *ret_ipv6_map_fd,
414 bool *ret_has_any) {
415
416 _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
417 size_t n_ipv4 = 0, n_ipv6 = 0;
418 IPAddressAccessItem *list;
419 Unit *p;
420 int r;
421
422 assert(ret_ipv4_map_fd);
423 assert(ret_ipv6_map_fd);
424 assert(ret_has_any);
425
426 for (p = u; p; p = UNIT_DEREF(p->slice)) {
427 CGroupContext *cc;
428
429 cc = unit_get_cgroup_context(p);
430 if (!cc)
431 continue;
432
433 list = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny;
434
435 bpf_firewall_count_access_items(list, &n_ipv4, &n_ipv6);
436
437 /* Skip creating the LPM trie map when an "any" entry is used, in order to work around
438 * the need for CAP_SYS_ADMIN when allocating LPM trie maps. */
439 if (ip_address_access_item_is_any(list)) {
440 *ret_has_any = true;
441 return 0;
442 }
443 }
444
445 if (n_ipv4 > 0) {
446 ipv4_map_fd = bpf_map_new(
447 BPF_MAP_TYPE_LPM_TRIE,
448 offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
449 sizeof(uint64_t),
450 n_ipv4,
451 BPF_F_NO_PREALLOC);
452 if (ipv4_map_fd < 0)
453 return ipv4_map_fd;
454 }
455
456 if (n_ipv6 > 0) {
457 ipv6_map_fd = bpf_map_new(
458 BPF_MAP_TYPE_LPM_TRIE,
459 offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
460 sizeof(uint64_t),
461 n_ipv6,
462 BPF_F_NO_PREALLOC);
463 if (ipv6_map_fd < 0)
464 return ipv6_map_fd;
465 }
466
467 for (p = u; p; p = UNIT_DEREF(p->slice)) {
468 CGroupContext *cc;
469
470 cc = unit_get_cgroup_context(p);
471 if (!cc)
472 continue;
473
474 r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
475 ipv4_map_fd, ipv6_map_fd, verdict);
476 if (r < 0)
477 return r;
478 }
479
480 *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
481 *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
482 *ret_has_any = false;
483 return 0;
484 }
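/* Note that both loops above walk from the unit up through all of its parent slices, so the maps of a
 * leaf unit end up containing its own allow/deny entries plus those configured on every ancestor slice.
 * This matches the "leaf nodes incorporate the rules of their parents" behaviour described further down
 * in bpf_firewall_compile(). */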
485
486 static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
487 int r;
488
489 assert(u);
490 assert(fd_ingress);
491 assert(fd_egress);
492
493 if (enabled) {
494 if (*fd_ingress < 0) {
495 r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
496 if (r < 0)
497 return r;
498
499 *fd_ingress = r;
500 }
501
502 if (*fd_egress < 0) {
503
504 r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
505 if (r < 0)
506 return r;
507
508 *fd_egress = r;
509 }
510
511 } else {
512 *fd_ingress = safe_close(*fd_ingress);
513 *fd_egress = safe_close(*fd_egress);
514
515 zero(u->ip_accounting_extra);
516 }
517
518 return 0;
519 }
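/*
 * Each accounting map is a plain BPF_MAP_TYPE_ARRAY with two uint64_t slots, indexed by the MAP_KEY_*
 * constants defined at the top of this file:
 *
 *     [MAP_KEY_PACKETS] -> number of packets that passed
 *     [MAP_KEY_BYTES]   -> number of bytes that passed
 *
 * The compiled program bumps them with atomic XADD instructions; bpf_firewall_read_accounting() and
 * bpf_firewall_reset_accounting() below access them from userspace.
 */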
520
521 int bpf_firewall_compile(Unit *u) {
522 CGroupContext *cc;
523 int r, supported;
524 bool ip_allow_any = false, ip_deny_any = false;
525
526 assert(u);
527
528 cc = unit_get_cgroup_context(u);
529 if (!cc)
530 return -EINVAL;
531
532 supported = bpf_firewall_supported();
533 if (supported < 0)
534 return supported;
535 if (supported == BPF_FIREWALL_UNSUPPORTED)
536 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
537 "BPF firewalling not supported on this manager, proceeding without.");
538 if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
539 /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
540 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
541 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
542 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
543 * all, either. */
544 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
545 "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
546
547 /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
548 * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
549 * configuration, but we don't flush out the accounting unnecessarily. */
550
551 u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
552 u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);
553
554 u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
555 u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
556
557 u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
558 u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
559
560 if (u->type != UNIT_SLICE) {
561 /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
562 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
563 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes, this
564 * means that all configured IP access rules *will* take effect on processes, even though we never
565 * compile them for inner nodes. */
566
567 r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any);
568 if (r < 0)
569 return log_unit_error_errno(u, r, "Preparation of eBPF allow maps failed: %m");
570
571 r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any);
572 if (r < 0)
573 return log_unit_error_errno(u, r, "Preparation of eBPF deny maps failed: %m");
574 }
575
576 r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
577 if (r < 0)
578 return log_unit_error_errno(u, r, "Preparation of eBPF accounting maps failed: %m");
579
580 r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any);
581 if (r < 0)
582 return log_unit_error_errno(u, r, "Compilation for ingress BPF program failed: %m");
583
584 r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any);
585 if (r < 0)
586 return log_unit_error_errno(u, r, "Compilation for egress BPF program failed: %m");
587
588 return 0;
589 }
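/*
 * A minimal sketch of how these entry points fit together (the actual call sites live in the cgroup
 * handling code, so the exact sequence shown here is an assumption, not taken from this file):
 *
 *     r = bpf_firewall_compile(u);             // build programs + maps from the unit's cgroup context
 *     if (r >= 0)
 *             r = bpf_firewall_load_custom(u); // load user-supplied programs pinned in the bpf fs
 *     if (r >= 0)
 *             r = bpf_firewall_install(u);     // attach everything to the unit's realized cgroup
 */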
590
591 DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(filter_prog_hash_ops, void, trivial_hash_func, trivial_compare_func, BPFProgram, bpf_program_unref);
592
593 static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) {
594 char **bpf_fs_path;
595
596 set_clear(*set);
597
598 STRV_FOREACH(bpf_fs_path, filter_paths) {
599 _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
600 int r;
601
602 r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &prog);
603 if (r < 0)
604 return log_unit_error_errno(u, r, "Can't allocate CGROUP SKB BPF program: %m");
605
606 r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path);
607 if (r < 0)
608 return log_unit_error_errno(u, r, "Loading of BPF program %s failed: %m", *bpf_fs_path);
609
610 r = set_ensure_allocated(set, &filter_prog_hash_ops);
611 if (r < 0)
612 return log_unit_error_errno(u, r, "Can't allocate BPF program set: %m");
613
614 r = set_put(*set, prog);
615 if (r < 0)
616 return log_unit_error_errno(u, r, "Can't add program to BPF program set: %m");
617 TAKE_PTR(prog);
618 }
619
620 return 0;
621 }
622
623 int bpf_firewall_load_custom(Unit *u) {
624 CGroupContext *cc;
625 int r, supported;
626
627 assert(u);
628
629 cc = unit_get_cgroup_context(u);
630 if (!cc)
631 return 0;
632
633 if (!(cc->ip_filters_ingress || cc->ip_filters_egress))
634 return 0;
635
636 supported = bpf_firewall_supported();
637 if (supported < 0)
638 return supported;
639
640 if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI)
641 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs.");
642
643 r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress);
644 if (r < 0)
645 return r;
646 r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress);
647 if (r < 0)
648 return r;
649
650 return 0;
651 }
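/* The ip_filters_ingress/ip_filters_egress string lists hold paths of BPF programs pinned in the bpf
 * file system (e.g. somewhere under /sys/fs/bpf/). Presumably they are populated from the unit's
 * IPIngressFilterPath=/IPEgressFilterPath= settings, but the directive names are not visible from this
 * file. */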
652
653 static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) {
654 BPFProgram *prog;
655 Iterator i;
656 int r;
657
658 assert(u);
659
660 set_clear(*set_installed);
661
662 SET_FOREACH(prog, *set, i) {
663 r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI);
664 if (r < 0)
665 return log_unit_error_errno(u, r, "Attaching custom BPF program to cgroup %s failed: %m", path);
666 /* Remember that these BPF programs are installed now. */
667 r = set_ensure_allocated(set_installed, &filter_prog_hash_ops);
668 if (r < 0)
669 return log_unit_error_errno(u, r, "Can't allocate BPF program set: %m");
670
671 r = set_put(*set_installed, prog);
672 if (r < 0)
673 return log_unit_error_errno(u, r, "Can't add program to BPF program set: %m");
674 bpf_program_ref(prog);
675 }
676
677 return 0;
678 }
679
680 int bpf_firewall_install(Unit *u) {
681 _cleanup_free_ char *path = NULL;
682 CGroupContext *cc;
683 int r, supported;
684 uint32_t flags;
685
686 assert(u);
687
688 cc = unit_get_cgroup_context(u);
689 if (!cc)
690 return -EINVAL;
691 if (!u->cgroup_path)
692 return -EINVAL;
693 if (!u->cgroup_realized)
694 return -EINVAL;
695
696 supported = bpf_firewall_supported();
697 if (supported < 0)
698 return supported;
699 if (supported == BPF_FIREWALL_UNSUPPORTED) {
700 log_unit_debug(u, "BPF firewalling not supported on this manager, proceeding without.");
701 return -EOPNOTSUPP;
702 }
703 if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
704 log_unit_debug(u, "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
705 return -EOPNOTSUPP;
706 }
707 if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
708 (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress)))
709 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs.");
710
711 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
712 if (r < 0)
713 return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m");
714
715 flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
716 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;
717
718 /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
719 * minimize the time window when we don't account for IP traffic. */
720 u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
721 u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);
722
723 if (u->ip_bpf_egress) {
724 r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path,
725 flags | (set_isempty(u->ip_bpf_custom_egress) ? 0 : BPF_F_ALLOW_MULTI));
726 if (r < 0)
727 return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path);
728
729 /* Remember that this BPF program is installed now. */
730 u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
731 }
732
733 if (u->ip_bpf_ingress) {
734 r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path,
735 flags | (set_isempty(u->ip_bpf_custom_ingress) ? 0 : BPF_F_ALLOW_MULTI));
736 if (r < 0)
737 return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path);
738
739 u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
740 }
741
742 r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed);
743 if (r < 0)
744 return r;
745
746 r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed);
747 if (r < 0)
748 return r;
749
750 return 0;
751 }
752
753 int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
754 uint64_t key, packets;
755 int r;
756
757 if (map_fd < 0)
758 return -EBADF;
759
760 if (ret_packets) {
761 key = MAP_KEY_PACKETS;
762 r = bpf_map_lookup_element(map_fd, &key, &packets);
763 if (r < 0)
764 return r;
765 }
766
767 if (ret_bytes) {
768 key = MAP_KEY_BYTES;
769 r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
770 if (r < 0)
771 return r;
772 }
773
774 if (ret_packets)
775 *ret_packets = packets;
776
777 return 0;
778 }
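/* Typical use (a sketch, assuming the ingress accounting map fd has been set up by
 * bpf_firewall_prepare_accounting_maps()):
 *
 *     uint64_t bytes = 0, packets = 0;
 *     r = bpf_firewall_read_accounting(u->ip_accounting_ingress_map_fd, &bytes, &packets);
 *     if (r < 0)
 *             return r;
 */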
779
780 int bpf_firewall_reset_accounting(int map_fd) {
781 uint64_t key, value = 0;
782 int r;
783
784 if (map_fd < 0)
785 return -EBADF;
786
787 key = MAP_KEY_PACKETS;
788 r = bpf_map_update_element(map_fd, &key, &value);
789 if (r < 0)
790 return r;
791
792 key = MAP_KEY_BYTES;
793 return bpf_map_update_element(map_fd, &key, &value);
794 }
795
796 static int bpf_firewall_unsupported_reason = 0;
797
798 int bpf_firewall_supported(void) {
799 struct bpf_insn trivial[] = {
800 BPF_MOV64_IMM(BPF_REG_0, 1),
801 BPF_EXIT_INSN()
802 };
803
804 _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
805 static int supported = -1;
806 union bpf_attr attr;
807 int r;
808
809 /* Checks whether BPF firewalling is supported. For this, we check the following things:
810 *
811 * - whether the unified hierarchy is being used
812 * - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
813 * - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
814 */
815 if (supported >= 0)
816 return supported;
817
818 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
819 if (r < 0)
820 return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
821 if (r == 0) {
822 bpf_firewall_unsupported_reason =
823 log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN),
824 "Not running with unified cgroups, BPF firewalling is not supported.");
825 return supported = BPF_FIREWALL_UNSUPPORTED;
826 }
827
828 r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
829 if (r < 0) {
830 bpf_firewall_unsupported_reason =
831 log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
832 return supported = BPF_FIREWALL_UNSUPPORTED;
833 }
834
835 r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
836 if (r < 0) {
837 bpf_firewall_unsupported_reason =
838 log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
839 return supported = BPF_FIREWALL_UNSUPPORTED;
840 }
841
842 r = bpf_program_load_kernel(program, NULL, 0);
843 if (r < 0) {
844 bpf_firewall_unsupported_reason =
845 log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
846 return supported = BPF_FIREWALL_UNSUPPORTED;
847 }
848
849 /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
850 * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
851 * program if we can't do a thing with it later?
852 *
853 * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
854 * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
855 * parameters are validated however, and that'll fail with EBADF then. */
856
857 attr = (union bpf_attr) {
858 .attach_type = BPF_CGROUP_INET_EGRESS,
859 .target_fd = -1,
860 .attach_bpf_fd = -1,
861 };
862
863 if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
864 if (errno != EBADF) {
865 bpf_firewall_unsupported_reason =
866 log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
867 return supported = BPF_FIREWALL_UNSUPPORTED;
868 }
869
870 /* YAY! */
871 } else {
872 log_debug("Wut? Kernel accepted our invalid BPF_PROG_DETACH call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
873 return supported = BPF_FIREWALL_UNSUPPORTED;
874 }
875
876 /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
877 * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH
878 * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
879 * get EINVAL if it's not supported, and EBADF as before if it is available. */
880
881 attr = (union bpf_attr) {
882 .attach_type = BPF_CGROUP_INET_EGRESS,
883 .target_fd = -1,
884 .attach_bpf_fd = -1,
885 .attach_flags = BPF_F_ALLOW_MULTI,
886 };
887
888 if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
889 if (errno == EBADF) {
890 log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
891 return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
892 }
893
894 if (errno == EINVAL)
895 log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
896 else
897 log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");
898
899 return supported = BPF_FIREWALL_SUPPORTED;
900 } else {
901 log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
902 return supported = BPF_FIREWALL_UNSUPPORTED;
903 }
904 }
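/*
 * To summarize the probing above: the cached result is BPF_FIREWALL_UNSUPPORTED if the unified cgroup
 * hierarchy, CGROUP_SKB programs or CONFIG_CGROUP_BPF are unavailable, BPF_FIREWALL_SUPPORTED if
 * programs can be attached but BPF_F_ALLOW_MULTI is not available, and BPF_FIREWALL_SUPPORTED_WITH_MULTI
 * if BPF_F_ALLOW_MULTI (added in kernel 4.15) works as well.
 */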
905
906 void emit_bpf_firewall_warning(Unit *u) {
907 static bool warned = false;
908
909 if (!warned) {
910 bool quiet = bpf_firewall_unsupported_reason == -EPERM && detect_container();
911
912 log_unit_full(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason,
913 "unit configures an IP firewall, but %s.\n"
914 "(This warning is only shown for the first unit using IP firewalling.)",
915 getuid() != 0 ? "not running as root" :
916 "the local system does not support BPF/cgroup firewalling");
917 warned = true;
918 }
919 }