/* SPDX-License-Identifier: LGPL-2.1-or-later */

#include <arpa/inet.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/bpf_insn.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "fd-util.h"
#include "ip-address-access.h"
#include "memory-util.h"
#include "missing_syscall.h"
#include "unit.h"
#include "strv.h"
#include "virt.h"

enum {
        MAP_KEY_PACKETS,
        MAP_KEY_BYTES,
};

enum {
        ACCESS_ALLOWED = 1,
        ACCESS_DENIED = 2,
};

/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }

        do {
                /* Load the packet's address for this protocol and match it against the map. */
                struct bpf_insn insn[] = {
                        /* If skb->protocol != protocol, skip this whole block. The jump offset will be set later. */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                        /*
                         * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                         *
                         * R1: Pointer to the skb
                         * R2: Data offset
                         * R3: Destination buffer on the stack (r10 - addr_size)
                         * R4: Number of bytes to read (addr_size)
                         */

                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                        /*
                         * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                         * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                         * has to be set to the maximum possible value.
                         *
                         * On success, the looked up value is stored in R0. For this application, the actual
                         * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
                         * matching value.
                         */

                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),

                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;

        } while (false);

        return 0;
}
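
/* A sketch of the instruction block emitted above, shown for the IPv4 ingress
 * case (addr_size == 4; offsets relative to the frame pointer fp/R10):
 *
 *     if (r7 != htobe16(ETH_P_IP)) goto after_block;  // r7 caches skb->protocol
 *     skb_load_bytes(skb, saddr_offset, fp - 4, 4);   // copy the address to the stack
 *     *(u32 *)(fp - 8) = 32;                          // bpf_lpm_trie_key.prefixlen = max
 *     r0 = map_lookup_elem(map_fd, fp - 8);           // key = { prefixlen, address }
 *     if (r0 != 0)
 *             r8 |= verdict;                          // remember the match in R8
 */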

static int add_instructions_for_ip_any(
                BPFProgram *p,
                int verdict) {
        int r;

        assert(p);

        const struct bpf_insn insn[] = {
                BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
        };

        r = bpf_program_add_instructions(p, insn, 1);
        if (r < 0)
                return r;

        return 0;
}

static int bpf_firewall_compile_bpf(
                Unit *u,
                bool is_ingress,
                BPFProgram **ret,
                bool ip_allow_any,
                bool ip_deny_any) {

        const struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
                 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * The access checkers compiled for the configured allowance and denial lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied.
         *
         * R0 = 1
         * if (R8 == ACCESS_DENIED)
         *     R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        const struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;

        assert(u);
        assert(ret);

        accounting_map_fd = is_ingress ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        access_enabled =
                u->ipv4_allow_map_fd >= 0 ||
                u->ipv6_allow_map_fd >= 0 ||
                u->ipv4_deny_map_fd >= 0 ||
                u->ipv6_deny_map_fd >= 0 ||
                ip_allow_any ||
                ip_deny_any;

        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in @list_allow
                 * - Otherwise, access will be denied when an address matches an entry in @list_deny
                 * - Otherwise, access will be granted
                 */

                if (u->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (ip_allow_any) {
                        r = add_instructions_for_ip_any(p, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (ip_deny_any) {
                        r = add_instructions_for_ip_any(p, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd *r0 += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd *r0 += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }

        do {
                /*
                 * Exit from the eBPF program, R0 contains the verdict.
                 * 0 means the packet is denied, 1 means the packet may pass.
                 */
                const struct bpf_insn insn[] = {
                        BPF_EXIT_INSN()
                };

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        } while (false);

        *ret = TAKE_PTR(p);

        return 0;
}
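
/* Putting the pieces together, the compiled program has this shape (a sketch;
 * the lookup and accounting blocks are only emitted when the corresponding map
 * fds are valid, the "any" blocks only when configured):
 *
 *     pre_insn:    r6 = skb; r7 = skb->protocol; r8 = 0
 *     lookups:     IPv4/IPv6 deny maps, then IPv4/IPv6 allow maps, then "any"
 *     post_insn:   r0 = (r8 == ACCESS_DENIED) ? 0 : 1
 *     accounting:  if (r0 != 0) { packets++; bytes += skb->len; }
 *     exit:        return r0
 */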

static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
        IPAddressAccessItem *a;

        assert(n_ipv4);
        assert(n_ipv6);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        (*n_ipv4)++;
                        break;

                case AF_INET6:
                        (*n_ipv6)++;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_add_access_items(
                IPAddressAccessItem *list,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        uint64_t value = verdict;
        IPAddressAccessItem *a;
        int r;

        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;

                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;

                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}
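
/* Example (a sketch): with an allow list entry for 10.0.0.0/8, the loop above
 * inserts into the IPv4 LPM trie map
 *
 *     key   = { .prefixlen = 8, .data = { 10, 0, 0, 0 } }
 *     value = ACCESS_ALLOWED
 *
 * and the longest-prefix match performed by add_lookup_instructions() (with
 * the lookup key's prefixlen set to the maximum) then yields the verdict bit
 * to OR into R8 for any packet whose address falls within 10.0.0.0/8. */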

static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd,
                bool *ret_has_any) {

        _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        IPAddressAccessItem *list;
        Unit *p;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);
        assert(ret_has_any);

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                list = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny;

                bpf_firewall_count_access_items(list, &n_ipv4, &n_ipv6);

                /* Skip making the LPM trie map in cases where we are using "any" in order to hack around
                 * needing CAP_SYS_ADMIN for allocating the LPM trie map. */
                if (ip_address_access_item_is_any(list)) {
                        *ret_has_any = true;
                        return 0;
                }
        }

        if (n_ipv4 > 0) {
                ipv4_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                ipv6_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }

        *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
        *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
        *ret_has_any = false;
        return 0;
}

static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
        int r;

        assert(u);
        assert(fd_ingress);
        assert(fd_egress);

        if (enabled) {
                if (*fd_ingress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_ingress = r;
                }

                if (*fd_egress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_egress = r;
                }

        } else {
                *fd_ingress = safe_close(*fd_ingress);
                *fd_egress = safe_close(*fd_egress);

                zero(u->ip_accounting_extra);
        }

        return 0;
}

int bpf_firewall_compile(Unit *u) {
        CGroupContext *cc;
        int r, supported;
        bool ip_allow_any = false, ip_deny_any = false;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "BPF firewalling not supported on this manager, proceeding without.");
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
                /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
                 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
                 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
                 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
                 * all, either. */
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily. */

        u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);

        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
                 * means that all configured IP access rules *will* take effect on processes, even though we never
                 * compile them for inner nodes. */

                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF allow maps failed: %m");

                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF deny maps failed: %m");
        }

        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_unit_error_errno(u, r, "Preparation of eBPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation for ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation for egress BPF program failed: %m");

        return 0;
}

DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(filter_prog_hash_ops, void, trivial_hash_func, trivial_compare_func, BPFProgram, bpf_program_unref);

static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) {
        char **bpf_fs_path;

        set_clear(*set);

        STRV_FOREACH(bpf_fs_path, filter_paths) {
                _cleanup_(bpf_program_unrefp) BPFProgram *prog = NULL;
                int r;

                r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &prog);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Can't allocate CGROUP SKB BPF program: %m");

                r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Loading of BPF program %s failed: %m", *bpf_fs_path);

                r = set_ensure_consume(set, &filter_prog_hash_ops, TAKE_PTR(prog));
                if (r < 0)
                        return log_unit_error_errno(u, r, "Can't add program to BPF program set: %m");
        }

        return 0;
}

int bpf_firewall_load_custom(Unit *u) {
        CGroupContext *cc;
        int r, supported;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return 0;

        if (!(cc->ip_filters_ingress || cc->ip_filters_egress))
                return 0;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;

        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs.");

        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress);
        if (r < 0)
                return r;
        r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress);
        if (r < 0)
                return r;

        return 0;
}

static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) {
        BPFProgram *prog;
        int r;

        assert(u);

        set_clear(*set_installed);

        SET_FOREACH(prog, *set) {
                r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching custom BPF program to cgroup %s failed: %m", path);

                /* Remember that these BPF programs are installed now. */
                r = set_ensure_put(set_installed, &filter_prog_hash_ops, prog);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Can't add program to BPF program set: %m");
                bpf_program_ref(prog);
        }

        return 0;
}

int bpf_firewall_install(Unit *u) {
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        int r, supported;
        uint32_t flags;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;
        if (!u->cgroup_path)
                return -EINVAL;
        if (!u->cgroup_realized)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF firewalling not supported on this manager, proceeding without.");
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
            (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress)))
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP), "BPF_F_ALLOW_MULTI not supported on this manager, cannot attach custom BPF programs.");

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m");

        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;

        /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
         * minimize the time window when we don't account for IP traffic. */
        u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
        u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);

        if (u->ip_bpf_egress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path,
                                              flags | (set_isempty(u->ip_bpf_custom_egress) ? 0 : BPF_F_ALLOW_MULTI));
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path);

                /* Remember that this BPF program is installed now. */
                u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
        }

        if (u->ip_bpf_ingress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path,
                                              flags | (set_isempty(u->ip_bpf_custom_ingress) ? 0 : BPF_F_ALLOW_MULTI));
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path);

                u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
        }

        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed);
        if (r < 0)
                return r;

        r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed);
        if (r < 0)
                return r;

        return 0;
}

int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

        if (ret_packets)
                *ret_packets = packets;

        return 0;
}
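
/* Usage sketch (hypothetical caller): reading the ingress counters of a unit
 * that has IP accounting enabled:
 *
 *     uint64_t bytes, packets;
 *
 *     r = bpf_firewall_read_accounting(u->ip_accounting_ingress_map_fd, &bytes, &packets);
 *     if (r < 0)
 *             return r;
 */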

int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}

static int bpf_firewall_unsupported_reason = 0;

int bpf_firewall_supported(void) {
        const struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
        static int supported = -1;
        union bpf_attr attr;
        int r;

        /* Checks whether BPF firewalling is supported. For this, we check the following things:
         *
         * - whether the unified hierarchy is being used
         * - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
         * - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
         */
        if (supported >= 0)
                return supported;

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
        if (r == 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN),
                                        "Not running with unified cgroups, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
         * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
         * program if we can't do a thing with it later?
         *
         * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
         * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
         * parameters are validated however, and that'll fail with EBADF then. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
        };

        if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
                if (errno != EBADF) {
                        bpf_firewall_unsupported_reason =
                                log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
                        return supported = BPF_FIREWALL_UNSUPPORTED;
                }

                /* YAY! */
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_DETACH call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
         * (which was added in kernel 4.15). We use similar logic as before, but this time we use the BPF_PROG_ATTACH
         * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
         * get EINVAL if it's not supported, and EBADF as before if it is available. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
                .attach_flags = BPF_F_ALLOW_MULTI,
        };

        if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
                if (errno == EBADF) {
                        log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
                        return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
                }

                if (errno == EINVAL)
                        log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
                else
                        log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");

                return supported = BPF_FIREWALL_SUPPORTED;
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }
}
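
/* Summary of the probing above (a sketch of the decision table):
 *
 *     BPF_PROG_DETACH with invalid fds  -> EINVAL: CONFIG_CGROUP_BPF off -> BPF_FIREWALL_UNSUPPORTED
 *                                       -> EBADF:  cgroup BPF available, keep probing
 *     BPF_PROG_ATTACH + ALLOW_MULTI     -> EINVAL: flag unknown          -> BPF_FIREWALL_SUPPORTED
 *                                       -> EBADF:  flag validated        -> BPF_FIREWALL_SUPPORTED_WITH_MULTI
 */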

void emit_bpf_firewall_warning(Unit *u) {
        static bool warned = false;

        if (!warned) {
                bool quiet = bpf_firewall_unsupported_reason == -EPERM && detect_container();

                log_unit_full_errno(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason,
                                    "unit configures an IP firewall, but %s.\n"
                                    "(This warning is only shown for the first unit using IP firewalling.)",
                                    getuid() != 0 ? "not running as root" :
                                    "the local system does not support BPF/cgroup firewalling");
                warned = true;
        }
}