/* SPDX-License-Identifier: LGPL-2.1+ */

#include <arpa/inet.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/bpf_insn.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "fd-util.h"
#include "ip-address-access.h"
#include "memory-util.h"
#include "missing_syscall.h"
#include "unit.h"
#include "virt.h"

enum {
        MAP_KEY_PACKETS,
        MAP_KEY_BYTES,
};

enum {
        ACCESS_ALLOWED = 1,
        ACCESS_DENIED = 2,
};
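
/* ACCESS_ALLOWED and ACCESS_DENIED are used as bit values: the compiled program ORs them into its
 * verdict register whenever an address matches an entry in the corresponding allow/deny map; see
 * bpf_firewall_compile_bpf() below. */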

/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }

        do {
                /* Compare the protocol with a single word (32-bit) instruction */
                struct bpf_insn insn[] = {
                        /* If skb->protocol doesn't match the protocol this block was compiled for, skip the
                         * whole block. The jump offset will be set later. */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                        /*
                         * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                         *
                         * R1: Pointer to the skb
                         * R2: Data offset
                         * R3: Destination buffer on the stack (r10 - addr_size)
                         * R4: Number of bytes to read (addr_size)
                         */

                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                        /*
                         * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                         * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                         * has to be set to the maximum possible value.
                         *
                         * On success, the looked up value is stored in R0. For this application, the actual
                         * value doesn't matter, however; we just OR @verdict into R8 if we found any
                         * matching entry.
                         */

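                        /*
                         * Note the key layout assumed here: 'struct bpf_lpm_trie_key' is a 32-bit prefixlen
                         * immediately followed by the address bytes, and BPF_FUNC_skb_load_bytes already
                         * stored the address at r10 - addr_size. Hence R2 = r10 - addr_size - 4 points at the
                         * start of the key, and storing addr_size * 8 there sets prefixlen to the full
                         * address length.
                         */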
                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),

                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;

        } while (false);

        return 0;
}

static int bpf_firewall_compile_bpf(
                Unit *u,
                bool is_ingress,
                BPFProgram **ret) {

        struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
                 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * The access checkers compiled for the configured allow and deny lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied.
         *
         *   R0 = 1
         *   if (R8 == ACCESS_DENIED)
         *           R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;

        assert(u);
        assert(ret);

        accounting_map_fd = is_ingress ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        access_enabled =
                u->ipv4_allow_map_fd >= 0 ||
                u->ipv6_allow_map_fd >= 0 ||
                u->ipv4_deny_map_fd >= 0 ||
                u->ipv6_deny_map_fd >= 0;

        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in the allow maps
                 * - Otherwise, access will be denied when an address matches an entry in the deny maps
                 * - Otherwise, access will be granted
                 */

                if (u->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* atomic: *(u64 *)(r0 + 0) += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* atomic: *(u64 *)(r0 + 0) += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }

        do {
                /*
                 * Exit from the eBPF program, R0 contains the verdict.
                 * 0 means the packet is denied, 1 means the packet may pass.
                 */
                struct bpf_insn insn[] = {
                        BPF_EXIT_INSN()
                };

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        } while (false);

        *ret = TAKE_PTR(p);

        return 0;
}
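
/*
 * Taken together, the program assembled above roughly corresponds to the following sketch
 * (pseudo code, not literal eBPF):
 *
 *         r6 = skb; r7 = skb->protocol; r8 = 0;
 *         if (src/dst address matches a deny map entry)   r8 |= ACCESS_DENIED;
 *         if (src/dst address matches an allow map entry) r8 |= ACCESS_ALLOWED;
 *         r0 = (r8 == ACCESS_DENIED) ? 0 : 1;
 *         if (r0 != 0 && accounting is enabled) {
 *                 packets++;
 *                 bytes += skb->len;
 *                 r0 = 1;
 *         }
 *         return r0;                  (0 = drop, 1 = pass)
 */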

static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
        IPAddressAccessItem *a;

        assert(n_ipv4);
        assert(n_ipv6);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        (*n_ipv4)++;
                        break;

                case AF_INET6:
                        (*n_ipv6)++;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_add_access_items(
                IPAddressAccessItem *list,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        uint64_t value = verdict;
        IPAddressAccessItem *a;
        int r;

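        /* Allocate buffers for the largest possible keys once on the stack and reuse them for every list
         * entry below. */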
        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;

                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;

                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd) {

        _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        Unit *p;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);

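        /* Two passes over the unit and each of the slices it is nested in: first count the relevant entries,
         * so that the LPM trie maps below can be sized correctly, then (in the second loop) add the actual
         * entries to the maps. */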
        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                bpf_firewall_count_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, &n_ipv4, &n_ipv6);
        }

        if (n_ipv4 > 0) {
                ipv4_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                ipv6_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }

        *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
        *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
        return 0;
}

static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
        int r;

        assert(u);
        assert(fd_ingress);
        assert(fd_egress);

        if (enabled) {
                if (*fd_ingress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_ingress = r;
                }

                if (*fd_egress < 0) {

                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_egress = r;
                }

        } else {
                *fd_ingress = safe_close(*fd_ingress);
                *fd_egress = safe_close(*fd_egress);

                zero(u->ip_accounting_extra);
        }

        return 0;
}

int bpf_firewall_compile(Unit *u) {
        CGroupContext *cc;
        int r, supported;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED)
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "BPF firewalling not supported on this manager, proceeding without.");
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
                /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
                 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
                 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
                 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
                 * all, either. */
                return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
                                            "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily. */

        u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);

        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
                 * means that all configured IP access rules *will* take effect on processes, even though we never
                 * compile them for inner nodes. */

                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF allow maps failed: %m");

                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Preparation of eBPF deny maps failed: %m");
        }

        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_unit_error_errno(u, r, "Preparation of eBPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation of ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress);
        if (r < 0)
                return log_unit_error_errno(u, r, "Compilation of egress BPF program failed: %m");

        return 0;
}

int bpf_firewall_install(Unit *u) {
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        int r, supported;
        uint32_t flags;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;
        if (!u->cgroup_path)
                return -EINVAL;
        if (!u->cgroup_realized)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_unit_debug(u, "BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                log_unit_debug(u, "BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_unit_error_errno(u, r, "Failed to determine cgroup path: %m");

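        /* Use BPF_F_ALLOW_MULTI (if the kernel supports it) wherever other BPF programs may legitimately be
         * attached in the same part of the cgroup tree: on slices, whose child cgroups carry their own
         * programs, and on delegated cgroups, whose payload may attach programs of its own. That way our
         * program coexists with those instead of displacing them. */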
        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;

        /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
         * minimize the time window when we don't account for IP traffic. */
        u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
        u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);

        if (u->ip_bpf_egress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching egress BPF program to cgroup %s failed: %m", path);

                /* Remember that this BPF program is installed now. */
                u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
        }

        if (u->ip_bpf_ingress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
                if (r < 0)
                        return log_unit_error_errno(u, r, "Attaching ingress BPF program to cgroup %s failed: %m", path);

                u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
        }

        return 0;
}

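/* Read the counters out of one of the per-unit accounting maps set up in
 * bpf_firewall_prepare_accounting_maps(): a two-element BPF_MAP_TYPE_ARRAY of uint64_t values indexed by
 * MAP_KEY_PACKETS and MAP_KEY_BYTES, which the compiled program increments for every packet it lets
 * through. The packet counter is read into a local first, so that *ret_packets is only written once both
 * lookups have succeeded. */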
int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

        if (ret_packets)
                *ret_packets = packets;

        return 0;
}

int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}

static int bpf_firewall_unsupported_reason = 0;

int bpf_firewall_supported(void) {
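        /* A trivial program that unconditionally returns 1 ("allow"). It is never attached anywhere; we only
         * use it below to probe whether BPF_PROG_TYPE_CGROUP_SKB programs can be created and loaded on this
         * kernel at all. */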
        struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
        static int supported = -1;
        union bpf_attr attr;
        int fd, r;

        /* Checks whether BPF firewalling is supported. For this, we check five things:
         *
         * a) whether we are privileged
         * b) whether the unified hierarchy is being used
         * c) the BPF implementation in the kernel supports BPF LPM TRIE maps, which we require
         * d) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
         * e) the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
         */

        if (supported >= 0)
                return supported;

        if (geteuid() != 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(SYNTHETIC_ERRNO(EACCES),
                                        "Not enough privileges, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
        if (r == 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN),
                                        "Not running with unified cgroups, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        fd = bpf_map_new(BPF_MAP_TYPE_LPM_TRIE,
                         offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint64_t),
                         sizeof(uint64_t),
                         1,
                         BPF_F_NO_PREALLOC);
        if (fd < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(fd, "Can't allocate BPF LPM TRIE map, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        safe_close(fd);

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                bpf_firewall_unsupported_reason =
                        log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
         * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
         * program if we can't do a thing with it later?
         *
         * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
         * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on, the
         * parameters are validated however, and that'll fail with EBADF then. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
        };

        if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
                if (errno != EBADF) {
                        bpf_firewall_unsupported_reason =
                                log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
                        return supported = BPF_FIREWALL_UNSUPPORTED;
                }

                /* YAY! */
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_DETACH call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
         * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH
         * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
         * get EINVAL if it's not supported, and EBADF as before if it is available. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
                .attach_flags = BPF_F_ALLOW_MULTI,
        };

        if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
                if (errno == EBADF) {
                        log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
                        return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
                }

                if (errno == EINVAL)
                        log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
                else
                        log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");

                return supported = BPF_FIREWALL_SUPPORTED;
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }
}

void emit_bpf_firewall_warning(Unit *u) {
        static bool warned = false;

        if (!warned) {
                bool quiet = bpf_firewall_unsupported_reason == -EPERM && detect_container();

                log_unit_full(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason,
                              "unit configures an IP firewall, but %s.\n"
                              "(This warning is only shown for the first unit using IP firewalling.)",
                              getuid() != 0 ? "not running as root" :
                                              "the local system does not support BPF/cgroup firewalling");
                warned = true;
        }
}