/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2016 Daniel Mack
***/

#include <arpa/inet.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/libbpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "fd-util.h"
#include "ip-address-access.h"
#include "unit.h"

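/* Key indices for the two counters kept in each per-direction accounting map, which is a BPF_MAP_TYPE_ARRAY with
 * two uint64_t slots. */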
enum {
        MAP_KEY_PACKETS,
        MAP_KEY_BYTES,
};

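/* Verdict bits that the generated lookup code ORs into R8 when an address matches; they are evaluated at the end
 * of each program to decide whether the packet may pass. */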
enum {
        ACCESS_ALLOWED = 1,
        ACCESS_DENIED = 2,
};

/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }

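        /* The lookup key is assembled on the stack in the layout of 'struct bpf_lpm_trie_key': the 32-bit prefixlen
         * is written at r10 - addr_size - 4, directly followed by the address bytes loaded at r10 - addr_size. */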
        do {
                /* Load the address for this protocol and look it up in the LPM trie map */
                struct bpf_insn insn[] = {
                        /* If skb->protocol doesn't match @protocol, skip this whole block. The offset will be set later. */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                        /*
                         * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                         *
                         * R1: Pointer to the skb
                         * R2: Data offset
                         * R3: Destination buffer on the stack (r10 - addr_size)
                         * R4: Number of bytes to read (addr_size)
                         */

                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                        /*
                         * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                         * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                         * has to be set to the maximum possible value.
                         *
                         * On success, the looked up value is stored in R0. For this application, however, the
                         * actual value doesn't matter; we just OR the @verdict bit into R8 if we found any
                         * matching value.
                         */

                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),

                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;

        } while (false);

        return 0;
}

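/* Builds the eBPF program for one direction: pre_insn saves the skb, caches the protocol and clears R8, one lookup
 * block is appended per configured address map, post_insn turns R8 into a tentative verdict in R0, an optional
 * accounting block bumps the packet/byte counters, and a final instruction exits with R0. */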
static int bpf_firewall_compile_bpf(
                Unit *u,
                bool is_ingress,
                BPFProgram **ret) {

        struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
                 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * The access checkers compiled for the configured allowance and denial lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied.
         *
         * R0 = 1
         * if (R8 == ACCESS_DENIED)
         *     R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;

        assert(u);
        assert(ret);

        accounting_map_fd = is_ingress ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        access_enabled =
                u->ipv4_allow_map_fd >= 0 ||
                u->ipv6_allow_map_fd >= 0 ||
                u->ipv4_deny_map_fd >= 0 ||
                u->ipv6_deny_map_fd >= 0;

        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in @list_allow
                 * - Otherwise, access will be denied when an address matches an entry in @list_deny
                 * - Otherwise, access will be granted
                 */

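                /* In terms of R8 bits: the deny lookups set ACCESS_DENIED, the allow lookups set ACCESS_ALLOWED,
                 * and post_insn only drops the packet if R8 ends up being exactly ACCESS_DENIED, so the allow bit
                 * always wins over the deny bit. */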
                if (u->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }

        do {
                /*
                 * Exit from the eBPF program, R0 contains the verdict.
                 * 0 means the packet is denied, 1 means the packet may pass.
                 */
                struct bpf_insn insn[] = {
                        BPF_EXIT_INSN()
                };

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        } while (false);

        *ret = TAKE_PTR(p);

        return 0;
}

static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
        IPAddressAccessItem *a;

        assert(n_ipv4);
        assert(n_ipv6);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        (*n_ipv4)++;
                        break;

                case AF_INET6:
                        (*n_ipv6)++;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_add_access_items(
                IPAddressAccessItem *list,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        uint64_t value = verdict;
        IPAddressAccessItem *a;
        int r;

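        /* Stack-allocate one reusable LPM trie key per address family: a 32-bit prefixlen followed by 4 (IPv4) or
         * 16 (IPv6) bytes of address data. */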
        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;

                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;

                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd) {

        _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        Unit *p;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);

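        /* First pass: walk from the unit up through its parent slices and count the configured addresses of each
         * family, so the LPM trie maps below can be sized accordingly. */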
        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                bpf_firewall_count_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, &n_ipv4, &n_ipv6);
        }

        if (n_ipv4 > 0) {
                ipv4_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                ipv6_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

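        /* Second pass: populate the maps with the entries of the unit and all of its parent slices. */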
        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }

        *ret_ipv4_map_fd = ipv4_map_fd;
        *ret_ipv6_map_fd = ipv6_map_fd;

        ipv4_map_fd = ipv6_map_fd = -1;
        return 0;
}

static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
        int r;

        assert(u);
        assert(fd_ingress);
        assert(fd_egress);

        if (enabled) {
                if (*fd_ingress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_ingress = r;
                }

                if (*fd_egress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_egress = r;
                }

        } else {
                *fd_ingress = safe_close(*fd_ingress);
                *fd_egress = safe_close(*fd_egress);

                zero(u->ip_accounting_extra);
        }

        return 0;
}

int bpf_firewall_compile(Unit *u) {
        CGroupContext *cc;
        int r, supported;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_debug("BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
                 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
                 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
                 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
                 * all, either. */
                log_debug("BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily. */

        u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);

        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
                 * means that all configured IP access rules *will* take effect on processes, even though we never
                 * compile them for inner nodes. */

                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd);
                if (r < 0)
                        return log_error_errno(r, "Preparation of eBPF allow maps failed: %m");

                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd);
                if (r < 0)
                        return log_error_errno(r, "Preparation of eBPF deny maps failed: %m");
        }

        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_error_errno(r, "Preparation of eBPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress);
        if (r < 0)
                return log_error_errno(r, "Compilation for ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress);
        if (r < 0)
                return log_error_errno(r, "Compilation for egress BPF program failed: %m");

        return 0;
}

int bpf_firewall_install(Unit *u) {
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        int r, supported;
        uint32_t flags;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;
        if (!u->cgroup_path)
                return -EINVAL;
        if (!u->cgroup_realized)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_debug("BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                log_debug("BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Failed to determine cgroup path: %m");

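        /* Select the attach flags: use BPF_F_ALLOW_MULTI (if the kernel supports it) for slice units and for units
         * with cgroup delegation enabled, and a plain, non-multi attachment otherwise. */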
        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;

        /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
         * minimize the time window when we don't account for IP traffic. */
        u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
        u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);

        if (u->ip_bpf_egress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
                if (r < 0)
                        return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path);

                /* Remember that this BPF program is installed now. */
                u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
        }

        if (u->ip_bpf_ingress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
                if (r < 0)
                        return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path);

                u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
        }

        return 0;
}

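/* Reads the packet and byte counters from one accounting map. The packet count is staged in a local variable so
 * that *ret_packets is only written once both lookups have succeeded. */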
int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

        if (ret_packets)
                *ret_packets = packets;

        return 0;
}

int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}

int bpf_firewall_supported(void) {
        struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
        static int supported = -1;
        union bpf_attr attr;
        int fd, r;

        /* Checks whether BPF firewalling is supported. For this, we check five things:
         *
         * a) whether we are privileged
         * b) whether the unified hierarchy is being used
         * c) the BPF implementation in the kernel supports BPF LPM TRIE maps, which we require
         * d) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
         * e) the BPF implementation in the kernel supports the BPF_PROG_ATTACH call, which we require
         */

        if (supported >= 0)
                return supported;

        if (geteuid() != 0) {
                log_debug("Not enough privileges, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
        if (r == 0) {
                log_debug("Not running with unified cgroups, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        fd = bpf_map_new(BPF_MAP_TYPE_LPM_TRIE,
                         offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint64_t),
                         sizeof(uint64_t),
                         1,
                         BPF_F_NO_PREALLOC);
        if (fd < 0) {
                log_debug_errno(fd, "Can't allocate BPF LPM TRIE map, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        safe_close(fd);

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
        if (r < 0) {
                log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
         * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
         * program if we can't do a thing with it later?
         *
         * We detect this case by issuing the BPF_PROG_ATTACH bpf() call with invalid file descriptors: if
         * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
         * parameters are validated however, and that'll fail with EBADF then. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
        };

        if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
                if (errno != EBADF) {
                        log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_ATTACH, BPF firewalling is not supported: %m");
                        return supported = BPF_FIREWALL_UNSUPPORTED;
                }

                /* YAY! */
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
         * (which was added in kernel 4.15). We use a similar check to the one above, but this time with
         * BPF_F_ALLOW_MULTI set. Since the flags are checked early in the system call we'll get EINVAL if it's not
         * supported, and EBADF as before if it is available. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
                .attach_flags = BPF_F_ALLOW_MULTI,
        };

        if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
                if (errno == EBADF) {
                        log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
                        return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
                }

                if (errno == EINVAL)
                        log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
                else
                        log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");

                return supported = BPF_FIREWALL_SUPPORTED;
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }
}