/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2016 Daniel Mack

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <arpa/inet.h>
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/libbpf.h>
#include <net/ethernet.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "fd-util.h"
#include "ip-address-access.h"
#include "unit.h"

enum {
        MAP_KEY_PACKETS,
        MAP_KEY_BYTES,
};

enum {
        ACCESS_ALLOWED = 1,
        ACCESS_DENIED = 2,
};

/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }

        do {
                /* Compare the address: one 32-bit word for IPv4, four words for IPv6 */
                struct bpf_insn insn[] = {
                        /* If skb->protocol doesn't match @protocol, skip this whole block. The offset will be set later. */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                        /*
                         * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                         *
                         * R1: Pointer to the skb
                         * R2: Data offset
                         * R3: Destination buffer on the stack (r10 - addr_size)
                         * R4: Number of bytes to read (addr_size)
                         */

                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                        /*
                         * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                         * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                         * has to be set to the maximum possible value.
                         *
                         * On success, the looked up value is stored in R0. For this application, the actual
                         * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
                         * matching value.
                         */

                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),

                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;

        } while (false);

        return 0;
}
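
/* For illustration only: in the IPv4/ingress case, the block generated above corresponds
 * roughly to the following pseudo assembly, with the LPM lookup key assembled on the stack
 * as { u32 prefixlen; u8 data[4]; } at fp - 8:
 *
 *         if (r7 != htobe16(ETH_P_IP)) goto out;      // not our protocol, skip block
 *         r1 = r6;                                    // r6 caches the skb pointer
 *         r2 = offsetof(struct iphdr, saddr);
 *         r3 = fp - 4;                                // 4-byte address buffer
 *         r4 = 4;
 *         call skb_load_bytes;                        // *(fp - 4) = saddr
 *         r1 = map_fd;
 *         r2 = fp - 8;                                // key = { prefixlen, address }
 *         *(u32 *)(fp - 8) = 32;                      // prefixlen = addr_size * 8
 *         call map_lookup_elem;
 *         if (r0 == 0) goto out;                      // no match in the trie
 *         r8 |= verdict;                              // record the verdict bit
 * out:
 */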

static int bpf_firewall_compile_bpf(
                Unit *u,
                bool is_ingress,
                BPFProgram **ret) {

        struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
                 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * The access checkers compiled for the configured allow and deny lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied.
         *
         * R0 = 1
         * if (R8 == ACCESS_DENIED)
         *     R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;

        assert(u);
        assert(ret);

        accounting_map_fd = is_ingress ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        access_enabled =
                u->ipv4_allow_map_fd >= 0 ||
                u->ipv6_allow_map_fd >= 0 ||
                u->ipv4_deny_map_fd >= 0 ||
                u->ipv6_deny_map_fd >= 0;

        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in @list_allow
                 * - Otherwise, access will be denied when an address matches an entry in @list_deny
                 * - Otherwise, access will be granted
                 */

                if (u->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }

        do {
                /*
                 * Exit from the eBPF program, R0 contains the verdict.
                 * 0 means the packet is denied, 1 means the packet may pass.
                 */
                struct bpf_insn insn[] = {
                        BPF_EXIT_INSN()
                };

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        } while (false);

        *ret = p;
        p = NULL;

        return 0;
}
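
/* A sketch of the overall program layout bpf_firewall_compile_bpf() emits, each piece only
 * when applicable:
 *
 *         pre_insn:       r6 = skb, r7 = skb->protocol, r8 = 0
 *         deny lookups:   IPv4/IPv6 deny maps may OR ACCESS_DENIED into r8
 *         allow lookups:  IPv4/IPv6 allow maps may OR ACCESS_ALLOWED into r8
 *         post_insn:      r0 = (r8 == ACCESS_DENIED) ? 0 : 1
 *         accounting:     if (r0 != 0) bump the packet and byte counters
 *         exit:           return r0 (0 = drop, 1 = pass)
 */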

static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
        IPAddressAccessItem *a;

        assert(n_ipv4);
        assert(n_ipv6);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        (*n_ipv4)++;
                        break;

                case AF_INET6:
                        (*n_ipv6)++;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_add_access_items(
                IPAddressAccessItem *list,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        uint64_t value = verdict;
        IPAddressAccessItem *a;
        int r;

        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;

                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;

                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}
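
/* Worked example (hypothetical addresses): a deny-list entry for 192.168.0.0/16 ends up in
 * the IPv4 LPM trie as
 *
 *         key_ipv4->prefixlen = 16;
 *         key_ipv4->data[0] = 192;
 *         key_ipv4->data[1] = 168;    // bytes beyond the prefix are irrelevant
 *
 * with ACCESS_DENIED as the value. If the same address also matches an entry in the allow
 * map, both bits end up in R8 and the packet passes, implementing the "allow beats deny"
 * rule documented in bpf_firewall_compile_bpf(). */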

static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd) {

        _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        Unit *p;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_count_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, &n_ipv4, &n_ipv6);
                if (r < 0)
                        return r;
        }

        if (n_ipv4 > 0) {
                ipv4_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                ipv6_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }

        *ret_ipv4_map_fd = ipv4_map_fd;
        *ret_ipv6_map_fd = ipv6_map_fd;

        ipv4_map_fd = ipv6_map_fd = -1;
        return 0;
}
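
/* The walk up the slice chain above is what makes parent settings apply to leaf units.
 * For example (hypothetical unit files):
 *
 *         # -.slice:     IPAddressDeny=any
 *         # foo.service: IPAddressAllow=10.0.0.0/8
 *
 * compiles foo.service's maps with both entries, so its processes may talk to 10.0.0.0/8
 * and nothing else. */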

static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
        int r;

        assert(u);
        assert(fd_ingress);
        assert(fd_egress);

        if (enabled) {
                if (*fd_ingress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_ingress = r;
                }

                if (*fd_egress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_egress = r;
                }

        } else {
                *fd_ingress = safe_close(*fd_ingress);
                *fd_egress = safe_close(*fd_egress);

                zero(u->ip_accounting_extra);
        }

        return 0;
}
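
/* The accounting maps created above are plain arrays with two uint64_t slots, conceptually:
 *
 *         uint64_t counters[2];
 *         counters[MAP_KEY_PACKETS];  // incremented by 1 for each packet
 *         counters[MAP_KEY_BYTES];    // incremented by skb->len for each packet
 *
 * The compiled program bumps them with atomic BPF_XADD instructions, hence no locking is
 * needed between the per-packet updates in the kernel and the readers below. */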

int bpf_firewall_compile(Unit *u) {
        CGroupContext *cc;
        int r, supported;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_debug("BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
                 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
                 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
                 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
                 * all, either. */
                log_debug("BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily. */

        u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);

        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        if (u->type != UNIT_SLICE) {
                /* In inner nodes we only do accounting; we do not actually bother with access control. However, leaf
                 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
                 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes, this
                 * means that all configured IP access rules *will* take effect on processes, even though we never
                 * compile them for inner nodes. */

                r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd);
                if (r < 0)
                        return log_error_errno(r, "Preparation of eBPF allow maps failed: %m");

                r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd);
                if (r < 0)
                        return log_error_errno(r, "Preparation of eBPF deny maps failed: %m");
        }

        r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_error_errno(r, "Preparation of eBPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress);
        if (r < 0)
                return log_error_errno(r, "Compilation for ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress);
        if (r < 0)
                return log_error_errno(r, "Compilation for egress BPF program failed: %m");

        return 0;
}

int bpf_firewall_install(Unit *u) {
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        int r, supported;
        uint32_t flags;

        assert(u);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;
        if (!u->cgroup_path)
                return -EINVAL;
        if (!u->cgroup_realized)
                return -EINVAL;

        supported = bpf_firewall_supported();
        if (supported < 0)
                return supported;
        if (supported == BPF_FIREWALL_UNSUPPORTED) {
                log_debug("BPF firewalling not supported on this manager, proceeding without.");
                return -EOPNOTSUPP;
        }
        if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE) {
                log_debug("BPF_F_ALLOW_MULTI is not supported on this manager, not doing BPF firewall on slice units.");
                return -EOPNOTSUPP;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Failed to determine cgroup path: %m");

        flags = (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
                 (u->type == UNIT_SLICE || unit_cgroup_delegate(u))) ? BPF_F_ALLOW_MULTI : 0;

        /* Unref the old BPF program (which will implicitly detach it) right before attaching the new program, to
         * minimize the time window when we don't account for IP traffic. */
        u->ip_bpf_egress_installed = bpf_program_unref(u->ip_bpf_egress_installed);
        u->ip_bpf_ingress_installed = bpf_program_unref(u->ip_bpf_ingress_installed);

        if (u->ip_bpf_egress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
                if (r < 0)
                        return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path);

                /* Remember that this BPF program is installed now. */
                u->ip_bpf_egress_installed = bpf_program_ref(u->ip_bpf_egress);
        }

        if (u->ip_bpf_ingress) {
                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
                if (r < 0)
                        return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path);

                u->ip_bpf_ingress_installed = bpf_program_ref(u->ip_bpf_ingress);
        }

        return 0;
}
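
/* Under the hood, bpf_program_cgroup_attach() (see src/core/bpf-program.c) boils down to
 * something like the following simplified sketch, where cgroup_fd and prog_fd are the
 * opened cgroup directory and the loaded program's fd:
 *
 *         attr = (union bpf_attr) {
 *                 .attach_type = BPF_CGROUP_INET_EGRESS,  // or BPF_CGROUP_INET_INGRESS
 *                 .target_fd = cgroup_fd,
 *                 .attach_bpf_fd = prog_fd,
 *                 .attach_flags = flags,                  // possibly BPF_F_ALLOW_MULTI
 *         };
 *         bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
 */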

int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

        if (ret_packets)
                *ret_packets = packets;

        return 0;
}
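
/* Usage sketch (hypothetical caller): querying the ingress counters of a unit would look
 * something like this:
 *
 *         uint64_t bytes, packets;
 *
 *         r = bpf_firewall_read_accounting(u->ip_accounting_ingress_map_fd, &bytes, &packets);
 *         if (r >= 0)
 *                 log_unit_debug(u, "Ingress so far: %" PRIu64 " packets, %" PRIu64 " bytes", packets, bytes);
 */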

int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}

int bpf_firewall_supported(void) {
        struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
        static int supported = -1;
        union bpf_attr attr;
        int fd, r;

        /* Checks whether BPF firewalling is supported. For this, we check five things:
         *
         * a) whether we are privileged
         * b) whether the unified hierarchy is being used
         * c) the BPF implementation in the kernel supports BPF LPM TRIE maps, which we require
         * d) the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
         * e) the BPF implementation in the kernel supports the BPF_PROG_ATTACH call, which we require
         */

        if (supported >= 0)
                return supported;

        if (geteuid() != 0) {
                log_debug("Not enough privileges, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
        if (r == 0) {
                log_debug("Not running with unified cgroups, BPF firewalling is not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        fd = bpf_map_new(BPF_MAP_TYPE_LPM_TRIE,
                         offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint64_t),
                         sizeof(uint64_t),
                         1,
                         BPF_F_NO_PREALLOC);
        if (fd < 0) {
                log_debug_errno(fd, "Can't allocate BPF LPM TRIE map, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        safe_close(fd);

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
        if (r < 0) {
                log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
         * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
         * program if we can't do a thing with it later?
         *
         * We detect this case by issuing the BPF_PROG_ATTACH bpf() call with invalid file descriptors: if
         * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
         * parameters are validated however, and that'll fail with EBADF then. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
        };

        r = bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
        if (r < 0) {
                if (errno != EBADF) {
                        log_debug_errno(errno, "Didn't get EBADF from BPF_PROG_ATTACH, BPF firewalling is not supported: %m");
                        return supported = BPF_FIREWALL_UNSUPPORTED;
                }

                /* YAY! */
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }

        /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
         * (which was added in kernel 4.15). We use similar logic as before, but this time we use
         * BPF_F_ALLOW_MULTI. Since the flags are checked early in the system call we'll get EINVAL if it's not
         * supported, and EBADF as before if it is available. */

        attr = (union bpf_attr) {
                .attach_type = BPF_CGROUP_INET_EGRESS,
                .target_fd = -1,
                .attach_bpf_fd = -1,
                .attach_flags = BPF_F_ALLOW_MULTI,
        };

        r = bpf(BPF_PROG_ATTACH, &attr, sizeof(attr));
        if (r < 0) {
                if (errno == EBADF) {
                        log_debug_errno(errno, "Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
                        return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
                }

                if (errno == EINVAL)
                        log_debug_errno(errno, "Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
                else
                        log_debug_errno(errno, "Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");

                return supported = BPF_FIREWALL_SUPPORTED;
        } else {
                log_debug("Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? Something is weird, assuming BPF firewalling is broken and hence not supported.");
                return supported = BPF_FIREWALL_UNSUPPORTED;
        }
}
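
/* Summary of the probe logic above (errno-based feature detection with invalid fds):
 *
 *         BPF_PROG_ATTACH, no flags            → EINVAL: CONFIG_CGROUP_BPF is off → UNSUPPORTED
 *                                              → EBADF:  cgroup BPF is available, probe further
 *         BPF_PROG_ATTACH, BPF_F_ALLOW_MULTI   → EINVAL: flag unknown             → SUPPORTED
 *                                              → EBADF:  flag known               → SUPPORTED_WITH_MULTI
 */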