]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/core/bpf-firewall.c
bpf-firewall: give a name to maps used
[thirdparty/systemd.git] / src / core / bpf-firewall.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
1988a9d1
DM
2
3#include <arpa/inet.h>
4#include <assert.h>
5#include <errno.h>
6#include <fcntl.h>
01234e1f 7#include <linux/bpf_insn.h>
1988a9d1
DM
8#include <net/ethernet.h>
9#include <net/if.h>
10#include <netinet/ip.h>
11#include <netinet/ip6.h>
12#include <stddef.h>
13#include <stdio.h>
14#include <stdlib.h>
1988a9d1
DM
15#include <unistd.h>
16
17#include "alloc-util.h"
18#include "bpf-firewall.h"
19#include "bpf-program.h"
20#include "fd-util.h"
84ebe6f0 21#include "in-addr-prefix-util.h"
0a970718 22#include "memory-util.h"
e93672ee 23#include "missing_syscall.h"
1988a9d1 24#include "unit.h"
5cfa33e0 25#include "strv.h"
f140ed02 26#include "virt.h"
1988a9d1
DM
27
/* Keys of the two slots in each accounting array map. */
enum {
        MAP_KEY_PACKETS = 0,
        MAP_KEY_BYTES   = 1,
};

/* Verdict bits that the generated BPF program accumulates in R8. */
enum {
        ACCESS_ALLOWED = 1,
        ACCESS_DENIED  = 2,
};
37
38/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */
39
40static int add_lookup_instructions(
41 BPFProgram *p,
42 int map_fd,
43 int protocol,
44 bool is_ingress,
45 int verdict) {
46
47 int r, addr_offset, addr_size;
48
49 assert(p);
50 assert(map_fd >= 0);
51
52 switch (protocol) {
53
54 case ETH_P_IP:
55 addr_size = sizeof(uint32_t);
56 addr_offset = is_ingress ?
57 offsetof(struct iphdr, saddr) :
58 offsetof(struct iphdr, daddr);
59 break;
60
61 case ETH_P_IPV6:
62 addr_size = 4 * sizeof(uint32_t);
63 addr_offset = is_ingress ?
64 offsetof(struct ip6_hdr, ip6_src.s6_addr) :
65 offsetof(struct ip6_hdr, ip6_dst.s6_addr);
66 break;
67
68 default:
69 return -EAFNOSUPPORT;
70 }
71
72 do {
73 /* Compare IPv4 with one word instruction (32bit) */
74 struct bpf_insn insn[] = {
75 /* If skb->protocol != ETH_P_IP, skip this whole block. The offset will be set later. */
76 BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),
77
78 /*
79 * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
80 *
81 * R1: Pointer to the skb
82 * R2: Data offset
83 * R3: Destination buffer on the stack (r10 - 4)
84 * R4: Number of bytes to read (4)
85 */
86
87 BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
88 BPF_MOV32_IMM(BPF_REG_2, addr_offset),
89
90 BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
91 BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),
92
93 BPF_MOV32_IMM(BPF_REG_4, addr_size),
94 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),
95
96 /*
97 * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
98 * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
99 * has to be set to the maximum possible value.
100 *
101 * On success, the looked up value is stored in R0. For this application, the actual
102 * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
103 * matching value.
104 */
105
106 BPF_LD_MAP_FD(BPF_REG_1, map_fd),
107 BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
108 BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
109 BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),
110
111 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
112 BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
113 BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
114 };
115
116 /* Jump label fixup */
117 insn[0].off = ELEMENTSOF(insn) - 1;
118
119 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
120 if (r < 0)
121 return r;
122
123 } while (false);
124
125 return 0;
126}
127
4c1567f2
AZ
128static int add_instructions_for_ip_any(
129 BPFProgram *p,
130 int verdict) {
131 int r;
132
133 assert(p);
134
2899aac4 135 const struct bpf_insn insn[] = {
4c1567f2
AZ
136 BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
137 };
138
139 r = bpf_program_add_instructions(p, insn, 1);
140 if (r < 0)
141 return r;
142
143 return 0;
144}
145
1988a9d1
DM
146static int bpf_firewall_compile_bpf(
147 Unit *u,
e0c694c7 148 const char *prog_name,
1988a9d1 149 bool is_ingress,
4c1567f2
AZ
150 BPFProgram **ret,
151 bool ip_allow_any,
152 bool ip_deny_any) {
1988a9d1 153
2899aac4 154 const struct bpf_insn pre_insn[] = {
1988a9d1
DM
155 /*
156 * When the eBPF program is entered, R1 contains the address of the skb.
157 * However, R1-R5 are scratch registers that are not preserved when calling
158 * into kernel functions, so we need to save anything that's supposed to
159 * stay around to R6-R9. Save the skb to R6.
160 */
161 BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),
162
163 /*
164 * Although we cannot access the skb data directly from eBPF programs used in this
165 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
166 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
167 * for later use.
168 */
169 BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),
170
171 /*
172 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
173 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
174 */
175 BPF_MOV32_IMM(BPF_REG_8, 0),
176 };
177
178 /*
179 * The access checkers compiled for the configured allowance and denial lists
180 * write to R8 at runtime. The following code prepares for an early exit that
181 * skip the accounting if the packet is denied.
182 *
183 * R0 = 1
184 * if (R8 == ACCESS_DENIED)
185 * R0 = 0
186 *
187 * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
188 * is allowed to pass.
189 */
2899aac4 190 const struct bpf_insn post_insn[] = {
1988a9d1
DM
191 BPF_MOV64_IMM(BPF_REG_0, 1),
192 BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
193 BPF_MOV64_IMM(BPF_REG_0, 0),
194 };
195
76dc1725 196 _cleanup_(bpf_program_freep) BPFProgram *p = NULL;
1988a9d1
DM
197 int accounting_map_fd, r;
198 bool access_enabled;
199
200 assert(u);
201 assert(ret);
202
203 accounting_map_fd = is_ingress ?
204 u->ip_accounting_ingress_map_fd :
205 u->ip_accounting_egress_map_fd;
206
207 access_enabled =
208 u->ipv4_allow_map_fd >= 0 ||
209 u->ipv6_allow_map_fd >= 0 ||
210 u->ipv4_deny_map_fd >= 0 ||
4c1567f2
AZ
211 u->ipv6_deny_map_fd >= 0 ||
212 ip_allow_any ||
213 ip_deny_any;
1988a9d1
DM
214
215 if (accounting_map_fd < 0 && !access_enabled) {
216 *ret = NULL;
217 return 0;
218 }
219
8fe9dbb9 220 r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, prog_name, &p);
1988a9d1
DM
221 if (r < 0)
222 return r;
223
224 r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
225 if (r < 0)
226 return r;
227
228 if (access_enabled) {
229 /*
230 * The simple rule this function translates into eBPF instructions is:
231 *
232 * - Access will be granted when an address matches an entry in @list_allow
233 * - Otherwise, access will be denied when an address matches an entry in @list_deny
234 * - Otherwise, access will be granted
235 */
236
237 if (u->ipv4_deny_map_fd >= 0) {
238 r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
239 if (r < 0)
240 return r;
241 }
242
243 if (u->ipv6_deny_map_fd >= 0) {
244 r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
245 if (r < 0)
246 return r;
247 }
248
249 if (u->ipv4_allow_map_fd >= 0) {
250 r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
251 if (r < 0)
252 return r;
253 }
254
255 if (u->ipv6_allow_map_fd >= 0) {
256 r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
257 if (r < 0)
258 return r;
259 }
4c1567f2
AZ
260
261 if (ip_allow_any) {
262 r = add_instructions_for_ip_any(p, ACCESS_ALLOWED);
263 if (r < 0)
264 return r;
265 }
266
267 if (ip_deny_any) {
268 r = add_instructions_for_ip_any(p, ACCESS_DENIED);
269 if (r < 0)
270 return r;
271 }
1988a9d1
DM
272 }
273
274 r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
275 if (r < 0)
276 return r;
277
278 if (accounting_map_fd >= 0) {
279 struct bpf_insn insn[] = {
280 /*
281 * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
282 * The jump label will be fixed up later.
283 */
284 BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),
285
286 /* Count packets */
287 BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
288 BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
289 BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
290 BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
291 BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
292 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
293 BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
294 BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
295 BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
296
297 /* Count bytes */
298 BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
299 BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
300 BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
301 BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
302 BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
303 BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
304 BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
305 BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
306 BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */
307
308 /* Allow the packet to pass */
309 BPF_MOV64_IMM(BPF_REG_0, 1),
310 };
311
312 /* Jump label fixup */
313 insn[0].off = ELEMENTSOF(insn) - 1;
314
315 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
316 if (r < 0)
317 return r;
318 }
319
320 do {
321 /*
322 * Exit from the eBPF program, R0 contains the verdict.
323 * 0 means the packet is denied, 1 means the packet may pass.
324 */
2899aac4 325 const struct bpf_insn insn[] = {
1988a9d1
DM
326 BPF_EXIT_INSN()
327 };
328
329 r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
330 if (r < 0)
331 return r;
332 } while (false);
333
1cc6c93a 334 *ret = TAKE_PTR(p);
1988a9d1
DM
335
336 return 0;
337}
338
84ebe6f0
YW
339static int bpf_firewall_count_access_items(Set *prefixes, size_t *n_ipv4, size_t *n_ipv6) {
340 struct in_addr_prefix *a;
1988a9d1
DM
341
342 assert(n_ipv4);
343 assert(n_ipv6);
344
84ebe6f0 345 SET_FOREACH(a, prefixes)
1988a9d1
DM
346 switch (a->family) {
347
348 case AF_INET:
349 (*n_ipv4)++;
350 break;
351
352 case AF_INET6:
353 (*n_ipv6)++;
354 break;
355
356 default:
357 return -EAFNOSUPPORT;
358 }
1988a9d1
DM
359
360 return 0;
361}
362
363static int bpf_firewall_add_access_items(
84ebe6f0 364 Set *prefixes,
1988a9d1
DM
365 int ipv4_map_fd,
366 int ipv6_map_fd,
367 int verdict) {
368
369 struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
84ebe6f0 370 struct in_addr_prefix *a;
1988a9d1 371 uint64_t value = verdict;
1988a9d1
DM
372 int r;
373
374 key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
375 key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);
376
84ebe6f0 377 SET_FOREACH(a, prefixes)
1988a9d1
DM
378 switch (a->family) {
379
380 case AF_INET:
381 key_ipv4->prefixlen = a->prefixlen;
382 memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));
383
384 r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
385 if (r < 0)
386 return r;
387
388 break;
389
390 case AF_INET6:
391 key_ipv6->prefixlen = a->prefixlen;
392 memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));
393
394 r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
395 if (r < 0)
396 return r;
397
398 break;
399
400 default:
401 return -EAFNOSUPPORT;
402 }
1988a9d1
DM
403
404 return 0;
405}
406
407static int bpf_firewall_prepare_access_maps(
408 Unit *u,
409 int verdict,
410 int *ret_ipv4_map_fd,
4c1567f2
AZ
411 int *ret_ipv6_map_fd,
412 bool *ret_has_any) {
1988a9d1 413
254d1313 414 _cleanup_close_ int ipv4_map_fd = -EBADF, ipv6_map_fd = -EBADF;
1988a9d1
DM
415 size_t n_ipv4 = 0, n_ipv6 = 0;
416 Unit *p;
417 int r;
418
419 assert(ret_ipv4_map_fd);
420 assert(ret_ipv6_map_fd);
4c1567f2 421 assert(ret_has_any);
1988a9d1 422
12f64221 423 for (p = u; p; p = UNIT_GET_SLICE(p)) {
1988a9d1 424 CGroupContext *cc;
84ebe6f0
YW
425 Set *prefixes;
426 bool *reduced;
1988a9d1
DM
427
428 cc = unit_get_cgroup_context(p);
429 if (!cc)
430 continue;
431
84ebe6f0
YW
432 prefixes = verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny;
433 reduced = verdict == ACCESS_ALLOWED ? &cc->ip_address_allow_reduced : &cc->ip_address_deny_reduced;
434
435 if (!*reduced) {
436 r = in_addr_prefixes_reduce(prefixes);
437 if (r < 0)
438 return r;
439
440 *reduced = true;
441 }
4c1567f2 442
84ebe6f0 443 bpf_firewall_count_access_items(prefixes, &n_ipv4, &n_ipv6);
4c1567f2
AZ
444
445 /* Skip making the LPM trie map in cases where we are using "any" in order to hack around
446 * needing CAP_SYS_ADMIN for allocating LPM trie map. */
84ebe6f0 447 if (in_addr_prefixes_is_any(prefixes)) {
4c1567f2
AZ
448 *ret_has_any = true;
449 return 0;
450 }
1988a9d1
DM
451 }
452
453 if (n_ipv4 > 0) {
25d9c6cd 454 char *name = strjoina("4_", u->id);
1988a9d1 455 ipv4_map_fd = bpf_map_new(
25d9c6cd 456 name,
1988a9d1
DM
457 BPF_MAP_TYPE_LPM_TRIE,
458 offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
459 sizeof(uint64_t),
460 n_ipv4,
461 BPF_F_NO_PREALLOC);
462 if (ipv4_map_fd < 0)
463 return ipv4_map_fd;
464 }
465
466 if (n_ipv6 > 0) {
25d9c6cd 467 char *name = strjoina("6_", u->id);
1988a9d1 468 ipv6_map_fd = bpf_map_new(
25d9c6cd 469 name,
1988a9d1
DM
470 BPF_MAP_TYPE_LPM_TRIE,
471 offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
472 sizeof(uint64_t),
473 n_ipv6,
474 BPF_F_NO_PREALLOC);
475 if (ipv6_map_fd < 0)
476 return ipv6_map_fd;
477 }
478
12f64221 479 for (p = u; p; p = UNIT_GET_SLICE(p)) {
1988a9d1
DM
480 CGroupContext *cc;
481
482 cc = unit_get_cgroup_context(p);
483 if (!cc)
484 continue;
485
486 r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
487 ipv4_map_fd, ipv6_map_fd, verdict);
488 if (r < 0)
489 return r;
490 }
491
1e59b545
LP
492 *ret_ipv4_map_fd = TAKE_FD(ipv4_map_fd);
493 *ret_ipv6_map_fd = TAKE_FD(ipv6_map_fd);
4c1567f2 494 *ret_has_any = false;
1988a9d1
DM
495 return 0;
496}
497
51283461 498static int bpf_firewall_prepare_accounting_maps(Unit *u, bool enabled, int *fd_ingress, int *fd_egress) {
1988a9d1
DM
499 int r;
500
51283461 501 assert(u);
1988a9d1
DM
502 assert(fd_ingress);
503 assert(fd_egress);
504
505 if (enabled) {
506 if (*fd_ingress < 0) {
25d9c6cd
DM
507 char *name = strjoina("I_", u->id);
508 r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
1988a9d1
DM
509 if (r < 0)
510 return r;
511
512 *fd_ingress = r;
513 }
514
515 if (*fd_egress < 0) {
25d9c6cd
DM
516 char *name = strjoina("E_", u->id);
517 r = bpf_map_new(name, BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
1988a9d1
DM
518 if (r < 0)
519 return r;
520
521 *fd_egress = r;
522 }
51283461 523
1988a9d1
DM
524 } else {
525 *fd_ingress = safe_close(*fd_ingress);
526 *fd_egress = safe_close(*fd_egress);
51283461
LP
527
528 zero(u->ip_accounting_extra);
1988a9d1
DM
529 }
530
531 return 0;
532}
533
534int bpf_firewall_compile(Unit *u) {
e0c694c7
JK
535 const char *ingress_name = NULL, *egress_name = NULL;
536 bool ip_allow_any = false, ip_deny_any = false;
1988a9d1 537 CGroupContext *cc;
acf7f253 538 int r, supported;
1988a9d1
DM
539
540 assert(u);
541
51283461
LP
542 cc = unit_get_cgroup_context(u);
543 if (!cc)
544 return -EINVAL;
545
acf7f253
LP
546 supported = bpf_firewall_supported();
547 if (supported < 0)
548 return supported;
84d2744b
ZJS
549 if (supported == BPF_FIREWALL_UNSUPPORTED)
550 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
b1acbc08 551 "bpf-firewall: BPF firewalling not supported, proceeding without.");
84d2744b 552 if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
acf7f253
LP
553 /* If BPF_F_ALLOW_MULTI is not supported we don't support any BPF magic on inner nodes (i.e. on slice
554 * units), since that would mean leaf nodes couldn't do any BPF anymore at all. Under the assumption
555 * that BPF is more interesting on leaf nodes we hence avoid it on inner nodes in that case. This is
556 * consistent with old systemd behaviour from before v238, where BPF wasn't supported in inner nodes at
557 * all, either. */
84d2744b 558 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
b1acbc08 559 "bpf-firewall: BPF_F_ALLOW_MULTI is not supported, not doing BPF firewall on slice units.");
1988a9d1 560
e0c694c7
JK
561 /* If BPF_F_ALLOW_MULTI flag is supported program name is also supported (both were added to v4.15
562 * kernel). */
563 if (supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI) {
564 ingress_name = "sd_fw_ingress";
565 egress_name = "sd_fw_egress";
566 }
567
1988a9d1 568 /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
37b22b3b 569 * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
1988a9d1
DM
570 * configuration, but we don't flush out the accounting unnecessarily */
571
76dc1725 572 u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress);
573 u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);
1988a9d1
DM
574
575 u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
576 u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
577
578 u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
579 u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
580
acf7f253
LP
581 if (u->type != UNIT_SLICE) {
582 /* In inner nodes we only do accounting, we do not actually bother with access control. However, leaf
583 * nodes will incorporate all IP access rules set on all their parent nodes. This has the benefit that
584 * they can optionally cancel out system-wide rules. Since inner nodes can't contain processes this
585 * means that all configure IP access rules *will* take effect on processes, even though we never
586 * compile them for inner nodes. */
1988a9d1 587
4c1567f2 588 r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd, &ip_allow_any);
acf7f253 589 if (r < 0)
b1acbc08 590 return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF allow maps failed: %m");
acf7f253 591
4c1567f2 592 r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd, &ip_deny_any);
acf7f253 593 if (r < 0)
b1acbc08 594 return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF deny maps failed: %m");
acf7f253 595 }
1988a9d1 596
51283461 597 r = bpf_firewall_prepare_accounting_maps(u, cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
1988a9d1 598 if (r < 0)
b1acbc08 599 return log_unit_error_errno(u, r, "bpf-firewall: Preparation of BPF accounting maps failed: %m");
1988a9d1 600
e0c694c7 601 r = bpf_firewall_compile_bpf(u, ingress_name, true, &u->ip_bpf_ingress, ip_allow_any, ip_deny_any);
1988a9d1 602 if (r < 0)
b1acbc08 603 return log_unit_error_errno(u, r, "bpf-firewall: Compilation of ingress BPF program failed: %m");
1988a9d1 604
e0c694c7 605 r = bpf_firewall_compile_bpf(u, egress_name, false, &u->ip_bpf_egress, ip_allow_any, ip_deny_any);
1988a9d1 606 if (r < 0)
b1acbc08 607 return log_unit_error_errno(u, r, "bpf-firewall: Compilation of egress BPF program failed: %m");
1988a9d1
DM
608
609 return 0;
610}
611
fab34748 612static int load_bpf_progs_from_fs_to_set(Unit *u, char **filter_paths, Set **set) {
fab34748
KL
613 set_clear(*set);
614
615 STRV_FOREACH(bpf_fs_path, filter_paths) {
76dc1725 616 _cleanup_(bpf_program_freep) BPFProgram *prog = NULL;
fab34748
KL
617 int r;
618
8fe9dbb9 619 r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &prog);
fab34748 620 if (r < 0)
b1acbc08 621 return log_unit_error_errno(u, r, "bpf-firewall: Allocation of SKB BPF program failed: %m");
fab34748
KL
622
623 r = bpf_program_load_from_bpf_fs(prog, *bpf_fs_path);
624 if (r < 0)
b1acbc08 625 return log_unit_error_errno(u, r, "bpf-firewall: Loading of ingress BPF program %s failed: %m", *bpf_fs_path);
fab34748 626
7a7cf83d 627 r = set_ensure_consume(set, &bpf_program_hash_ops, TAKE_PTR(prog));
fab34748 628 if (r < 0)
b1acbc08 629 return log_oom();
fab34748
KL
630 }
631
632 return 0;
633}
634
635int bpf_firewall_load_custom(Unit *u) {
636 CGroupContext *cc;
637 int r, supported;
638
639 assert(u);
640
641 cc = unit_get_cgroup_context(u);
642 if (!cc)
643 return 0;
644
645 if (!(cc->ip_filters_ingress || cc->ip_filters_egress))
646 return 0;
647
648 supported = bpf_firewall_supported();
649 if (supported < 0)
650 return supported;
651
652 if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI)
b1acbc08
ZJS
653 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
654 "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");
fab34748
KL
655
656 r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_ingress, &u->ip_bpf_custom_ingress);
657 if (r < 0)
658 return r;
659 r = load_bpf_progs_from_fs_to_set(u, cc->ip_filters_egress, &u->ip_bpf_custom_egress);
660 if (r < 0)
661 return r;
662
663 return 0;
664}
665
666static int attach_custom_bpf_progs(Unit *u, const char *path, int attach_type, Set **set, Set **set_installed) {
667 BPFProgram *prog;
fab34748
KL
668 int r;
669
670 assert(u);
671
672 set_clear(*set_installed);
f25e10b1
YW
673 r = set_ensure_allocated(set_installed, &bpf_program_hash_ops);
674 if (r < 0)
675 return log_oom();
fab34748 676
76dc1725 677 SET_FOREACH_MOVE(prog, *set_installed, *set) {
fab34748
KL
678 r = bpf_program_cgroup_attach(prog, attach_type, path, BPF_F_ALLOW_MULTI);
679 if (r < 0)
b1acbc08 680 return log_unit_error_errno(u, r, "bpf-firewall: Attaching custom egress BPF program to cgroup %s failed: %m", path);
fab34748 681 }
fab34748
KL
682 return 0;
683}
684
1988a9d1 685int bpf_firewall_install(Unit *u) {
76dc1725 686 _cleanup_(bpf_program_freep) BPFProgram *ip_bpf_ingress_uninstall = NULL, *ip_bpf_egress_uninstall = NULL;
1988a9d1 687 _cleanup_free_ char *path = NULL;
9f2e6892 688 CGroupContext *cc;
acf7f253 689 int r, supported;
aa2b6f1d 690 uint32_t flags;
1988a9d1
DM
691
692 assert(u);
693
9f2e6892
LP
694 cc = unit_get_cgroup_context(u);
695 if (!cc)
696 return -EINVAL;
aa2b6f1d
LP
697 if (!u->cgroup_path)
698 return -EINVAL;
699 if (!u->cgroup_realized)
700 return -EINVAL;
9f2e6892 701
acf7f253
LP
702 supported = bpf_firewall_supported();
703 if (supported < 0)
704 return supported;
d85ff944 705 if (supported == BPF_FIREWALL_UNSUPPORTED)
b1acbc08
ZJS
706 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
707 "bpf-firewall: BPF firewalling not supported, proceeding without.");
d85ff944 708 if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI && u->type == UNIT_SLICE)
b1acbc08
ZJS
709 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
710 "bpf-firewall: BPF_F_ALLOW_MULTI not supported, not doing BPF firewall on slice units.");
fab34748
KL
711 if (supported != BPF_FIREWALL_SUPPORTED_WITH_MULTI &&
712 (!set_isempty(u->ip_bpf_custom_ingress) || !set_isempty(u->ip_bpf_custom_egress)))
b1acbc08
ZJS
713 return log_unit_debug_errno(u, SYNTHETIC_ERRNO(EOPNOTSUPP),
714 "bpf-firewall: BPF_F_ALLOW_MULTI not supported, cannot attach custom BPF programs.");
1988a9d1
DM
715
716 r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
717 if (r < 0)
b1acbc08 718 return log_unit_error_errno(u, r, "bpf-firewall: Failed to determine cgroup path: %m");
1988a9d1 719
a442ccb4 720 flags = supported == BPF_FIREWALL_SUPPORTED_WITH_MULTI ? BPF_F_ALLOW_MULTI : 0;
acf7f253 721
dbef3d16
LP
722 if (FLAGS_SET(flags, BPF_F_ALLOW_MULTI)) {
723 /* If we have BPF_F_ALLOW_MULTI, then let's clear the fields, but destroy the programs only
724 * after attaching the new programs, so that there's no time window where neither program is
725 * attached. (There will be a program where both are attached, but that's OK, since this is a
726 * security feature where we rather want to lock down too much than too little */
727 ip_bpf_egress_uninstall = TAKE_PTR(u->ip_bpf_egress_installed);
728 ip_bpf_ingress_uninstall = TAKE_PTR(u->ip_bpf_ingress_installed);
729 } else {
730 /* If we don't have BPF_F_ALLOW_MULTI then unref the old BPF programs (which will implicitly
731 * detach them) right before attaching the new program, to minimize the time window when we
732 * don't account for IP traffic. */
76dc1725 733 u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);
734 u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
dbef3d16 735 }
1988a9d1 736
aa2b6f1d 737 if (u->ip_bpf_egress) {
a442ccb4 738 r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, flags);
1988a9d1 739 if (r < 0)
b1acbc08
ZJS
740 return log_unit_error_errno(u, r,
741 "bpf-firewall: Attaching egress BPF program to cgroup %s failed: %m", path);
aa2b6f1d
LP
742
743 /* Remember that this BPF program is installed now. */
76dc1725 744 u->ip_bpf_egress_installed = TAKE_PTR(u->ip_bpf_egress);
1988a9d1
DM
745 }
746
747 if (u->ip_bpf_ingress) {
a442ccb4 748 r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, flags);
1988a9d1 749 if (r < 0)
b1acbc08
ZJS
750 return log_unit_error_errno(u, r,
751 "bpf-firewall: Attaching ingress BPF program to cgroup %s failed: %m", path);
aa2b6f1d 752
76dc1725 753 u->ip_bpf_ingress_installed = TAKE_PTR(u->ip_bpf_ingress);
1988a9d1
DM
754 }
755
dbef3d16 756 /* And now, definitely get rid of the old programs, and detach them */
76dc1725 757 ip_bpf_egress_uninstall = bpf_program_free(ip_bpf_egress_uninstall);
758 ip_bpf_ingress_uninstall = bpf_program_free(ip_bpf_ingress_uninstall);
dbef3d16 759
fab34748
KL
760 r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_EGRESS, &u->ip_bpf_custom_egress, &u->ip_bpf_custom_egress_installed);
761 if (r < 0)
762 return r;
763
764 r = attach_custom_bpf_progs(u, path, BPF_CGROUP_INET_INGRESS, &u->ip_bpf_custom_ingress, &u->ip_bpf_custom_ingress_installed);
765 if (r < 0)
766 return r;
767
1988a9d1
DM
768 return 0;
769}
770
771int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
772 uint64_t key, packets;
773 int r;
774
775 if (map_fd < 0)
776 return -EBADF;
777
778 if (ret_packets) {
779 key = MAP_KEY_PACKETS;
780 r = bpf_map_lookup_element(map_fd, &key, &packets);
781 if (r < 0)
782 return r;
783 }
784
785 if (ret_bytes) {
786 key = MAP_KEY_BYTES;
787 r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
788 if (r < 0)
789 return r;
790 }
791
792 if (ret_packets)
793 *ret_packets = packets;
794
795 return 0;
796}
797
798int bpf_firewall_reset_accounting(int map_fd) {
799 uint64_t key, value = 0;
800 int r;
801
802 if (map_fd < 0)
803 return -EBADF;
804
805 key = MAP_KEY_PACKETS;
806 r = bpf_map_update_element(map_fd, &key, &value);
807 if (r < 0)
808 return r;
809
810 key = MAP_KEY_BYTES;
811 return bpf_map_update_element(map_fd, &key, &value);
812}
813
f140ed02
ZJS
814static int bpf_firewall_unsupported_reason = 0;
815
1988a9d1 816int bpf_firewall_supported(void) {
2899aac4 817 const struct bpf_insn trivial[] = {
93e93da5
LP
818 BPF_MOV64_IMM(BPF_REG_0, 1),
819 BPF_EXIT_INSN()
820 };
821
76dc1725 822 _cleanup_(bpf_program_freep) BPFProgram *program = NULL;
1988a9d1 823 static int supported = -1;
e583759b 824 union bpf_attr attr;
4c1567f2 825 int r;
1988a9d1 826
4c1567f2 827 /* Checks whether BPF firewalling is supported. For this, we check the following things:
1988a9d1 828 *
4c1567f2
AZ
829 * - whether the unified hierarchy is being used
830 * - the BPF implementation in the kernel supports BPF_PROG_TYPE_CGROUP_SKB programs, which we require
831 * - the BPF implementation in the kernel supports the BPF_PROG_DETACH call, which we require
1988a9d1 832 */
1988a9d1
DM
833 if (supported >= 0)
834 return supported;
835
1988a9d1
DM
836 r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
837 if (r < 0)
b1acbc08 838 return log_error_errno(r, "bpf-firewall: Can't determine whether the unified hierarchy is used: %m");
e583759b 839 if (r == 0) {
f140ed02
ZJS
840 bpf_firewall_unsupported_reason =
841 log_debug_errno(SYNTHETIC_ERRNO(EUCLEAN),
b1acbc08 842 "bpf-firewall: Not running with unified cgroup hierarchy, BPF firewalling is not supported.");
2ae7ee58 843 return supported = BPF_FIREWALL_UNSUPPORTED;
e583759b 844 }
1988a9d1 845
e0c694c7 846 /* prog_name is NULL since it is supported only starting from v4.15 kernel. */
8fe9dbb9 847 r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, NULL, &program);
4355f1c9 848 if (r < 0) {
f140ed02 849 bpf_firewall_unsupported_reason =
b1acbc08 850 log_debug_errno(r, "bpf-firewall: Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
2ae7ee58 851 return supported = BPF_FIREWALL_UNSUPPORTED;
93e93da5
LP
852 }
853
854 r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
855 if (r < 0) {
f140ed02 856 bpf_firewall_unsupported_reason =
b1acbc08 857 log_debug_errno(r, "bpf-firewall: Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
2ae7ee58 858 return supported = BPF_FIREWALL_UNSUPPORTED;
93e93da5
LP
859 }
860
861 r = bpf_program_load_kernel(program, NULL, 0);
862 if (r < 0) {
f140ed02 863 bpf_firewall_unsupported_reason =
b1acbc08 864 log_debug_errno(r, "bpf-firewall: Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
2ae7ee58 865 return supported = BPF_FIREWALL_UNSUPPORTED;
93e93da5
LP
866 }
867
e583759b
LP
868 /* Unfortunately the kernel allows us to create BPF_PROG_TYPE_CGROUP_SKB programs even when CONFIG_CGROUP_BPF
869 * is turned off at kernel compilation time. This sucks of course: why does it allow us to create a cgroup BPF
870 * program if we can't do a thing with it later?
871 *
047de7e1 872 * We detect this case by issuing the BPF_PROG_DETACH bpf() call with invalid file descriptors: if
e583759b
LP
873 * CONFIG_CGROUP_BPF is turned off, then the call will fail early with EINVAL. If it is turned on the
874 * parameters are validated however, and that'll fail with EBADF then. */
875
9ca600e2
LB
876 // FIXME: Clang doesn't 0-pad with structured initialization, causing
877 // the kernel to reject the bpf_attr as invalid. See:
878 // https://github.com/torvalds/linux/blob/v5.9/kernel/bpf/syscall.c#L65
879 // Ideally it should behave like GCC, so that we can remove these workarounds.
880 zero(attr);
881 attr.attach_type = BPF_CGROUP_INET_EGRESS;
254d1313
ZJS
882 attr.target_fd = -EBADF;
883 attr.attach_bpf_fd = -EBADF;
e583759b 884
047de7e1 885 if (bpf(BPF_PROG_DETACH, &attr, sizeof(attr)) < 0) {
2ae7ee58 886 if (errno != EBADF) {
f140ed02 887 bpf_firewall_unsupported_reason =
b1acbc08 888 log_debug_errno(errno, "bpf-firewall: Didn't get EBADF from BPF_PROG_DETACH, BPF firewalling is not supported: %m");
2ae7ee58
LP
889 return supported = BPF_FIREWALL_UNSUPPORTED;
890 }
891
892 /* YAY! */
893 } else {
8751bb6f
YW
894 bpf_firewall_unsupported_reason =
895 log_debug_errno(SYNTHETIC_ERRNO(EBADE),
b1acbc08 896 "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_DETACH call? "
8751bb6f 897 "Something is weird, assuming BPF firewalling is broken and hence not supported.");
2ae7ee58
LP
898 return supported = BPF_FIREWALL_UNSUPPORTED;
899 }
e583759b 900
2ae7ee58 901 /* So now we know that the BPF program is generally available, let's see if BPF_F_ALLOW_MULTI is also supported
047de7e1
AF
902 * (which was added in kernel 4.15). We use a similar logic as before, but this time we use the BPF_PROG_ATTACH
903 * bpf() call and the BPF_F_ALLOW_MULTI flags value. Since the flags are checked early in the system call we'll
e0c694c7
JK
904 * get EINVAL if it's not supported, and EBADF as before if it is available.
905 * Use probe result as the indicator that program name is also supported since they both were
906 * added in kernel 4.15. */
e583759b 907
9ca600e2
LB
908 zero(attr);
909 attr.attach_type = BPF_CGROUP_INET_EGRESS;
254d1313
ZJS
910 attr.target_fd = -EBADF;
911 attr.attach_bpf_fd = -EBADF;
9ca600e2 912 attr.attach_flags = BPF_F_ALLOW_MULTI;
2ae7ee58 913
b1c05b98 914 if (bpf(BPF_PROG_ATTACH, &attr, sizeof(attr)) < 0) {
2ae7ee58 915 if (errno == EBADF) {
b1acbc08 916 log_debug_errno(errno, "bpf-firewall: Got EBADF when using BPF_F_ALLOW_MULTI, which indicates it is supported. Yay!");
2ae7ee58
LP
917 return supported = BPF_FIREWALL_SUPPORTED_WITH_MULTI;
918 }
919
920 if (errno == EINVAL)
b1acbc08 921 log_debug_errno(errno, "bpf-firewall: Got EINVAL error when using BPF_F_ALLOW_MULTI, which indicates it's not supported.");
2ae7ee58 922 else
b1acbc08 923 log_debug_errno(errno, "bpf-firewall: Got unexpected error when using BPF_F_ALLOW_MULTI, assuming it's not supported: %m");
2ae7ee58
LP
924
925 return supported = BPF_FIREWALL_SUPPORTED;
926 } else {
8751bb6f
YW
927 bpf_firewall_unsupported_reason =
928 log_debug_errno(SYNTHETIC_ERRNO(EBADE),
b1acbc08 929 "bpf-firewall: Wut? Kernel accepted our invalid BPF_PROG_ATTACH+BPF_F_ALLOW_MULTI call? "
8751bb6f 930 "Something is weird, assuming BPF firewalling is broken and hence not supported.");
2ae7ee58
LP
931 return supported = BPF_FIREWALL_UNSUPPORTED;
932 }
1988a9d1 933}
84d2744b
ZJS
934
935void emit_bpf_firewall_warning(Unit *u) {
936 static bool warned = false;
937
a42232a1
LB
938 assert(u);
939 assert(u->manager);
940
d0113312
LP
941 if (warned || MANAGER_IS_TEST_RUN(u->manager))
942 return;
943
944 bool quiet = ERRNO_IS_PRIVILEGE(bpf_firewall_unsupported_reason) && detect_container() > 0;
945
946 log_unit_full_errno(u, quiet ? LOG_DEBUG : LOG_WARNING, bpf_firewall_unsupported_reason,
947 "unit configures an IP firewall, but %s.\n"
948 "(This warning is only shown for the first unit using IP firewalling.)",
949 getuid() != 0 ? "not running as root" :
950 "the local system does not support BPF/cgroup firewalling");
951 warned = true;
84d2744b 952}
0fd9c28c
LP
953
954void bpf_firewall_close(Unit *u) {
955 assert(u);
956
957 u->ip_accounting_ingress_map_fd = safe_close(u->ip_accounting_ingress_map_fd);
958 u->ip_accounting_egress_map_fd = safe_close(u->ip_accounting_egress_map_fd);
959
960 u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
961 u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
962 u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);
963 u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);
964
76dc1725 965 u->ip_bpf_ingress = bpf_program_free(u->ip_bpf_ingress);
966 u->ip_bpf_ingress_installed = bpf_program_free(u->ip_bpf_ingress_installed);
967 u->ip_bpf_egress = bpf_program_free(u->ip_bpf_egress);
968 u->ip_bpf_egress_installed = bpf_program_free(u->ip_bpf_egress_installed);
0fd9c28c
LP
969
970 u->ip_bpf_custom_ingress = set_free(u->ip_bpf_custom_ingress);
971 u->ip_bpf_custom_egress = set_free(u->ip_bpf_custom_egress);
972 u->ip_bpf_custom_ingress_installed = set_free(u->ip_bpf_custom_ingress_installed);
973 u->ip_bpf_custom_egress_installed = set_free(u->ip_bpf_custom_egress_installed);
974}