/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2016 Daniel Mack

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <arpa/inet.h>
#include <errno.h>
#include <linux/libbpf.h>
#include <net/ethernet.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
#include <stddef.h>
#include <unistd.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "fd-util.h"
#include "ip-address-access.h"
#include "unit.h"
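
/* Keys into the per-direction accounting maps, and the verdict bits the compiled access checkers OR into R8.
 * (The numeric values below follow the inline comments further down: MAP_KEY_PACKETS is slot 0, MAP_KEY_BYTES
 * is slot 1, and the two verdict bits are distinct so both may end up set for the same packet.) */
enum {
        MAP_KEY_PACKETS,
        MAP_KEY_BYTES,
};

enum {
        ACCESS_ALLOWED = 1,
        ACCESS_DENIED  = 2,
};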

/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        assert(p);
        assert(map_fd >= 0);

        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }

        do {
                /* Compare the packet's protocol and, on a match, look its address up in the given LPM trie map. */
                struct bpf_insn insn[] = {
                        /* If skb->protocol doesn't match the protocol this block was compiled for, skip the
                         * whole block. The jump offset will be set later. */
                        BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                        /*
                         * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address
                         *
                         * R1: Pointer to the skb
                         * R2: Data offset
                         * R3: Destination buffer on the stack (r10 - addr_size)
                         * R4: Number of bytes to read (addr_size)
                         */

                        BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                        BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                        BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                        BPF_MOV32_IMM(BPF_REG_4, addr_size),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                        /*
                         * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                         * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                         * has to be set to the maximum possible value.
                         *
                         * On success, the looked up value is stored in R0. For this application, the actual
                         * value doesn't matter, however; we just OR the @verdict bits into R8 if we found any
                         * matching entry.
                         */

                        BPF_LD_MAP_FD(BPF_REG_1, map_fd),
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                        BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),
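
                        /* At this point the stack holds a key laid out like 'struct bpf_lpm_trie_key':
                         * a 32-bit prefixlen (addr_size * 8, i.e. the full address width) at fp - addr_size - 4,
                         * immediately followed by the address bytes that skb_load_bytes wrote at fp - addr_size. */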

                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                        BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;

        } while (false);

        return 0;
}
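
/* For orientation only: each block emitted by add_lookup_instructions() corresponds roughly to the following
 * C logic, with 'key' standing in for the buffer assembled on the eBPF stack (a sketch, not code that runs
 * here):
 *
 *     if (skb->protocol == htobe16(protocol)) {
 *             struct bpf_lpm_trie_key *key = alloca(offsetof(struct bpf_lpm_trie_key, data) + addr_size);
 *
 *             key->prefixlen = addr_size * 8;
 *             memcpy(key->data, address_in_packet, addr_size);
 *
 *             if (bpf_map_lookup_elem(map, key))
 *                     verdict_bits |= verdict;          (i.e. R8 |= verdict)
 *     }
 */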

static int bpf_firewall_compile_bpf(
                Unit *u,
                bool is_ingress,
                BPFProgram **ret) {

        struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
                 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * The access checkers compiled for the configured allowance and denial lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied.
         *
         * R0 = 1
         * if (R8 == ACCESS_DENIED)
         *     R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;

        assert(u);
        assert(ret);

        accounting_map_fd = is_ingress ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        access_enabled =
                u->ipv4_allow_map_fd >= 0 ||
                u->ipv6_allow_map_fd >= 0 ||
                u->ipv4_deny_map_fd >= 0 ||
                u->ipv6_deny_map_fd >= 0;

        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in @list_allow
                 * - Otherwise, access will be denied when an address matches an entry in @list_deny
                 * - Otherwise, access will be granted
                 */
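
                /* For example, a unit configured with
                 *
                 *     [Service]
                 *     IPAddressDeny=any
                 *     IPAddressAllow=192.168.1.0/24
                 *
                 * gets deny lookups that match every address plus allow lookups for the /24; since an
                 * ACCESS_ALLOWED match takes precedence over ACCESS_DENIED, only that subnet may pass. */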

                if (u->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }
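
        /* In C terms the accounting fragment above amounts to roughly this (again only a sketch; the real
         * lookups and atomic adds are performed by the eBPF instructions emitted above):
         *
         *     uint32_t key = MAP_KEY_PACKETS;
         *     uint64_t *counter = bpf_map_lookup_elem(accounting_map, &key);
         *     if (counter)
         *             __sync_fetch_and_add(counter, 1);
         *
         *     key = MAP_KEY_BYTES;
         *     counter = bpf_map_lookup_elem(accounting_map, &key);
         *     if (counter)
         *             __sync_fetch_and_add(counter, skb->len);
         */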

        do {
                /*
                 * Exit from the eBPF program, R0 contains the verdict.
                 * 0 means the packet is denied, 1 means the packet may pass.
                 */
                struct bpf_insn insn[] = {
                        BPF_EXIT_INSN()
                };

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        } while (false);

        *ret = p;
        p = NULL;

        return 0;
}

static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
        IPAddressAccessItem *a;

        assert(n_ipv4);
        assert(n_ipv6);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        (*n_ipv4)++;
                        break;

                case AF_INET6:
                        (*n_ipv6)++;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_add_access_items(
                IPAddressAccessItem *list,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        uint64_t value = verdict;
        IPAddressAccessItem *a;
        int r;

        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);
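
        /* Layout note: an LPM trie key is a 32-bit prefix length immediately followed by the raw address
         * bytes, hence 4 + 4 bytes for IPv4 and 4 + 16 bytes for IPv6. alloca0() zero-initializes the buffers,
         * so any bytes beyond the copied address stay deterministic. */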

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;

                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;

                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd) {

        _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        Unit *p;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                bpf_firewall_count_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, &n_ipv4, &n_ipv6);
        }

        if (n_ipv4 > 0) {
                ipv4_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                ipv6_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }
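
        /* Hand both fds over to the caller, and reset the locals to -1 so that the _cleanup_close_ handlers
         * above don't close what was just handed out. */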
        *ret_ipv4_map_fd = ipv4_map_fd;
        *ret_ipv6_map_fd = ipv6_map_fd;

        ipv4_map_fd = ipv6_map_fd = -1;
        return 0;
}

static int bpf_firewall_prepare_accounting_maps(bool enabled, int *fd_ingress, int *fd_egress) {
        int r;

        assert(fd_ingress);
        assert(fd_egress);

        if (enabled) {
                if (*fd_ingress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_ingress = r;
                }

                if (*fd_egress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_egress = r;
                }
        } else {
                *fd_ingress = safe_close(*fd_ingress);
                *fd_egress = safe_close(*fd_egress);
        }

        return 0;
}

int bpf_firewall_compile(Unit *u) {
        CGroupContext *cc;
        int r;

        assert(u);

        r = bpf_firewall_supported();
        if (r < 0)
                return r;
        if (r == 0) {
                log_debug("BPF firewalling not supported on this systemd, proceeding without.");
                return -EOPNOTSUPP;
        }

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs themselves,
         * but we reuse the accounting maps. That way the firewall in effect always maps to the actual
         * configuration, but we don't flush out the accounting unnecessarily. */

        u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);

        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd);
        if (r < 0)
                return log_error_errno(r, "Preparation of eBPF allow maps failed: %m");

        r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd);
        if (r < 0)
                return log_error_errno(r, "Preparation of eBPF deny maps failed: %m");

        r = bpf_firewall_prepare_accounting_maps(cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_error_errno(r, "Preparation of eBPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress);
        if (r < 0)
                return log_error_errno(r, "Compilation for ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress);
        if (r < 0)
                return log_error_errno(r, "Compilation for egress BPF program failed: %m");

        return 0;
}
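
/* The expected call sequence (a sketch only; the actual call sites live in the cgroup handling code, not in
 * this file) is compile first, then install once the unit's cgroup exists:
 *
 *     r = bpf_firewall_compile(u);          (build access/accounting maps and the two eBPF programs)
 *     if (r >= 0)
 *             r = bpf_firewall_install(u);  (load the programs and attach them to the unit's cgroup)
 */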

int bpf_firewall_install(Unit *u) {
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        int r;

        assert(u);

        if (!u->cgroup_path)
                return -EINVAL;

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        r = bpf_firewall_supported();
        if (r < 0)
                return r;
        if (r == 0) {
                log_debug("BPF firewalling not supported on this systemd, proceeding without.");
                return -EOPNOTSUPP;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Failed to determine cgroup path: %m");
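
        /* When the unit has Delegate= enabled, the programs below are attached with BPF_F_ALLOW_OVERRIDE so
         * that payload code inside the delegated cgroup subtree may install its own filter; otherwise the
         * attachment is exclusive. */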

        if (u->ip_bpf_egress) {
                r = bpf_program_load_kernel(u->ip_bpf_egress, NULL, 0);
                if (r < 0)
                        return log_error_errno(r, "Kernel upload of egress BPF program failed: %m");

                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, cc->delegate ? BPF_F_ALLOW_OVERRIDE : 0);
                if (r < 0)
                        return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path);
        } else {
                r = bpf_program_cgroup_detach(BPF_CGROUP_INET_EGRESS, path);
                if (r < 0)
                        return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r,
                                              "Detaching egress BPF program from cgroup failed: %m");
        }

        if (u->ip_bpf_ingress) {
                r = bpf_program_load_kernel(u->ip_bpf_ingress, NULL, 0);
                if (r < 0)
                        return log_error_errno(r, "Kernel upload of ingress BPF program failed: %m");

                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, cc->delegate ? BPF_F_ALLOW_OVERRIDE : 0);
                if (r < 0)
                        return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path);
        } else {
                r = bpf_program_cgroup_detach(BPF_CGROUP_INET_INGRESS, path);
                if (r < 0)
                        return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r,
                                              "Detaching ingress BPF program from cgroup failed: %m");
        }

        return 0;
}

int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

        if (ret_packets)
                *ret_packets = packets;

        return 0;
}

int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}

int bpf_firewall_supported(void) {
        struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
        static int supported = -1;
        _cleanup_close_ int fd = -1;
        int r;

        /* Checks whether BPF firewalling is supported. For this, we check three things:
         *
         * a) whether we are privileged
         * b) whether the unified hierarchy is being used
         * c) the BPF implementation in the kernel supports BPF LPM TRIE maps, which we require
         */

        if (supported >= 0)
                return supported;

        if (geteuid() != 0) {
                log_debug("Not enough privileges, BPF firewalling is not supported.");
                return supported = false;
        }

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
        if (r == 0)
                return supported = false;

        fd = bpf_map_new(BPF_MAP_TYPE_LPM_TRIE,
                         offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint64_t),
                         sizeof(uint64_t),
                         1,
                         BPF_F_NO_PREALLOC);
        if (fd < 0) {
                log_debug_errno(fd, "Can't allocate BPF LPM TRIE map, BPF firewalling is not supported: %m");
                return supported = false;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
        if (r < 0) {
                log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = false;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = false;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = false;
        }

        return supported = true;
}