/***
  This file is part of systemd.

  Copyright 2016 Daniel Mack

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <arpa/inet.h>
#include <linux/libbpf.h>
#include <net/ethernet.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>

#include "alloc-util.h"
#include "bpf-firewall.h"
#include "bpf-program.h"
#include "ip-address-access.h"

/* Compile instructions for one list of addresses, one direction and one specific verdict on matches. */

static int add_lookup_instructions(
                BPFProgram *p,
                int map_fd,
                int protocol,
                bool is_ingress,
                int verdict) {

        int r, addr_offset, addr_size;

        switch (protocol) {

        case ETH_P_IP:
                addr_size = sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct iphdr, saddr) :
                        offsetof(struct iphdr, daddr);
                break;

        case ETH_P_IPV6:
                addr_size = 4 * sizeof(uint32_t);
                addr_offset = is_ingress ?
                        offsetof(struct ip6_hdr, ip6_src.s6_addr) :
                        offsetof(struct ip6_hdr, ip6_dst.s6_addr);
                break;

        default:
                return -EAFNOSUPPORT;
        }

        /* Compare the protocol with one word instruction (32 bit) */
        struct bpf_insn insn[] = {
                /* If skb->protocol != @protocol, skip this whole block. The offset will be set later. */
                BPF_JMP_IMM(BPF_JNE, BPF_REG_7, htobe16(protocol), 0),

                /*
                 * Call into BPF_FUNC_skb_load_bytes to load the dst/src IP address.
                 *
                 * R1: Pointer to the skb
                 * R2: Data offset
                 * R3: Destination buffer on the stack (r10 - addr_size)
                 * R4: Number of bytes to read (addr_size)
                 */
                BPF_MOV64_REG(BPF_REG_1, BPF_REG_6),
                BPF_MOV32_IMM(BPF_REG_2, addr_offset),

                BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),
                BPF_ALU64_IMM(BPF_ADD, BPF_REG_3, -addr_size),

                BPF_MOV32_IMM(BPF_REG_4, addr_size),
                BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_skb_load_bytes),

                /*
                 * Call into BPF_FUNC_map_lookup_elem to see if the address matches any entry in the
                 * LPM trie map. For this to work, the prefixlen field of 'struct bpf_lpm_trie_key'
                 * has to be set to the maximum possible value.
                 *
                 * On success, the looked up value is stored in R0. For this application, the actual
                 * value doesn't matter, however; we just set the bit in @verdict in R8 if we found any
                 * matching entry.
                 */
                BPF_LD_MAP_FD(BPF_REG_1, map_fd),

                BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -addr_size - sizeof(uint32_t)),
                BPF_ST_MEM(BPF_W, BPF_REG_2, 0, addr_size * 8),

                BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
                BPF_ALU32_IMM(BPF_OR, BPF_REG_8, verdict),
        };

        /* Jump label fixup */
        insn[0].off = ELEMENTSOF(insn) - 1;

        r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
        if (r < 0)
                return r;

        return 0;
}
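
/*
 * For reference (derived from the instructions above, not present in the original source): at
 * runtime the lookup builds a 'struct bpf_lpm_trie_key' on the BPF stack, with fp == R10:
 *
 *   fp - addr_size - 4 : u32 prefixlen = addr_size * 8   (maximum length, i.e. an exact match)
 *   fp - addr_size     : u8  data[addr_size]             (the address written by skb_load_bytes)
 *
 * R2 is pointed at the prefixlen field, so the two stores together form one contiguous LPM key.
 */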

static int bpf_firewall_compile_bpf(
                Unit *u,
                bool is_ingress,
                BPFProgram **ret) {

        struct bpf_insn pre_insn[] = {
                /*
                 * When the eBPF program is entered, R1 contains the address of the skb.
                 * However, R1-R5 are scratch registers that are not preserved when calling
                 * into kernel functions, so we need to save anything that's supposed to
                 * stay around to R6-R9. Save the skb to R6.
                 */
                BPF_MOV64_REG(BPF_REG_6, BPF_REG_1),

                /*
                 * Although we cannot access the skb data directly from eBPF programs used in this
                 * scenario, the kernel has prepared some fields for us to access through struct __sk_buff.
                 * Load the protocol (IPv4, IPv6) used by the packet in flight once and cache it in R7
                 * for later use.
                 */
                BPF_LDX_MEM(BPF_W, BPF_REG_7, BPF_REG_6, offsetof(struct __sk_buff, protocol)),

                /*
                 * R8 is used to keep track of whether any address check has explicitly allowed or denied the packet
                 * through ACCESS_DENIED or ACCESS_ALLOWED bits. Reset them both to 0 in the beginning.
                 */
                BPF_MOV32_IMM(BPF_REG_8, 0),
        };

        /*
         * The access checkers compiled for the configured allowance and denial lists
         * write to R8 at runtime. The following code prepares for an early exit that
         * skips the accounting if the packet is denied.
         *
         * R0 = 1
         * if (R8 == ACCESS_DENIED)
         *         R0 = 0
         *
         * This means that if both ACCESS_DENIED and ACCESS_ALLOWED are set, the packet
         * is allowed to pass.
         */
        struct bpf_insn post_insn[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_JMP_IMM(BPF_JNE, BPF_REG_8, ACCESS_DENIED, 1),
                BPF_MOV64_IMM(BPF_REG_0, 0),
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *p = NULL;
        int accounting_map_fd, r;
        bool access_enabled;

        accounting_map_fd = is_ingress ?
                u->ip_accounting_ingress_map_fd :
                u->ip_accounting_egress_map_fd;

        access_enabled =
                u->ipv4_allow_map_fd >= 0 ||
                u->ipv6_allow_map_fd >= 0 ||
                u->ipv4_deny_map_fd >= 0 ||
                u->ipv6_deny_map_fd >= 0;

        if (accounting_map_fd < 0 && !access_enabled) {
                *ret = NULL;
                return 0;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &p);
        if (r < 0)
                return r;

        r = bpf_program_add_instructions(p, pre_insn, ELEMENTSOF(pre_insn));
        if (r < 0)
                return r;

        if (access_enabled) {
                /*
                 * The simple rule this function translates into eBPF instructions is:
                 *
                 * - Access will be granted when an address matches an entry in the allow maps
                 * - Otherwise, access will be denied when an address matches an entry in the deny maps
                 * - Otherwise, access will be granted
                 */

                if (u->ipv4_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_deny_map_fd, ETH_P_IP, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_deny_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_deny_map_fd, ETH_P_IPV6, is_ingress, ACCESS_DENIED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv4_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv4_allow_map_fd, ETH_P_IP, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }

                if (u->ipv6_allow_map_fd >= 0) {
                        r = add_lookup_instructions(p, u->ipv6_allow_map_fd, ETH_P_IPV6, is_ingress, ACCESS_ALLOWED);
                        if (r < 0)
                                return r;
                }
        }

        r = bpf_program_add_instructions(p, post_insn, ELEMENTSOF(post_insn));
        if (r < 0)
                return r;

        if (accounting_map_fd >= 0) {
                struct bpf_insn insn[] = {
                        /*
                         * If R0 == 0, the packet will be denied; skip the accounting instructions in this case.
                         * The jump label will be fixed up later.
                         */
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 0),

                        /* Count packets */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_PACKETS), /* r0 = 0 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd), /* load map fd to r1 */
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Count bytes */
                        BPF_MOV64_IMM(BPF_REG_0, MAP_KEY_BYTES), /* r0 = 1 */
                        BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */
                        BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
                        BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */
                        BPF_LD_MAP_FD(BPF_REG_1, accounting_map_fd),
                        BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
                        BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
                        BPF_LDX_MEM(BPF_W, BPF_REG_1, BPF_REG_6, offsetof(struct __sk_buff, len)), /* r1 = skb->len */
                        BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */

                        /* Allow the packet to pass */
                        BPF_MOV64_IMM(BPF_REG_0, 1),
                };

                /* Jump label fixup */
                insn[0].off = ELEMENTSOF(insn) - 1;

                r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
                if (r < 0)
                        return r;
        }
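
        /*
         * Note on the accounting instructions above: BPF_FUNC_map_lookup_elem returns a pointer to
         * the map value in R0 (or NULL), and the BPF_STX | BPF_XADD instruction then applies an
         * atomic add directly to that value, so the packet and byte counters stay consistent even
         * when several packets are accounted concurrently.
         */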

        /*
         * Exit from the eBPF program, R0 contains the verdict.
         * 0 means the packet is denied, 1 means the packet may pass.
         */
        struct bpf_insn insn[] = {
                BPF_EXIT_INSN()
        };

        r = bpf_program_add_instructions(p, insn, ELEMENTSOF(insn));
        if (r < 0)
                return r;

        *ret = p;
        p = NULL;

        return 0;
}

static int bpf_firewall_count_access_items(IPAddressAccessItem *list, size_t *n_ipv4, size_t *n_ipv6) {
        IPAddressAccessItem *a;

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        (*n_ipv4)++;
                        break;

                case AF_INET6:
                        (*n_ipv6)++;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}

static int bpf_firewall_add_access_items(
                IPAddressAccessItem *list,
                int ipv4_map_fd,
                int ipv6_map_fd,
                int verdict) {

        struct bpf_lpm_trie_key *key_ipv4, *key_ipv6;
        uint64_t value = verdict;
        IPAddressAccessItem *a;
        int r;

        key_ipv4 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t));
        key_ipv6 = alloca0(offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t) * 4);

        LIST_FOREACH(items, a, list) {
                switch (a->family) {

                case AF_INET:
                        key_ipv4->prefixlen = a->prefixlen;
                        memcpy(key_ipv4->data, &a->address, sizeof(uint32_t));

                        r = bpf_map_update_element(ipv4_map_fd, key_ipv4, &value);
                        if (r < 0)
                                return r;
                        break;

                case AF_INET6:
                        key_ipv6->prefixlen = a->prefixlen;
                        memcpy(key_ipv6->data, &a->address, 4 * sizeof(uint32_t));

                        r = bpf_map_update_element(ipv6_map_fd, key_ipv6, &value);
                        if (r < 0)
                                return r;
                        break;

                default:
                        return -EAFNOSUPPORT;
                }
        }

        return 0;
}
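
/*
 * Illustration (hypothetical configuration, not part of this file): an IPAddressDeny=10.0.0.0/8
 * entry is stored in the IPv4 LPM trie as a key with prefixlen = 8 and data = { 10, 0, 0, 0 },
 * with the verdict (here ACCESS_DENIED) as the 64-bit map value. The runtime code emitted by
 * add_lookup_instructions() then matches every packet whose address falls within that prefix.
 */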

static int bpf_firewall_prepare_access_maps(
                Unit *u,
                int verdict,
                int *ret_ipv4_map_fd,
                int *ret_ipv6_map_fd) {

        _cleanup_close_ int ipv4_map_fd = -1, ipv6_map_fd = -1;
        size_t n_ipv4 = 0, n_ipv6 = 0;
        Unit *p;
        int r;

        assert(ret_ipv4_map_fd);
        assert(ret_ipv6_map_fd);

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                bpf_firewall_count_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny, &n_ipv4, &n_ipv6);
        }

        if (n_ipv4 > 0) {
                ipv4_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t),
                                sizeof(uint64_t),
                                n_ipv4,
                                BPF_F_NO_PREALLOC);
                if (ipv4_map_fd < 0)
                        return ipv4_map_fd;
        }

        if (n_ipv6 > 0) {
                ipv6_map_fd = bpf_map_new(
                                BPF_MAP_TYPE_LPM_TRIE,
                                offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint32_t)*4,
                                sizeof(uint64_t),
                                n_ipv6,
                                BPF_F_NO_PREALLOC);
                if (ipv6_map_fd < 0)
                        return ipv6_map_fd;
        }

        for (p = u; p; p = UNIT_DEREF(p->slice)) {
                CGroupContext *cc;

                cc = unit_get_cgroup_context(p);
                if (!cc)
                        continue;

                r = bpf_firewall_add_access_items(verdict == ACCESS_ALLOWED ? cc->ip_address_allow : cc->ip_address_deny,
                                                  ipv4_map_fd, ipv6_map_fd, verdict);
                if (r < 0)
                        return r;
        }

        *ret_ipv4_map_fd = ipv4_map_fd;
        *ret_ipv6_map_fd = ipv6_map_fd;

        ipv4_map_fd = ipv6_map_fd = -1;
        return 0;
}

static int bpf_firewall_prepare_accounting_maps(bool enabled, int *fd_ingress, int *fd_egress) {
        int r;

        if (enabled) {
                if (*fd_ingress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_ingress = r;
                }

                if (*fd_egress < 0) {
                        r = bpf_map_new(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(uint64_t), 2, 0);
                        if (r < 0)
                                return r;

                        *fd_egress = r;
                }
        } else {
                *fd_ingress = safe_close(*fd_ingress);
                *fd_egress = safe_close(*fd_egress);
        }

        return 0;
}

int bpf_firewall_compile(Unit *u) {
        CGroupContext *cc;
        int r;

        r = bpf_firewall_supported();
        if (r < 0)
                return r;
        if (r == 0) {
                log_debug("BPF firewalling not supported on this systemd, proceeding without.");
                return -EOPNOTSUPP;
        }

        /* Note that when we compile a new firewall we first flush out the access maps and the BPF programs
         * themselves, but we reuse the accounting maps. That way the firewall in effect always maps to the
         * actual configuration, but we don't flush out the accounting unnecessarily. */

        u->ip_bpf_ingress = bpf_program_unref(u->ip_bpf_ingress);
        u->ip_bpf_egress = bpf_program_unref(u->ip_bpf_egress);

        u->ipv4_allow_map_fd = safe_close(u->ipv4_allow_map_fd);
        u->ipv4_deny_map_fd = safe_close(u->ipv4_deny_map_fd);

        u->ipv6_allow_map_fd = safe_close(u->ipv6_allow_map_fd);
        u->ipv6_deny_map_fd = safe_close(u->ipv6_deny_map_fd);

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        r = bpf_firewall_prepare_access_maps(u, ACCESS_ALLOWED, &u->ipv4_allow_map_fd, &u->ipv6_allow_map_fd);
        if (r < 0)
                return log_error_errno(r, "Preparation of eBPF allow maps failed: %m");

        r = bpf_firewall_prepare_access_maps(u, ACCESS_DENIED, &u->ipv4_deny_map_fd, &u->ipv6_deny_map_fd);
        if (r < 0)
                return log_error_errno(r, "Preparation of eBPF deny maps failed: %m");

        r = bpf_firewall_prepare_accounting_maps(cc->ip_accounting, &u->ip_accounting_ingress_map_fd, &u->ip_accounting_egress_map_fd);
        if (r < 0)
                return log_error_errno(r, "Preparation of eBPF accounting maps failed: %m");

        r = bpf_firewall_compile_bpf(u, true, &u->ip_bpf_ingress);
        if (r < 0)
                return log_error_errno(r, "Compilation for ingress BPF program failed: %m");

        r = bpf_firewall_compile_bpf(u, false, &u->ip_bpf_egress);
        if (r < 0)
                return log_error_errno(r, "Compilation for egress BPF program failed: %m");

        return 0;
}

int bpf_firewall_install(Unit *u) {
        _cleanup_free_ char *path = NULL;
        CGroupContext *cc;
        int r;

        cc = unit_get_cgroup_context(u);
        if (!cc)
                return -EINVAL;

        r = bpf_firewall_supported();
        if (r < 0)
                return r;
        if (r == 0) {
                log_debug("BPF firewalling not supported on this systemd, proceeding without.");
                return -EOPNOTSUPP;
        }

        r = cg_get_path(SYSTEMD_CGROUP_CONTROLLER, u->cgroup_path, NULL, &path);
        if (r < 0)
                return log_error_errno(r, "Failed to determine cgroup path: %m");

        if (u->ip_bpf_egress) {
                r = bpf_program_load_kernel(u->ip_bpf_egress, NULL, 0);
                if (r < 0)
                        return log_error_errno(r, "Kernel upload of egress BPF program failed: %m");

                r = bpf_program_cgroup_attach(u->ip_bpf_egress, BPF_CGROUP_INET_EGRESS, path, cc->delegate ? BPF_F_ALLOW_OVERRIDE : 0);
                if (r < 0)
                        return log_error_errno(r, "Attaching egress BPF program to cgroup %s failed: %m", path);
        } else {
                r = bpf_program_cgroup_detach(BPF_CGROUP_INET_EGRESS, path);
                if (r < 0)
                        return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r,
                                              "Detaching egress BPF program from cgroup failed: %m");
        }

        if (u->ip_bpf_ingress) {
                r = bpf_program_load_kernel(u->ip_bpf_ingress, NULL, 0);
                if (r < 0)
                        return log_error_errno(r, "Kernel upload of ingress BPF program failed: %m");

                r = bpf_program_cgroup_attach(u->ip_bpf_ingress, BPF_CGROUP_INET_INGRESS, path, cc->delegate ? BPF_F_ALLOW_OVERRIDE : 0);
                if (r < 0)
                        return log_error_errno(r, "Attaching ingress BPF program to cgroup %s failed: %m", path);
        } else {
                r = bpf_program_cgroup_detach(BPF_CGROUP_INET_INGRESS, path);
                if (r < 0)
                        return log_full_errno(r == -ENOENT ? LOG_DEBUG : LOG_ERR, r,
                                              "Detaching ingress BPF program from cgroup failed: %m");
        }

        return 0;
}

int bpf_firewall_read_accounting(int map_fd, uint64_t *ret_bytes, uint64_t *ret_packets) {
        uint64_t key, packets;
        int r;

        if (map_fd < 0)
                return -EBADF;

        if (ret_packets) {
                key = MAP_KEY_PACKETS;
                r = bpf_map_lookup_element(map_fd, &key, &packets);
                if (r < 0)
                        return r;
        }

        if (ret_bytes) {
                key = MAP_KEY_BYTES;
                r = bpf_map_lookup_element(map_fd, &key, ret_bytes);
                if (r < 0)
                        return r;
        }

        if (ret_packets)
                *ret_packets = packets;

        return 0;
}
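
/*
 * Example caller (a sketch, not part of this file; 'u' is a Unit with IPAccounting= enabled):
 *
 *         uint64_t bytes = 0, packets = 0;
 *         r = bpf_firewall_read_accounting(u->ip_accounting_egress_map_fd, &bytes, &packets);
 *         if (r < 0)
 *                 log_warning_errno(r, "Failed to read IP accounting counters: %m");
 */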

int bpf_firewall_reset_accounting(int map_fd) {
        uint64_t key, value = 0;
        int r;

        if (map_fd < 0)
                return -EBADF;

        key = MAP_KEY_PACKETS;
        r = bpf_map_update_element(map_fd, &key, &value);
        if (r < 0)
                return r;

        key = MAP_KEY_BYTES;
        return bpf_map_update_element(map_fd, &key, &value);
}

int bpf_firewall_supported(void) {
        struct bpf_insn trivial[] = {
                BPF_MOV64_IMM(BPF_REG_0, 1),
                BPF_EXIT_INSN()
        };

        _cleanup_(bpf_program_unrefp) BPFProgram *program = NULL;
        static int supported = -1;
        _cleanup_close_ int fd = -1;
        int r;

        /* Checks whether BPF firewalling is supported. For this, we check three things:
         *
         * a) whether we are privileged
         * b) whether the unified hierarchy is being used
         * c) the BPF implementation in the kernel supports BPF LPM TRIE maps, which we require
         */

        if (supported >= 0)
                return supported;

        if (geteuid() != 0) {
                log_debug("Not enough privileges, BPF firewalling is not supported.");
                return supported = false;
        }

        r = cg_unified_controller(SYSTEMD_CGROUP_CONTROLLER);
        if (r < 0)
                return log_error_errno(r, "Can't determine whether the unified hierarchy is used: %m");
        if (r == 0)
                return supported = false;

        fd = bpf_map_new(BPF_MAP_TYPE_LPM_TRIE,
                         offsetof(struct bpf_lpm_trie_key, data) + sizeof(uint64_t),
                         sizeof(uint64_t),
                         1,
                         BPF_F_NO_PREALLOC);
        if (fd < 0) {
                log_debug_errno(fd, "Can't allocate BPF LPM TRIE map, BPF firewalling is not supported: %m");
                return supported = false;
        }

        r = bpf_program_new(BPF_PROG_TYPE_CGROUP_SKB, &program);
        if (r < 0) {
                log_debug_errno(r, "Can't allocate CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = false;
        }

        r = bpf_program_add_instructions(program, trivial, ELEMENTSOF(trivial));
        if (r < 0) {
                log_debug_errno(r, "Can't add trivial instructions to CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = false;
        }

        r = bpf_program_load_kernel(program, NULL, 0);
        if (r < 0) {
                log_debug_errno(r, "Can't load kernel CGROUP SKB BPF program, BPF firewalling is not supported: %m");
                return supported = false;
        }

        return supported = true;
}
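
/*
 * Typical call sequence, sketched from the functions above (the real call sites live in the
 * unit/cgroup code, not in this file):
 *
 *         if (bpf_firewall_supported() > 0) {
 *                 r = bpf_firewall_compile(u);         // build maps + ingress/egress programs
 *                 if (r >= 0)
 *                         r = bpf_firewall_install(u); // load programs and attach them to the cgroup
 *         }
 */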