1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
8 #include <linux/netfilter/nf_tables.h>
9 #include <linux/netfilter/nf_nat.h>
10 #include <linux/netfilter_ipv4.h>
11 #include <netinet/ip.h>
12 #include <netinet/ip6.h>
14 #include "sd-netlink.h"
16 #include "alloc-util.h"
17 #include "firewall-util.h"
18 #include "firewall-util-private.h"
19 #include "in-addr-util.h"
21 #include "netlink-internal.h"
22 #include "netlink-util.h"
23 #include "socket-util.h"
24 #include "time-util.h"
/* Name of the nftables map that is looked up with 'l4proto . dport' and yields the
 * address:port pair a connection gets DNAT'ed to. */
26 #define NFT_SYSTEMD_DNAT_MAP_NAME "map_port_ipport"
/* Name of the nftables table that holds every chain, set, map and rule created by this file. */
27 #define NFT_SYSTEMD_TABLE_NAME "io.systemd.nat"
/* Name of the nftables set that stores the source address ranges to masquerade for. */
28 #define NFT_SYSTEMD_MASQ_SET_NAME "masq_saddr"
/* Timeout passed to sd_netlink_read() when waiting for the reply to each batched message
 * (1 * USEC_PER_SEC, presumably one second expressed in microseconds). */
30 #define NFNL_DEFAULT_TIMEOUT_USECS (1ULL * USEC_PER_SEC)
/* Byte offset into the transport header from which the 16-bit destination port is loaded.
 * Despite the name, the DNAT rules use it for any l4proto (e.g. 'tcp . 22'); TCP and UDP both
 * keep the destination port at this offset. */
32 #define UDP_DPORT_OFFSET 2
34 static sd_netlink_message
**netlink_message_unref_many(sd_netlink_message
**m
) {
38 /* This does not free the array itself. The array must be NULL-terminated. */
40 for (sd_netlink_message
**p
= m
; *p
; p
++)
41 *p
= sd_netlink_message_unref(*p
);
46 DEFINE_TRIVIAL_CLEANUP_FUNC(sd_netlink_message
**, netlink_message_unref_many
);
48 static int nfnl_netlink_sendv(
50 sd_netlink_message
*messages
[static 1],
53 _cleanup_free_
uint32_t *serial
= NULL
;
60 r
= sd_netlink_sendv(nfnl
, messages
, msgcount
, &serial
);
65 for (size_t i
= 1; i
< msgcount
- 1; i
++) {
68 /* If message is an error, this returns embedded errno */
69 tmp
= sd_netlink_read(nfnl
, serial
[i
], NFNL_DEFAULT_TIMEOUT_USECS
, NULL
);
70 if (tmp
< 0 && r
== 0)
77 static int nfnl_add_open_expr_container(sd_netlink_message
*m
, const char *name
) {
83 r
= sd_netlink_message_open_array(m
, NFTA_LIST_ELEM
);
87 return sd_netlink_message_open_container_union(m
, NFTA_EXPR_DATA
, name
);
90 static int nfnl_add_expr_fib(
91 sd_netlink_message
*m
,
92 uint32_t nft_fib_flags
,
93 enum nft_fib_result result
,
94 enum nft_registers dreg
) {
100 r
= nfnl_add_open_expr_container(m
, "fib");
104 r
= sd_netlink_message_append_u32(m
, NFTA_FIB_FLAGS
, htobe32(nft_fib_flags
));
108 r
= sd_netlink_message_append_u32(m
, NFTA_FIB_RESULT
, htobe32(result
));
112 r
= sd_netlink_message_append_u32(m
, NFTA_FIB_DREG
, htobe32(dreg
));
116 r
= sd_netlink_message_close_container(m
); /* NFTA_EXPR_DATA */
120 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
123 static int nfnl_add_expr_meta(
124 sd_netlink_message
*m
,
125 enum nft_meta_keys key
,
126 enum nft_registers dreg
) {
132 r
= nfnl_add_open_expr_container(m
, "meta");
136 r
= sd_netlink_message_append_u32(m
, NFTA_META_KEY
, htobe32(key
));
140 r
= sd_netlink_message_append_u32(m
, NFTA_META_DREG
, htobe32(dreg
));
144 r
= sd_netlink_message_close_container(m
); /* NFTA_EXPR_DATA */
148 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
151 static int nfnl_add_expr_payload(
152 sd_netlink_message
*m
,
153 enum nft_payload_bases pb
,
156 enum nft_registers dreg
) {
162 r
= nfnl_add_open_expr_container(m
, "payload");
166 r
= sd_netlink_message_append_u32(m
, NFTA_PAYLOAD_DREG
, htobe32(dreg
));
170 r
= sd_netlink_message_append_u32(m
, NFTA_PAYLOAD_BASE
, htobe32(pb
));
174 r
= sd_netlink_message_append_u32(m
, NFTA_PAYLOAD_OFFSET
, htobe32(offset
));
178 r
= sd_netlink_message_append_u32(m
, NFTA_PAYLOAD_LEN
, htobe32(len
));
182 r
= sd_netlink_message_close_container(m
); /* NFTA_EXPR_DATA */
186 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
189 static int nfnl_add_expr_lookup_set_data(
190 sd_netlink_message
*m
,
191 const char *set_name
,
192 enum nft_registers sreg
) {
199 r
= nfnl_add_open_expr_container(m
, "lookup");
203 r
= sd_netlink_message_append_string(m
, NFTA_LOOKUP_SET
, set_name
);
207 return sd_netlink_message_append_u32(m
, NFTA_LOOKUP_SREG
, htobe32(sreg
));
210 static int nfnl_add_expr_lookup_set(
211 sd_netlink_message
*m
,
212 const char *set_name
,
213 enum nft_registers sreg
) {
220 r
= nfnl_add_expr_lookup_set_data(m
, set_name
, sreg
);
224 r
= sd_netlink_message_close_container(m
); /* NFTA_EXPR_DATA */
228 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
231 static int nfnl_add_expr_lookup_map(
232 sd_netlink_message
*m
,
233 const char *set_name
,
234 enum nft_registers sreg
,
235 enum nft_registers dreg
) {
242 r
= nfnl_add_expr_lookup_set_data(m
, set_name
, sreg
);
246 r
= sd_netlink_message_append_u32(m
, NFTA_LOOKUP_DREG
, htobe32(dreg
));
250 r
= sd_netlink_message_close_container(m
); /* NFTA_EXPR_DATA */
254 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
257 static int nfnl_add_expr_cmp(
258 sd_netlink_message
*m
,
259 enum nft_cmp_ops cmp_op
,
260 enum nft_registers sreg
,
269 r
= nfnl_add_open_expr_container(m
, "cmp");
273 r
= sd_netlink_message_append_u32(m
, NFTA_CMP_OP
, htobe32(cmp_op
));
277 r
= sd_netlink_message_append_u32(m
, NFTA_CMP_SREG
, htobe32(sreg
));
281 r
= sd_netlink_message_append_container_data(m
, NFTA_CMP_DATA
, NFTA_DATA_VALUE
, data
, dlen
);
285 r
= sd_netlink_message_close_container(m
); /* NFTA_EXPR_DATA */
289 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
292 static int nfnl_add_expr_bitwise(
293 sd_netlink_message
*m
,
294 enum nft_registers sreg
,
295 enum nft_registers dreg
,
306 r
= nfnl_add_open_expr_container(m
, "bitwise");
310 r
= sd_netlink_message_append_u32(m
, NFTA_BITWISE_SREG
, htobe32(sreg
));
314 r
= sd_netlink_message_append_u32(m
, NFTA_BITWISE_DREG
, htobe32(dreg
));
318 r
= sd_netlink_message_append_u32(m
, NFTA_BITWISE_LEN
, htobe32(len
));
322 r
= sd_netlink_message_append_container_data(m
, NFTA_BITWISE_MASK
, NFTA_DATA_VALUE
, and, len
);
326 r
= sd_netlink_message_append_container_data(m
, NFTA_BITWISE_XOR
, NFTA_DATA_VALUE
, xor, len
);
330 r
= sd_netlink_message_close_container(m
); /* NFTA_EXPR_DATA */
334 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
337 static int nfnl_add_expr_dnat(
338 sd_netlink_message
*m
,
340 enum nft_registers areg
,
341 enum nft_registers preg
) {
347 r
= nfnl_add_open_expr_container(m
, "nat");
351 r
= sd_netlink_message_append_u32(m
, NFTA_NAT_TYPE
, htobe32(NFT_NAT_DNAT
));
355 r
= sd_netlink_message_append_u32(m
, NFTA_NAT_FAMILY
, htobe32(family
));
359 r
= sd_netlink_message_append_u32(m
, NFTA_NAT_REG_ADDR_MIN
, htobe32(areg
));
363 r
= sd_netlink_message_append_u32(m
, NFTA_NAT_REG_PROTO_MIN
, htobe32(preg
));
367 r
= sd_netlink_message_close_container(m
);
371 return sd_netlink_message_close_container(m
);
374 static int nfnl_add_expr_masq(sd_netlink_message
*m
) {
377 r
= sd_netlink_message_open_array(m
, NFTA_LIST_ELEM
);
381 r
= sd_netlink_message_append_string(m
, NFTA_EXPR_NAME
, "masq");
385 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
388 static int sd_nfnl_message_new_masq_rule(
390 sd_netlink_message
**ret
,
394 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
397 /* -t nat -A POSTROUTING -p protocol -s source/pflen -o out_interface -d destination/pflen -j MASQUERADE */
401 assert(IN_SET(family
, AF_INET
, AF_INET6
));
404 r
= sd_nfnl_nft_message_new_rule(nfnl
, &m
, family
, NFT_SYSTEMD_TABLE_NAME
, chain
);
408 r
= sd_netlink_message_open_container(m
, NFTA_RULE_EXPRESSIONS
);
412 /* 1st statement: ip saddr @masq_saddr. Place iph->saddr in reg1, resp. ipv6 in reg1..reg4. */
413 if (family
== AF_INET
)
414 r
= nfnl_add_expr_payload(m
, NFT_PAYLOAD_NETWORK_HEADER
, offsetof(struct iphdr
, saddr
),
415 sizeof(uint32_t), NFT_REG32_01
);
417 r
= nfnl_add_expr_payload(m
, NFT_PAYLOAD_NETWORK_HEADER
, offsetof(struct ip6_hdr
, ip6_src
.s6_addr
),
418 sizeof(struct in6_addr
), NFT_REG32_01
);
422 /* 1st statement: use reg1 content to make lookup in @masq_saddr set. */
423 r
= nfnl_add_expr_lookup_set(m
, NFT_SYSTEMD_MASQ_SET_NAME
, NFT_REG32_01
);
427 /* 2nd statement: masq. Only executed by kernel if the previous lookup was successful. */
428 r
= nfnl_add_expr_masq(m
);
432 r
= sd_netlink_message_close_container(m
); /* NFTA_RULE_EXPRESSIONS */
440 static int sd_nfnl_message_new_dnat_rule_pre(
442 sd_netlink_message
**ret
,
446 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
447 enum nft_registers proto_reg
;
448 uint32_t local
= RTN_LOCAL
;
451 /* -t nat -A PREROUTING -p protocol --dport local_port -i in_interface -s source/pflen
452 * -d destination/pflen -j DNAT --to-destination remote_addr:remote_port */
456 assert(IN_SET(family
, AF_INET
, AF_INET6
));
459 r
= sd_nfnl_nft_message_new_rule(nfnl
, &m
, family
, NFT_SYSTEMD_TABLE_NAME
, chain
);
463 r
= sd_netlink_message_open_container(m
, NFTA_RULE_EXPRESSIONS
);
467 /* 1st statement: fib daddr type local */
468 r
= nfnl_add_expr_fib(m
, NFTA_FIB_F_DADDR
, NFT_FIB_RESULT_ADDRTYPE
, NFT_REG32_01
);
472 /* 1st statement (cont.): compare RTN_LOCAL */
473 r
= nfnl_add_expr_cmp(m
, NFT_CMP_EQ
, NFT_REG32_01
, &local
, sizeof(local
));
477 /* 2nd statement: lookup local port in map, fetch address:dport to map to */
478 r
= nfnl_add_expr_meta(m
, NFT_META_L4PROTO
, NFT_REG32_01
);
482 r
= nfnl_add_expr_payload(m
, NFT_PAYLOAD_TRANSPORT_HEADER
, UDP_DPORT_OFFSET
,
483 sizeof(uint16_t), NFT_REG32_02
);
487 /* 3rd statement: lookup 'l4proto . dport', e.g. 'tcp . 22' as key and
488 * store address and port for the dnat mapping in REG1/REG2. */
489 r
= nfnl_add_expr_lookup_map(m
, NFT_SYSTEMD_DNAT_MAP_NAME
, NFT_REG32_01
, NFT_REG32_01
);
493 proto_reg
= family
== AF_INET
? NFT_REG32_02
: NFT_REG32_05
;
494 r
= nfnl_add_expr_dnat(m
, family
, NFT_REG32_01
, proto_reg
);
498 r
= sd_netlink_message_close_container(m
); /* NFTA_RULE_EXPRESSIONS */
506 static int sd_nfnl_message_new_dnat_rule_out(
508 sd_netlink_message
**ret
,
512 static const uint32_t zero
= 0, one
= 1;
513 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
514 enum nft_registers proto_reg
;
519 assert(IN_SET(family
, AF_INET
, AF_INET6
));
522 r
= sd_nfnl_nft_message_new_rule(nfnl
, &m
, family
, NFT_SYSTEMD_TABLE_NAME
, chain
);
526 r
= sd_netlink_message_open_container(m
, NFTA_RULE_EXPRESSIONS
);
530 /* 1st statement: exclude 127.0.0.0/8: ip daddr != 127.0.0.0/8, resp. avoid ::1 */
531 if (family
== AF_INET
) {
532 uint32_t lonet
= htobe32(UINT32_C(0x7F000000)), lomask
= htobe32(UINT32_C(0xff000000));
534 r
= nfnl_add_expr_payload(m
, NFT_PAYLOAD_NETWORK_HEADER
, offsetof(struct iphdr
, daddr
),
535 sizeof(lonet
), NFT_REG32_01
);
538 /* 1st statement (cont.): bitops/prefix */
539 r
= nfnl_add_expr_bitwise(m
, NFT_REG32_01
, NFT_REG32_01
, &lomask
, &zero
, sizeof(lomask
));
543 /* 1st statement (cont.): compare reg1 with 127/8 */
544 r
= nfnl_add_expr_cmp(m
, NFT_CMP_NEQ
, NFT_REG32_01
, &lonet
, sizeof(lonet
));
546 struct in6_addr loaddr
= IN6ADDR_LOOPBACK_INIT
;
548 r
= nfnl_add_expr_payload(m
, NFT_PAYLOAD_NETWORK_HEADER
, offsetof(struct ip6_hdr
, ip6_dst
.s6_addr
),
549 sizeof(loaddr
), NFT_REG32_01
);
553 r
= nfnl_add_expr_cmp(m
, NFT_CMP_NEQ
, NFT_REG32_01
, &loaddr
, sizeof(loaddr
));
558 /* 2nd statement: meta oif lo */
559 r
= nfnl_add_expr_meta(m
, NFT_META_OIF
, NFT_REG32_01
);
563 /* 2nd statement (cont.): compare to lo ifindex (1) */
564 r
= nfnl_add_expr_cmp(m
, NFT_CMP_EQ
, NFT_REG32_01
, &one
, sizeof(one
));
568 /* 3rd statement: meta l4proto . th dport dnat ip . port to map @map_port_ipport */
569 r
= nfnl_add_expr_meta(m
, NFT_META_L4PROTO
, NFT_REG32_01
);
573 /* 3rd statement (cont): store the port number in reg2 */
574 r
= nfnl_add_expr_payload(m
, NFT_PAYLOAD_TRANSPORT_HEADER
, UDP_DPORT_OFFSET
,
575 sizeof(uint16_t), NFT_REG32_02
);
579 /* 3rd statement (cont): use reg1 and reg2 and retrieve
580 * the new destination ip and port number.
582 * reg1 and reg2 are clobbered and will then contain the new
583 * address/port number. */
584 r
= nfnl_add_expr_lookup_map(m
, NFT_SYSTEMD_DNAT_MAP_NAME
, NFT_REG32_01
, NFT_REG32_01
);
588 /* 4th statement: dnat connection to address/port retrieved by the
589 * preceding expression. */
590 proto_reg
= family
== AF_INET
? NFT_REG32_02
: NFT_REG32_05
;
591 r
= nfnl_add_expr_dnat(m
, family
, NFT_REG32_01
, proto_reg
);
595 r
= sd_netlink_message_close_container(m
); /* NFTA_RULE_EXPRESSIONS */
603 static int nft_new_set(
604 struct sd_netlink
*nfnl
,
605 sd_netlink_message
**ret
,
607 const char *set_name
,
613 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
618 assert(IN_SET(family
, AF_INET
, AF_INET6
));
621 r
= sd_nfnl_nft_message_new_set(nfnl
, &m
, family
, NFT_SYSTEMD_TABLE_NAME
, set_name
, set_id
, klen
);
626 r
= sd_netlink_message_append_u32(m
, NFTA_SET_FLAGS
, htobe32(flags
));
631 r
= sd_netlink_message_append_u32(m
, NFTA_SET_KEY_TYPE
, htobe32(type
));
639 static int nft_new_map(
640 struct sd_netlink
*nfnl
,
641 sd_netlink_message
**ret
,
643 const char *set_name
,
651 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
656 assert(IN_SET(family
, AF_INET
, AF_INET6
));
659 r
= nft_new_set(nfnl
, &m
, family
, set_name
, set_id
, flags
| NFT_SET_MAP
, type
, klen
);
663 r
= sd_netlink_message_append_u32(m
, NFTA_SET_DATA_TYPE
, htobe32(dtype
));
667 r
= sd_netlink_message_append_u32(m
, NFTA_SET_DATA_LEN
, htobe32(dlen
));
675 static int nft_add_element(
677 sd_netlink_message
**ret
,
679 const char *set_name
,
685 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
690 assert(IN_SET(family
, AF_INET
, AF_INET6
));
696 * Ideally there would be an API that provides:
698 * 1) an init function to add the main ruleset skeleton
699 * 2) a function that populates the sets with all known address/port pairs to s/dnat for
700 * 3) a function that can remove address/port pairs again.
702 * At this time, the existing API is used which is built on a
703 * 'add/delete a rule' paradigm.
705 * This is replicated here and each element gets added to the set
708 r
= sd_nfnl_nft_message_new_setelems_begin(nfnl
, &m
, family
, NFT_SYSTEMD_TABLE_NAME
, set_name
);
712 r
= sd_nfnl_nft_message_add_setelem(m
, 0, key
, klen
, data
, dlen
);
716 /* could theoretically append more set elements to add here */
717 r
= sd_nfnl_nft_message_add_setelem_end(m
);
725 static int nft_del_element(
727 sd_netlink_message
**ret
,
729 const char *set_name
,
735 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
740 assert(IN_SET(family
, AF_INET
, AF_INET6
));
745 r
= sd_nfnl_nft_message_del_setelems_begin(nfnl
, &m
, family
, NFT_SYSTEMD_TABLE_NAME
, set_name
);
749 r
= sd_nfnl_nft_message_add_setelem(m
, 0, key
, klen
, data
, dlen
);
753 r
= sd_nfnl_nft_message_add_setelem_end(m
);
761 /* This is needed so 'nft' userspace tool can properly format the contents
762 * of the set/map when someone uses 'nft' to inspect their content.
764 * The values cannot be changed, they are part of the nft tool type identifier ABI. */
770 TYPE_INET_PROTOCOL
= 12,
771 TYPE_INET_SERVICE
= 13,
774 static uint32_t concat_types2(enum nft_key_types a
, enum nft_key_types b
) {
775 uint32_t type
= (uint32_t)a
;
783 static int fw_nftables_init_family(sd_netlink
*nfnl
, int family
) {
784 sd_netlink_message
*messages
[12] = {};
785 _unused_
_cleanup_(netlink_message_unref_manyp
) sd_netlink_message
**unref
= messages
;
786 size_t msgcnt
= 0, ip_type_size
;
791 assert(IN_SET(family
, AF_INET
, AF_INET6
));
793 r
= sd_nfnl_message_batch_begin(nfnl
, &messages
[msgcnt
++]);
797 /* Set F_EXCL so table add fails if the table already exists. */
798 r
= sd_nfnl_nft_message_new_table(nfnl
, &messages
[msgcnt
++], family
, NFT_SYSTEMD_TABLE_NAME
);
802 r
= sd_nfnl_nft_message_new_basechain(nfnl
, &messages
[msgcnt
++], family
, NFT_SYSTEMD_TABLE_NAME
,
804 NF_INET_PRE_ROUTING
, NF_IP_PRI_NAT_DST
+ 1);
808 r
= sd_nfnl_nft_message_new_basechain(nfnl
, &messages
[msgcnt
++], family
, NFT_SYSTEMD_TABLE_NAME
,
810 NF_INET_LOCAL_OUT
, NF_IP_PRI_NAT_DST
+ 1);
814 r
= sd_nfnl_nft_message_new_basechain(nfnl
, &messages
[msgcnt
++], family
, NFT_SYSTEMD_TABLE_NAME
,
815 "postrouting", "nat",
816 NF_INET_POST_ROUTING
, NF_IP_PRI_NAT_SRC
+ 1);
820 if (family
== AF_INET
) {
821 ip_type_size
= sizeof(uint32_t);
822 ip_type
= TYPE_IPADDR
;
824 assert(family
== AF_INET6
);
825 ip_type_size
= sizeof(struct in6_addr
);
826 ip_type
= TYPE_IP6ADDR
;
828 /* set to store ip address ranges we should masquerade for */
829 r
= nft_new_set(nfnl
, &messages
[msgcnt
++], family
, NFT_SYSTEMD_MASQ_SET_NAME
, ++set_id
, NFT_SET_INTERVAL
, ip_type
, ip_type_size
);
834 * map to store ip address:port pair to dnat to. elements in concatenation
835 * are rounded up to 4 bytes.
837 * Example: ip protocol . tcp dport is sizeof(uint32_t) + sizeof(uint32_t), not
838 * sizeof(uint8_t) + sizeof(uint16_t).
840 r
= nft_new_map(nfnl
, &messages
[msgcnt
++], family
, NFT_SYSTEMD_DNAT_MAP_NAME
, ++set_id
, 0,
841 concat_types2(TYPE_INET_PROTOCOL
, TYPE_INET_SERVICE
), sizeof(uint32_t) * 2,
842 concat_types2(ip_type
, TYPE_INET_SERVICE
), ip_type_size
+ sizeof(uint32_t));
846 r
= sd_nfnl_message_new_dnat_rule_pre(nfnl
, &messages
[msgcnt
++], family
, "prerouting");
850 r
= sd_nfnl_message_new_dnat_rule_out(nfnl
, &messages
[msgcnt
++], family
, "output");
854 r
= sd_nfnl_message_new_masq_rule(nfnl
, &messages
[msgcnt
++], family
, "postrouting");
858 r
= sd_nfnl_message_batch_end(nfnl
, &messages
[msgcnt
++]);
862 assert(msgcnt
< ELEMENTSOF(messages
));
863 r
= nfnl_netlink_sendv(nfnl
, messages
, msgcnt
);
864 if (r
< 0 && r
!= -EEXIST
)
870 int fw_nftables_init(FirewallContext
*ctx
) {
871 _cleanup_(sd_netlink_unrefp
) sd_netlink
*nfnl
= NULL
;
877 r
= sd_nfnl_socket_open(&nfnl
);
881 r
= fw_nftables_init_family(nfnl
, AF_INET
);
885 if (socket_ipv6_is_supported()) {
886 r
= fw_nftables_init_family(nfnl
, AF_INET6
);
888 log_debug_errno(r
, "Failed to init ipv6 NAT: %m");
891 ctx
->nfnl
= TAKE_PTR(nfnl
);
895 void fw_nftables_exit(FirewallContext
*ctx
) {
898 ctx
->nfnl
= sd_netlink_unref(ctx
->nfnl
);
901 static int nft_message_add_setelem_iprange(
902 sd_netlink_message
*m
,
903 const union in_addr_union
*source
,
904 unsigned int prefixlen
) {
906 uint32_t mask
, start
, end
;
912 assert(prefixlen
<= 32);
914 nplen
= 32 - prefixlen
;
916 mask
= (1U << nplen
) - 1U;
917 mask
= htobe32(~mask
);
918 start
= source
->in
.s_addr
& mask
;
920 r
= sd_nfnl_nft_message_add_setelem(m
, 0, &start
, sizeof(start
), NULL
, 0);
924 r
= sd_nfnl_nft_message_add_setelem_end(m
);
928 end
= be32toh(start
) + (1U << nplen
);
929 if (end
< be32toh(start
))
933 r
= sd_nfnl_nft_message_add_setelem(m
, 1, &end
, sizeof(end
), NULL
, 0);
937 r
= sd_netlink_message_append_u32(m
, NFTA_SET_ELEM_FLAGS
, htobe32(NFT_SET_ELEM_INTERVAL_END
));
941 return sd_nfnl_nft_message_add_setelem_end(m
);
944 static int nft_message_add_setelem_ip6range(
945 sd_netlink_message
*m
,
946 const union in_addr_union
*source
,
947 unsigned int prefixlen
) {
949 union in_addr_union start
, end
;
955 r
= in_addr_prefix_range(AF_INET6
, source
, prefixlen
, &start
, &end
);
959 r
= sd_nfnl_nft_message_add_setelem(m
, 0, &start
.in6
, sizeof(start
.in6
), NULL
, 0);
963 r
= sd_nfnl_nft_message_add_setelem_end(m
);
967 r
= sd_nfnl_nft_message_add_setelem(m
, 1, &end
.in6
, sizeof(end
.in6
), NULL
, 0);
971 r
= sd_netlink_message_append_u32(m
, NFTA_SET_ELEM_FLAGS
, htobe32(NFT_SET_ELEM_INTERVAL_END
));
975 return sd_nfnl_nft_message_add_setelem_end(m
);
978 static int fw_nftables_add_masquerade_internal(
982 const union in_addr_union
*source
,
983 unsigned int source_prefixlen
) {
985 sd_netlink_message
*messages
[4] = {};
986 _unused_
_cleanup_(netlink_message_unref_manyp
) sd_netlink_message
**unref
= messages
;
991 assert(IN_SET(af
, AF_INET
, AF_INET6
));
993 if (!source
|| source_prefixlen
== 0)
996 if (af
== AF_INET6
&& source_prefixlen
< 8)
999 r
= sd_nfnl_message_batch_begin(nfnl
, &messages
[msgcnt
++]);
1004 r
= sd_nfnl_nft_message_new_setelems_begin(nfnl
, &messages
[msgcnt
++], af
, NFT_SYSTEMD_TABLE_NAME
, NFT_SYSTEMD_MASQ_SET_NAME
);
1006 r
= sd_nfnl_nft_message_del_setelems_begin(nfnl
, &messages
[msgcnt
++], af
, NFT_SYSTEMD_TABLE_NAME
, NFT_SYSTEMD_MASQ_SET_NAME
);
1011 r
= nft_message_add_setelem_iprange(messages
[msgcnt
-1], source
, source_prefixlen
);
1013 r
= nft_message_add_setelem_ip6range(messages
[msgcnt
-1], source
, source_prefixlen
);
1017 r
= sd_nfnl_message_batch_end(nfnl
, &messages
[msgcnt
++]);
1021 assert(msgcnt
< ELEMENTSOF(messages
));
1022 r
= nfnl_netlink_sendv(nfnl
, messages
, msgcnt
);
1029 int fw_nftables_add_masquerade(
1030 FirewallContext
*ctx
,
1033 const union in_addr_union
*source
,
1034 unsigned int source_prefixlen
) {
1040 assert(IN_SET(af
, AF_INET
, AF_INET6
));
1042 if (!socket_ipv6_is_supported() && af
== AF_INET6
)
1045 r
= fw_nftables_add_masquerade_internal(ctx
->nfnl
, add
, af
, source
, source_prefixlen
);
1049 /* When someone runs 'nft flush ruleset' in the same net namespace this will also tear down the
1050 * systemd nat table.
1052 * Unlike iptables -t nat -F (which will remove all rules added by the systemd iptables
1053 * backend), iptables has builtin chains that cannot be deleted -- the next add operation will
1056 * In the nftables case, everything gets removed. The next add operation will yield -ENOENT.
1058 * If we see -ENOENT on add, replay the initial table setup. If that works, re-do the add
1061 * Note that this doesn't protect against external sabotage such as a
1062 * 'while true; nft flush ruleset; done'. There is nothing that could be done about that short
1063 * of extending the kernel to allow tables to be owned by systemd-networkd and making them
1064 * non-deletable except by the 'owning process'. */
1066 r
= fw_nftables_init_family(ctx
->nfnl
, af
);
1070 return fw_nftables_add_masquerade_internal(ctx
->nfnl
, add
, af
, source
, source_prefixlen
);
1073 static int fw_nftables_add_local_dnat_internal(
1078 uint16_t local_port
,
1079 const union in_addr_union
*remote
,
1080 uint16_t remote_port
,
1081 const union in_addr_union
*previous_remote
) {
1083 sd_netlink_message
*messages
[5] = {};
1084 _unused_
_cleanup_(netlink_message_unref_manyp
) sd_netlink_message
**unref
= messages
;
1085 static bool ipv6_supported
= true;
1086 uint32_t data
[5], key
[2], dlen
;
1091 assert(add
|| !previous_remote
);
1092 assert(IN_SET(af
, AF_INET
, AF_INET6
));
1094 if (!ipv6_supported
&& af
== AF_INET6
)
1097 if (!IN_SET(protocol
, IPPROTO_TCP
, IPPROTO_UDP
))
1098 return -EPROTONOSUPPORT
;
1100 if (local_port
<= 0)
1104 key
[1] = htobe16(local_port
);
1109 if (remote_port
<= 0)
1112 if (af
== AF_INET
) {
1114 data
[1] = htobe16(remote_port
);
1116 assert(af
== AF_INET6
);
1117 dlen
= sizeof(data
);
1118 data
[4] = htobe16(remote_port
);
1121 r
= sd_nfnl_message_batch_begin(nfnl
, &messages
[msgcnt
++]);
1125 /* If a previous remote is set, remove its entry */
1126 if (add
&& previous_remote
&& !in_addr_equal(af
, previous_remote
, remote
)) {
1128 data
[0] = previous_remote
->in
.s_addr
;
1130 memcpy(data
, &previous_remote
->in6
, sizeof(previous_remote
->in6
));
1132 r
= nft_del_element(nfnl
, &messages
[msgcnt
++], af
, NFT_SYSTEMD_DNAT_MAP_NAME
, key
, sizeof(key
), data
, dlen
);
1138 data
[0] = remote
->in
.s_addr
;
1140 memcpy(data
, &remote
->in6
, sizeof(remote
->in6
));
1143 r
= nft_add_element(nfnl
, &messages
[msgcnt
++], af
, NFT_SYSTEMD_DNAT_MAP_NAME
, key
, sizeof(key
), data
, dlen
);
1145 r
= nft_del_element(nfnl
, &messages
[msgcnt
++], af
, NFT_SYSTEMD_DNAT_MAP_NAME
, key
, sizeof(key
), data
, dlen
);
1149 r
= sd_nfnl_message_batch_end(nfnl
, &messages
[msgcnt
++]);
1153 assert(msgcnt
< ELEMENTSOF(messages
));
1154 r
= nfnl_netlink_sendv(nfnl
, messages
, msgcnt
);
1155 if (r
== -EOVERFLOW
&& af
== AF_INET6
) {
1156 /* The current implementation of DNAT in systemd requires kernel's
1157 * fdb9c405e35bdc6e305b9b4e20ebc141ed14fc81 (v5.8), and the older kernel returns
1158 * -EOVERFLOW. Let's treat the error as -EOPNOTSUPP. */
1159 log_debug_errno(r
, "The current implementation of IPv6 DNAT in systemd requires kernel 5.8 or newer, ignoring: %m");
1160 ipv6_supported
= false;
1169 int fw_nftables_add_local_dnat(
1170 FirewallContext
*ctx
,
1174 uint16_t local_port
,
1175 const union in_addr_union
*remote
,
1176 uint16_t remote_port
,
1177 const union in_addr_union
*previous_remote
) {
1183 assert(IN_SET(af
, AF_INET
, AF_INET6
));
1185 if (!socket_ipv6_is_supported() && af
== AF_INET6
)
1188 r
= fw_nftables_add_local_dnat_internal(ctx
->nfnl
, add
, af
, protocol
, local_port
, remote
, remote_port
, previous_remote
);
1192 /* See comment in fw_nftables_add_masquerade(). */
1193 r
= fw_nftables_init_family(ctx
->nfnl
, af
);
1197 /* table created anew; previous address already gone */
1198 return fw_nftables_add_local_dnat_internal(ctx
->nfnl
, add
, af
, protocol
, local_port
, remote
, remote_port
, NULL
);