1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
8 #include <linux/netfilter/nf_tables.h>
9 #include <linux/netfilter/nf_nat.h>
10 #include <linux/netfilter_ipv4.h>
11 #include <netinet/ip.h>
12 #include <netinet/ip6.h>
14 #include "sd-netlink.h"
16 #include "alloc-util.h"
17 #include "firewall-util.h"
18 #include "firewall-util-private.h"
19 #include "in-addr-util.h"
21 #include "socket-util.h"
22 #include "time-util.h"
24 #define NFT_SYSTEMD_DNAT_MAP_NAME "map_port_ipport"
25 #define NFT_SYSTEMD_TABLE_NAME "io.systemd.nat"
26 #define NFT_SYSTEMD_MASQ_SET_NAME "masq_saddr"
28 #define NFNL_DEFAULT_TIMEOUT_USECS (1ULL * USEC_PER_SEC)
30 #define UDP_DPORT_OFFSET 2
32 static int nfnl_netlink_sendv(sd_netlink
*nfnl
,
33 sd_netlink_message
*messages
[],
35 _cleanup_free_
uint32_t *serial
= NULL
;
41 r
= sd_netlink_sendv(nfnl
, messages
, msgcount
, &serial
);
46 for (i
= 1; i
< msgcount
- 1; i
++) {
49 /* If message is an error, this returns embedded errno */
50 tmp
= sd_netlink_read(nfnl
, serial
[i
], NFNL_DEFAULT_TIMEOUT_USECS
, NULL
);
51 if (tmp
< 0 && r
== 0)
58 static int nfnl_add_open_expr_container(sd_netlink_message
*m
, const char *name
) {
61 r
= sd_netlink_message_open_array(m
, NFTA_LIST_ELEM
);
65 r
= sd_netlink_message_append_string(m
, NFTA_EXPR_NAME
, name
);
69 return sd_netlink_message_open_container_union(m
, NFTA_EXPR_DATA
, name
);
72 static int nfnl_add_expr_fib(sd_netlink_message
*m
, uint32_t nft_fib_flags
,
73 enum nft_fib_result result
,
74 enum nft_registers dreg
) {
77 r
= nfnl_add_open_expr_container(m
, "fib");
81 r
= sd_netlink_message_append_u32(m
, NFTA_FIB_FLAGS
, htobe32(nft_fib_flags
));
84 r
= sd_netlink_message_append_u32(m
, NFTA_FIB_RESULT
, htobe32(result
));
87 r
= sd_netlink_message_append_u32(m
, NFTA_FIB_DREG
, htobe32(dreg
));
91 r
= sd_netlink_message_close_container(m
); /* NFTA_EXPR_DATA */
95 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
98 static int nfnl_add_expr_meta(sd_netlink_message
*m
, enum nft_meta_keys key
,
99 enum nft_registers dreg
) {
102 r
= nfnl_add_open_expr_container(m
, "meta");
106 r
= sd_netlink_message_append_u32(m
, NFTA_META_KEY
, htobe32(key
));
110 r
= sd_netlink_message_append_u32(m
, NFTA_META_DREG
, htobe32(dreg
));
114 r
= sd_netlink_message_close_container(m
); /* NFTA_EXPR_DATA */
118 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
121 static int nfnl_add_expr_payload(sd_netlink_message
*m
, enum nft_payload_bases pb
,
122 uint32_t offset
, uint32_t len
, enum nft_registers dreg
) {
125 r
= nfnl_add_open_expr_container(m
, "payload");
129 r
= sd_netlink_message_append_u32(m
, NFTA_PAYLOAD_DREG
, htobe32(dreg
));
132 r
= sd_netlink_message_append_u32(m
, NFTA_PAYLOAD_BASE
, htobe32(pb
));
135 r
= sd_netlink_message_append_u32(m
, NFTA_PAYLOAD_OFFSET
, htobe32(offset
));
138 r
= sd_netlink_message_append_u32(m
, NFTA_PAYLOAD_LEN
, htobe32(len
));
142 r
= sd_netlink_message_close_container(m
); /* NFTA_EXPR_DATA */
145 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
148 static int nfnl_add_expr_lookup_set_data(sd_netlink_message
*m
, const char *set_name
,
149 enum nft_registers sreg
) {
152 r
= nfnl_add_open_expr_container(m
, "lookup");
156 r
= sd_netlink_message_append_string(m
, NFTA_LOOKUP_SET
, set_name
);
160 return sd_netlink_message_append_u32(m
, NFTA_LOOKUP_SREG
, htobe32(sreg
));
163 static int nfnl_add_expr_lookup_set(sd_netlink_message
*m
, const char *set_name
,
164 enum nft_registers sreg
) {
167 r
= nfnl_add_expr_lookup_set_data(m
, set_name
, sreg
);
171 r
= sd_netlink_message_close_container(m
); /* NFTA_EXPR_DATA */
174 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
177 static int nfnl_add_expr_lookup_map(sd_netlink_message
*m
, const char *set_name
,
178 enum nft_registers sreg
, enum nft_registers dreg
) {
181 r
= nfnl_add_expr_lookup_set_data(m
, set_name
, sreg
);
185 r
= sd_netlink_message_append_u32(m
, NFTA_LOOKUP_DREG
, htobe32(dreg
));
189 r
= sd_netlink_message_close_container(m
); /* NFTA_EXPR_DATA */
193 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
196 static int nfnl_add_expr_data(sd_netlink_message
*m
, int attr
, const void *data
, uint32_t dlen
) {
199 r
= sd_netlink_message_open_container(m
, attr
);
202 r
= sd_netlink_message_append_data(m
, NFTA_DATA_VALUE
, data
, dlen
);
206 return sd_netlink_message_close_container(m
); /* attr */
209 static int nfnl_add_expr_cmp_data(sd_netlink_message
*m
, const void *data
, uint32_t dlen
) {
210 return nfnl_add_expr_data(m
, NFTA_CMP_DATA
, data
, dlen
);
213 static int nfnl_add_expr_cmp(sd_netlink_message
*m
, enum nft_cmp_ops cmp_op
,
214 enum nft_registers sreg
, const void *data
, uint32_t dlen
) {
217 r
= nfnl_add_open_expr_container(m
, "cmp");
221 r
= sd_netlink_message_append_u32(m
, NFTA_CMP_OP
, htobe32(cmp_op
));
224 r
= sd_netlink_message_append_u32(m
, NFTA_CMP_SREG
, htobe32(sreg
));
228 r
= nfnl_add_expr_cmp_data(m
, data
, dlen
);
232 r
= sd_netlink_message_close_container(m
); /* NFTA_EXPR_DATA */
235 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
238 static int nfnl_add_expr_bitwise(sd_netlink_message
*m
,
239 enum nft_registers sreg
,
240 enum nft_registers dreg
,
242 const void *xor, uint32_t len
) {
245 r
= nfnl_add_open_expr_container(m
, "bitwise");
249 r
= sd_netlink_message_append_u32(m
, NFTA_BITWISE_SREG
, htobe32(sreg
));
252 r
= sd_netlink_message_append_u32(m
, NFTA_BITWISE_DREG
, htobe32(dreg
));
255 r
= sd_netlink_message_append_u32(m
, NFTA_BITWISE_LEN
, htobe32(len
));
259 r
= nfnl_add_expr_data(m
, NFTA_BITWISE_MASK
, and, len
);
263 r
= nfnl_add_expr_data(m
, NFTA_BITWISE_XOR
, xor, len
);
267 r
= sd_netlink_message_close_container(m
); /* NFTA_EXPR_DATA */
270 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
273 static int nfnl_add_expr_dnat(sd_netlink_message
*m
,
275 enum nft_registers areg
,
276 enum nft_registers preg
) {
279 r
= nfnl_add_open_expr_container(m
, "nat");
283 r
= sd_netlink_message_append_u32(m
, NFTA_NAT_TYPE
, htobe32(NFT_NAT_DNAT
));
287 r
= sd_netlink_message_append_u32(m
, NFTA_NAT_FAMILY
, htobe32(family
));
291 r
= sd_netlink_message_append_u32(m
, NFTA_NAT_REG_ADDR_MIN
, htobe32(areg
));
294 r
= sd_netlink_message_append_u32(m
, NFTA_NAT_REG_PROTO_MIN
, htobe32(preg
));
297 r
= sd_netlink_message_close_container(m
);
301 return sd_netlink_message_close_container(m
);
304 static int nfnl_add_expr_masq(sd_netlink_message
*m
) {
307 r
= sd_netlink_message_open_array(m
, NFTA_LIST_ELEM
);
311 r
= sd_netlink_message_append_string(m
, NFTA_EXPR_NAME
, "masq");
315 return sd_netlink_message_close_container(m
); /* NFTA_LIST_ELEM */
318 static int sd_nfnl_message_new_masq_rule(sd_netlink
*nfnl
, sd_netlink_message
**ret
, int family
,
320 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
323 /* -t nat -A POSTROUTING -p protocol -s source/pflen -o out_interface -d destination/pflen -j MASQUERADE */
325 r
= sd_nfnl_nft_message_new_rule(nfnl
, &m
, family
, NFT_SYSTEMD_TABLE_NAME
, chain
);
329 r
= sd_netlink_message_open_container(m
, NFTA_RULE_EXPRESSIONS
);
333 /* 1st statement: ip saddr @masq_saddr. Place iph->saddr in reg1, resp. ipv6 in reg1..reg4. */
334 if (family
== AF_INET
)
335 r
= nfnl_add_expr_payload(m
, NFT_PAYLOAD_NETWORK_HEADER
, offsetof(struct iphdr
, saddr
),
336 sizeof(uint32_t), NFT_REG32_01
);
338 r
= nfnl_add_expr_payload(m
, NFT_PAYLOAD_NETWORK_HEADER
, offsetof(struct ip6_hdr
, ip6_src
.s6_addr
),
339 sizeof(struct in6_addr
), NFT_REG32_01
);
343 /* 1st statement: use reg1 content to make lookup in @masq_saddr set. */
344 r
= nfnl_add_expr_lookup_set(m
, NFT_SYSTEMD_MASQ_SET_NAME
, NFT_REG32_01
);
348 /* 2nd statement: masq. Only executed by kernel if the previous lookup was successful. */
349 r
= nfnl_add_expr_masq(m
);
353 r
= sd_netlink_message_close_container(m
); /* NFTA_RULE_EXPRESSIONS */
360 static int sd_nfnl_message_new_dnat_rule_pre(sd_netlink
*nfnl
, sd_netlink_message
**ret
, int family
,
362 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
363 enum nft_registers proto_reg
;
364 uint32_t local
= RTN_LOCAL
;
367 /* -t nat -A PREROUTING -p protocol --dport local_port -i in_interface -s source/pflen
368 * -d destination/pflen -j DNAT --to-destination remote_addr:remote_port */
370 r
= sd_nfnl_nft_message_new_rule(nfnl
, &m
, family
, NFT_SYSTEMD_TABLE_NAME
, chain
);
374 r
= sd_netlink_message_open_container(m
, NFTA_RULE_EXPRESSIONS
);
378 /* 1st statement: fib daddr type local */
379 r
= nfnl_add_expr_fib(m
, NFTA_FIB_F_DADDR
, NFT_FIB_RESULT_ADDRTYPE
, NFT_REG32_01
);
383 /* 1st statement (cont.): compare RTN_LOCAL */
384 r
= nfnl_add_expr_cmp(m
, NFT_CMP_EQ
, NFT_REG32_01
, &local
, sizeof(local
));
388 /* 2nd statement: lookup local port in map, fetch address:dport to map to */
389 r
= nfnl_add_expr_meta(m
, NFT_META_L4PROTO
, NFT_REG32_01
);
393 r
= nfnl_add_expr_payload(m
, NFT_PAYLOAD_TRANSPORT_HEADER
, UDP_DPORT_OFFSET
,
394 sizeof(uint16_t), NFT_REG32_02
);
398 /* 3rd statement: lookup 'l4proto . dport', e.g. 'tcp . 22' as key and
399 * store address and port for the dnat mapping in REG1/REG2.
401 r
= nfnl_add_expr_lookup_map(m
, NFT_SYSTEMD_DNAT_MAP_NAME
, NFT_REG32_01
, NFT_REG32_01
);
405 proto_reg
= family
== AF_INET
? NFT_REG32_02
: NFT_REG32_05
;
406 r
= nfnl_add_expr_dnat(m
, family
, NFT_REG32_01
, proto_reg
);
410 r
= sd_netlink_message_close_container(m
); /* NFTA_RULE_EXPRESSIONS */
417 static int sd_nfnl_message_new_dnat_rule_out(sd_netlink
*nfnl
, sd_netlink_message
**ret
,
418 int family
, const char *chain
) {
419 static const uint32_t zero
= 0, one
= 1;
421 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
422 enum nft_registers proto_reg
;
425 r
= sd_nfnl_nft_message_new_rule(nfnl
, &m
, family
, NFT_SYSTEMD_TABLE_NAME
, chain
);
429 r
= sd_netlink_message_open_container(m
, NFTA_RULE_EXPRESSIONS
);
433 /* 1st statement: exclude 127.0.0.0/8: ip daddr != 127.0.0.0/8, resp. avoid ::1 */
434 if (family
== AF_INET
) {
435 uint32_t lonet
= htobe32(UINT32_C(0x7F000000)), lomask
= htobe32(UINT32_C(0xff000000));
437 r
= nfnl_add_expr_payload(m
, NFT_PAYLOAD_NETWORK_HEADER
, offsetof(struct iphdr
, daddr
),
438 sizeof(lonet
), NFT_REG32_01
);
441 /* 1st statement (cont.): bitops/prefix */
442 r
= nfnl_add_expr_bitwise(m
, NFT_REG32_01
, NFT_REG32_01
, &lomask
, &zero
, sizeof(lomask
));
446 /* 1st statement (cont.): compare reg1 with 127/8 */
447 r
= nfnl_add_expr_cmp(m
, NFT_CMP_NEQ
, NFT_REG32_01
, &lonet
, sizeof(lonet
));
449 struct in6_addr loaddr
= IN6ADDR_LOOPBACK_INIT
;
451 r
= nfnl_add_expr_payload(m
, NFT_PAYLOAD_NETWORK_HEADER
, offsetof(struct ip6_hdr
, ip6_dst
.s6_addr
),
452 sizeof(loaddr
), NFT_REG32_01
);
456 r
= nfnl_add_expr_cmp(m
, NFT_CMP_NEQ
, NFT_REG32_01
, &loaddr
, sizeof(loaddr
));
461 /* 2nd statement: meta oif lo */
462 r
= nfnl_add_expr_meta(m
, NFT_META_OIF
, NFT_REG32_01
);
466 /* 2nd statement (cont.): compare to lo ifindex (1) */
467 r
= nfnl_add_expr_cmp(m
, NFT_CMP_EQ
, NFT_REG32_01
, &one
, sizeof(one
));
471 /* 3rd statement: meta l4proto . th dport dnat ip . port to map @map_port_ipport */
472 r
= nfnl_add_expr_meta(m
, NFT_META_L4PROTO
, NFT_REG32_01
);
476 /* 3rd statement (cont): store the port number in reg2 */
477 r
= nfnl_add_expr_payload(m
, NFT_PAYLOAD_TRANSPORT_HEADER
, UDP_DPORT_OFFSET
,
478 sizeof(uint16_t), NFT_REG32_02
);
482 /* 3rd statement (cont): use reg1 and reg2 and retrieve
483 * the new destination ip and port number.
485 * reg1 and reg2 are clobbered and will then contain the new
486 * address/port number.
488 r
= nfnl_add_expr_lookup_map(m
, NFT_SYSTEMD_DNAT_MAP_NAME
, NFT_REG32_01
, NFT_REG32_01
);
492 /* 4th statement: dnat connection to address/port retrieved by the
493 * preceding expression. */
494 proto_reg
= family
== AF_INET
? NFT_REG32_02
: NFT_REG32_05
;
495 r
= nfnl_add_expr_dnat(m
, family
, NFT_REG32_01
, proto_reg
);
499 r
= sd_netlink_message_close_container(m
); /* NFTA_RULE_EXPRESSIONS */
506 static int nft_new_set(struct sd_netlink
*nfnl
,
507 sd_netlink_message
**ret
,
508 int family
, const char *set_name
,
510 uint32_t flags
, uint32_t type
, uint32_t klen
) {
511 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
514 r
= sd_nfnl_nft_message_new_set(nfnl
, &m
, family
, NFT_SYSTEMD_TABLE_NAME
, set_name
, set_id
, klen
);
519 r
= sd_netlink_message_append_u32(m
, NFTA_SET_FLAGS
, htobe32(flags
));
524 r
= sd_netlink_message_append_u32(m
, NFTA_SET_KEY_TYPE
, htobe32(type
));
532 static int nft_new_map(struct sd_netlink
*nfnl
,
533 sd_netlink_message
**ret
,
534 int family
, const char *set_name
, uint32_t set_id
,
535 uint32_t flags
, uint32_t type
, uint32_t klen
, uint32_t dtype
, uint32_t dlen
) {
536 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
539 r
= nft_new_set(nfnl
, &m
, family
, set_name
, set_id
, flags
| NFT_SET_MAP
, type
, klen
);
543 r
= sd_netlink_message_append_u32(m
, NFTA_SET_DATA_TYPE
, htobe32(dtype
));
547 r
= sd_netlink_message_append_u32(m
, NFTA_SET_DATA_LEN
, htobe32(dlen
));
554 static int nft_add_element(sd_netlink
*nfnl
, sd_netlink_message
**ret
,
555 int family
, const char *set_name
,
556 const void *key
, uint32_t klen
,
557 const void *data
, uint32_t dlen
) {
558 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
562 * Ideally there would be an API that provides:
564 * 1) an init function to add the main ruleset skeleton
565 * 2) a function that populates the sets with all known address/port pairs to s/dnat for
566 * 3) a function that can remove address/port pairs again.
568 * At this time, the existing API is used which is built on a
569 * 'add/delete a rule' paradigm.
571 * This is replicated here and each element gets added to the set
574 r
= sd_nfnl_nft_message_new_setelems_begin(nfnl
, &m
, family
, NFT_SYSTEMD_TABLE_NAME
, set_name
);
578 r
= sd_nfnl_nft_message_add_setelem(m
, 0, key
, klen
, data
, dlen
);
582 /* could theoretically append more set elements to add here */
583 r
= sd_nfnl_nft_message_add_setelem_end(m
);
590 static int nft_del_element(sd_netlink
*nfnl
,
591 sd_netlink_message
**ret
, int family
, const char *set_name
,
592 const void *key
, uint32_t klen
,
593 const void *data
, uint32_t dlen
) {
594 _cleanup_(sd_netlink_message_unrefp
) sd_netlink_message
*m
= NULL
;
597 r
= sd_nfnl_nft_message_del_setelems_begin(nfnl
, &m
, family
, NFT_SYSTEMD_TABLE_NAME
, set_name
);
601 r
= sd_nfnl_nft_message_add_setelem(m
, 0, key
, klen
, data
, dlen
);
605 r
= sd_nfnl_nft_message_add_setelem_end(m
);
612 /* This is needed so 'nft' userspace tool can properly format the contents
613 * of the set/map when someone uses 'nft' to inspect their content.
615 * The values cannot be changed, they are part of the nft tool type identifier ABI.
622 TYPE_INET_PROTOCOL
= 12,
623 TYPE_INET_SERVICE
= 13,
626 static uint32_t concat_types2(enum nft_key_types a
, enum nft_key_types b
) {
627 uint32_t type
= (uint32_t)a
;
635 /* enough space to hold netlink messages for table skeleton */
636 #define NFT_INIT_MSGS 16
637 static int fw_nftables_init_family(sd_netlink
*nfnl
, int family
) {
638 sd_netlink_message
*batch
[NFT_INIT_MSGS
] = {};
639 size_t msgcnt
= 0, i
, ip_type_size
;
643 assert(IN_SET(family
, AF_INET
, AF_INET6
));
645 r
= sd_nfnl_message_batch_begin(nfnl
, &batch
[msgcnt
]);
650 assert(msgcnt
< NFT_INIT_MSGS
);
651 /* Set F_EXCL so table add fails if the table already exists. */
652 r
= sd_nfnl_nft_message_new_table(nfnl
, &batch
[msgcnt
], family
, NFT_SYSTEMD_TABLE_NAME
, NLM_F_EXCL
| NLM_F_ACK
);
657 assert(msgcnt
< NFT_INIT_MSGS
);
659 r
= sd_nfnl_nft_message_new_basechain(nfnl
, &batch
[msgcnt
], family
, NFT_SYSTEMD_TABLE_NAME
,
661 NF_INET_PRE_ROUTING
, NF_IP_PRI_NAT_DST
+ 1);
666 assert(msgcnt
< NFT_INIT_MSGS
);
667 r
= sd_nfnl_nft_message_new_basechain(nfnl
, &batch
[msgcnt
], family
, NFT_SYSTEMD_TABLE_NAME
,
669 NF_INET_LOCAL_OUT
, NF_IP_PRI_NAT_DST
+ 1);
674 assert(msgcnt
< NFT_INIT_MSGS
);
675 r
= sd_nfnl_nft_message_new_basechain(nfnl
, &batch
[msgcnt
], family
, NFT_SYSTEMD_TABLE_NAME
,
676 "postrouting", "nat",
677 NF_INET_POST_ROUTING
, NF_IP_PRI_NAT_SRC
+ 1);
681 if (family
== AF_INET
) {
682 ip_type_size
= sizeof(uint32_t);
683 ip_type
= TYPE_IPADDR
;
685 assert(family
== AF_INET6
);
686 ip_type_size
= sizeof(struct in6_addr
);
687 ip_type
= TYPE_IP6ADDR
;
690 assert(msgcnt
< NFT_INIT_MSGS
);
691 /* set to store ip address ranges we should masquerade for */
692 r
= nft_new_set(nfnl
, &batch
[msgcnt
], family
, NFT_SYSTEMD_MASQ_SET_NAME
, ++set_id
, NFT_SET_INTERVAL
, ip_type
, ip_type_size
);
697 * map to store ip address:port pair to dnat to. elements in concatenation
698 * are rounded up to 4 bytes.
700 * Example: ip protocol . tcp dport is sizeof(uint32_t) + sizeof(uint32_t), not
701 * sizeof(uint8_t) + sizeof(uint16_t).
704 assert(msgcnt
< NFT_INIT_MSGS
);
705 r
= nft_new_map(nfnl
, &batch
[msgcnt
], family
, NFT_SYSTEMD_DNAT_MAP_NAME
, ++set_id
, 0,
706 concat_types2(TYPE_INET_PROTOCOL
, TYPE_INET_SERVICE
), sizeof(uint32_t) * 2,
707 concat_types2(ip_type
, TYPE_INET_SERVICE
), ip_type_size
+ sizeof(uint32_t));
712 assert(msgcnt
< NFT_INIT_MSGS
);
713 r
= sd_nfnl_message_new_dnat_rule_pre(nfnl
, &batch
[msgcnt
], family
, "prerouting");
718 assert(msgcnt
< NFT_INIT_MSGS
);
719 r
= sd_nfnl_message_new_dnat_rule_out(nfnl
, &batch
[msgcnt
], family
, "output");
724 r
= sd_nfnl_message_new_masq_rule(nfnl
, &batch
[msgcnt
], family
, "postrouting");
729 assert(msgcnt
< NFT_INIT_MSGS
);
730 r
= sd_nfnl_message_batch_end(nfnl
, &batch
[msgcnt
]);
735 assert(msgcnt
<= NFT_INIT_MSGS
);
736 r
= nfnl_netlink_sendv(nfnl
, batch
, msgcnt
);
741 for (i
= 0; i
< msgcnt
; i
++)
742 sd_netlink_message_unref(batch
[i
]);
747 int fw_nftables_init(FirewallContext
*ctx
) {
748 _cleanup_(sd_netlink_unrefp
) sd_netlink
*nfnl
= NULL
;
751 r
= sd_nfnl_socket_open(&nfnl
);
755 r
= fw_nftables_init_family(nfnl
, AF_INET
);
759 if (socket_ipv6_is_supported()) {
760 r
= fw_nftables_init_family(nfnl
, AF_INET6
);
762 log_debug_errno(r
, "Failed to init ipv6 NAT: %m");
765 ctx
->nfnl
= TAKE_PTR(nfnl
);
769 void fw_nftables_exit(FirewallContext
*ctx
) {
770 ctx
->nfnl
= sd_netlink_unref(ctx
->nfnl
);
773 static int nft_message_add_setelem_iprange(sd_netlink_message
*m
,
774 const union in_addr_union
*source
,
775 unsigned int prefixlen
) {
776 uint32_t mask
, start
, end
;
780 assert(prefixlen
<= 32);
781 nplen
= 32 - prefixlen
;
783 mask
= (1U << nplen
) - 1U;
784 mask
= htobe32(~mask
);
785 start
= source
->in
.s_addr
& mask
;
787 r
= sd_nfnl_nft_message_add_setelem(m
, 0, &start
, sizeof(start
), NULL
, 0);
791 r
= sd_nfnl_nft_message_add_setelem_end(m
);
795 end
= be32toh(start
) + (1U << nplen
);
796 if (end
< be32toh(start
))
800 r
= sd_nfnl_nft_message_add_setelem(m
, 1, &end
, sizeof(end
), NULL
, 0);
804 r
= sd_netlink_message_append_u32(m
, NFTA_SET_ELEM_FLAGS
, htobe32(NFT_SET_ELEM_INTERVAL_END
));
808 r
= sd_nfnl_nft_message_add_setelem_end(m
);
815 static int nft_message_add_setelem_ip6range(
816 sd_netlink_message
*m
,
817 const union in_addr_union
*source
,
818 unsigned int prefixlen
) {
820 union in_addr_union start
, end
;
823 r
= in_addr_prefix_range(AF_INET6
, source
, prefixlen
, &start
, &end
);
827 r
= sd_nfnl_nft_message_add_setelem(m
, 0, &start
.in6
, sizeof(start
.in6
), NULL
, 0);
831 r
= sd_nfnl_nft_message_add_setelem_end(m
);
835 r
= sd_nfnl_nft_message_add_setelem(m
, 1, &end
.in6
, sizeof(end
.in6
), NULL
, 0);
839 r
= sd_netlink_message_append_u32(m
, NFTA_SET_ELEM_FLAGS
, htobe32(NFT_SET_ELEM_INTERVAL_END
));
843 return sd_nfnl_nft_message_add_setelem_end(m
);
846 #define NFT_MASQ_MSGS 3
848 static int fw_nftables_add_masquerade_internal(
849 FirewallContext
*ctx
,
852 const union in_addr_union
*source
,
853 unsigned int source_prefixlen
) {
855 sd_netlink_message
*transaction
[NFT_MASQ_MSGS
] = {};
859 if (!source
|| source_prefixlen
== 0)
862 if (af
== AF_INET6
&& source_prefixlen
< 8)
865 r
= sd_nfnl_message_batch_begin(ctx
->nfnl
, &transaction
[0]);
870 r
= sd_nfnl_nft_message_new_setelems_begin(ctx
->nfnl
, &transaction
[tsize
], af
, NFT_SYSTEMD_TABLE_NAME
, NFT_SYSTEMD_MASQ_SET_NAME
);
872 r
= sd_nfnl_nft_message_del_setelems_begin(ctx
->nfnl
, &transaction
[tsize
], af
, NFT_SYSTEMD_TABLE_NAME
, NFT_SYSTEMD_MASQ_SET_NAME
);
877 r
= nft_message_add_setelem_iprange(transaction
[tsize
], source
, source_prefixlen
);
879 r
= nft_message_add_setelem_ip6range(transaction
[tsize
], source
, source_prefixlen
);
884 assert(tsize
< NFT_MASQ_MSGS
);
885 r
= sd_nfnl_message_batch_end(ctx
->nfnl
, &transaction
[tsize
]);
890 r
= nfnl_netlink_sendv(ctx
->nfnl
, transaction
, tsize
);
894 sd_netlink_message_unref(transaction
[--tsize
]);
895 return r
< 0 ? r
: 0;
898 int fw_nftables_add_masquerade(
899 FirewallContext
*ctx
,
902 const union in_addr_union
*source
,
903 unsigned int source_prefixlen
) {
907 if (!socket_ipv6_is_supported() && af
== AF_INET6
)
910 r
= fw_nftables_add_masquerade_internal(ctx
, add
, af
, source
, source_prefixlen
);
914 /* When someone runs 'nft flush ruleset' in the same net namespace this will also tear down the
917 * Unlike iptables -t nat -F (which will remove all rules added by the systemd iptables
918 * backend), iptables has builtin chains that cannot be deleted -- the next add operation will
921 * In the nftables case, everything gets removed. The next add operation will yield -ENOENT.
923 * If we see -ENOENT on add, replay the initial table setup. If that works, re-do the add
926 * Note that this doesn't protect against external sabotage such as a
927 * 'while true; nft flush ruleset; done'. There is nothing that could be done about that short
928 * of extending the kernel to allow tables to be owned by systemd-networkd and making them
929 * non-deleteable except by the 'owning process'. */
931 r
= fw_nftables_init_family(ctx
->nfnl
, af
);
935 return fw_nftables_add_masquerade_internal(ctx
, add
, af
, source
, source_prefixlen
);
938 #define NFT_DNAT_MSGS 4
940 static int fw_nftables_add_local_dnat_internal(
941 FirewallContext
*ctx
,
946 const union in_addr_union
*remote
,
947 uint16_t remote_port
,
948 const union in_addr_union
*previous_remote
) {
950 sd_netlink_message
*transaction
[NFT_DNAT_MSGS
] = {};
951 static bool ipv6_supported
= true;
952 uint32_t data
[5], key
[2], dlen
;
956 assert(add
|| !previous_remote
);
958 if (!ipv6_supported
&& af
== AF_INET6
)
961 if (!IN_SET(protocol
, IPPROTO_TCP
, IPPROTO_UDP
))
962 return -EPROTONOSUPPORT
;
968 key
[1] = htobe16(local_port
);
973 if (remote_port
<= 0)
978 data
[1] = htobe16(remote_port
);
980 assert(af
== AF_INET6
);
982 data
[4] = htobe16(remote_port
);
985 r
= sd_nfnl_message_batch_begin(ctx
->nfnl
, &transaction
[0]);
990 /* If a previous remote is set, remove its entry */
991 if (add
&& previous_remote
&& !in_addr_equal(af
, previous_remote
, remote
)) {
993 data
[0] = previous_remote
->in
.s_addr
;
995 memcpy(data
, &previous_remote
->in6
, sizeof(previous_remote
->in6
));
997 r
= nft_del_element(ctx
->nfnl
, &transaction
[tsize
], af
, NFT_SYSTEMD_DNAT_MAP_NAME
, key
, sizeof(key
), data
, dlen
);
1005 data
[0] = remote
->in
.s_addr
;
1007 memcpy(data
, &remote
->in6
, sizeof(remote
->in6
));
1009 assert(tsize
< NFT_DNAT_MSGS
);
1011 r
= nft_add_element(ctx
->nfnl
, &transaction
[tsize
], af
, NFT_SYSTEMD_DNAT_MAP_NAME
, key
, sizeof(key
), data
, dlen
);
1013 r
= nft_del_element(ctx
->nfnl
, &transaction
[tsize
], af
, NFT_SYSTEMD_DNAT_MAP_NAME
, key
, sizeof(key
), data
, dlen
);
1018 assert(tsize
< NFT_DNAT_MSGS
);
1020 r
= sd_nfnl_message_batch_end(ctx
->nfnl
, &transaction
[tsize
]);
1025 assert(tsize
<= NFT_DNAT_MSGS
);
1027 r
= nfnl_netlink_sendv(ctx
->nfnl
, transaction
, tsize
);
1028 if (r
== -EOVERFLOW
&& af
== AF_INET6
) {
1029 /* The current implementation of DNAT in systemd requires kernel's
1030 * fdb9c405e35bdc6e305b9b4e20ebc141ed14fc81 (v5.8), and the older kernel returns
1031 * -EOVERFLOW. Let's treat the error as -EOPNOTSUPP. */
1032 log_debug_errno(r
, "The current implementation of IPv6 DNAT in systemd requires kernel 5.8 or newer, ignoring: %m");
1033 ipv6_supported
= false;
1039 sd_netlink_message_unref(transaction
[--tsize
]);
1041 return r
< 0 ? r
: 0;
1044 int fw_nftables_add_local_dnat(
1045 FirewallContext
*ctx
,
1049 uint16_t local_port
,
1050 const union in_addr_union
*remote
,
1051 uint16_t remote_port
,
1052 const union in_addr_union
*previous_remote
) {
1056 if (!socket_ipv6_is_supported() && af
== AF_INET6
)
1059 r
= fw_nftables_add_local_dnat_internal(ctx
, add
, af
, protocol
, local_port
, remote
, remote_port
, previous_remote
);
1063 /* See comment in fw_nftables_add_masquerade(). */
1064 r
= fw_nftables_init_family(ctx
->nfnl
, af
);
1068 /* table created anew; previous address already gone */
1069 return fw_nftables_add_local_dnat_internal(ctx
, add
, af
, protocol
, local_port
, remote
, remote_port
, NULL
);