]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/shared/firewall-util-nft.c
Merge pull request #19768 from poettering/homectl-fido2-lock-with
[thirdparty/systemd.git] / src / shared / firewall-util-nft.c
CommitLineData
7a6eb60b 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
715a70e7
FW
2
3#include <arpa/inet.h>
4#include <endian.h>
5#include <errno.h>
6#include <stddef.h>
7#include <string.h>
8#include <linux/netfilter/nf_tables.h>
9#include <linux/netfilter/nf_nat.h>
10#include <linux/netfilter_ipv4.h>
11#include <netinet/ip.h>
0e544221 12#include <netinet/ip6.h>
715a70e7
FW
13
14#include "sd-netlink.h"
15
16#include "alloc-util.h"
17#include "firewall-util.h"
18#include "firewall-util-private.h"
19#include "in-addr-util.h"
20#include "macro.h"
21#include "socket-util.h"
22#include "time-util.h"
23
/* Name of the map consulted by the DNAT rules; its key is 'l4proto . dport' and its value is the
 * address and port to redirect to. */
24#define NFT_SYSTEMD_DNAT_MAP_NAME "map_port_ipport"
/* Name of the nftables table that holds every chain, set, map and rule created by this file. */
25#define NFT_SYSTEMD_TABLE_NAME "io.systemd.nat"
/* Name of the set of source address ranges that the postrouting rule masquerades. */
26#define NFT_SYSTEMD_MASQ_SET_NAME "masq_saddr"
27
/* Timeout handed to sd_netlink_read() when waiting for the reply to a sent message. */
28#define NFNL_DEFAULT_TIMEOUT_USECS (1ULL * USEC_PER_SEC)
29
/* Byte offset into the transport header from which the 16 bit destination port is loaded. The
 * same offset is used for the 'tcp . 22' style keys mentioned in the rule comments. */
30#define UDP_DPORT_OFFSET 2
31
32static int nfnl_netlink_sendv(sd_netlink *nfnl,
33 sd_netlink_message *messages[],
34 size_t msgcount) {
35 _cleanup_free_ uint32_t *serial = NULL;
36 size_t i;
37 int r;
38
39 assert(msgcount > 0);
40
41 r = sd_netlink_sendv(nfnl, messages, msgcount, &serial);
42 if (r < 0)
43 return r;
44
45 r = 0;
46 for (i = 1; i < msgcount - 1; i++) {
47 int tmp;
48
49 /* If message is an error, this returns embedded errno */
50 tmp = sd_netlink_read(nfnl, serial[i], NFNL_DEFAULT_TIMEOUT_USECS, NULL);
51 if (tmp < 0 && r == 0)
52 r = tmp;
53 }
54
55 return r;
56}
57
58static int nfnl_add_open_expr_container(sd_netlink_message *m, const char *name) {
59 int r;
60
61 r = sd_netlink_message_open_array(m, NFTA_LIST_ELEM);
62 if (r < 0)
63 return r;
64
65 r = sd_netlink_message_append_string(m, NFTA_EXPR_NAME, name);
66 if (r < 0)
67 return r;
68
69 return sd_netlink_message_open_container_union(m, NFTA_EXPR_DATA, name);
70}
71
72static int nfnl_add_expr_fib(sd_netlink_message *m, uint32_t nft_fib_flags,
73 enum nft_fib_result result,
74 enum nft_registers dreg) {
75 int r;
76
77 r = nfnl_add_open_expr_container(m, "fib");
78 if (r < 0)
79 return r;
80
81 r = sd_netlink_message_append_u32(m, NFTA_FIB_FLAGS, htobe32(nft_fib_flags));
82 if (r < 0)
83 return r;
84 r = sd_netlink_message_append_u32(m, NFTA_FIB_RESULT, htobe32(result));
85 if (r < 0)
86 return r;
87 r = sd_netlink_message_append_u32(m, NFTA_FIB_DREG, htobe32(dreg));
88 if (r < 0)
89 return r;
90
91 r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
92 if (r < 0)
93 return r;
94
95 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
96}
97
98static int nfnl_add_expr_meta(sd_netlink_message *m, enum nft_meta_keys key,
99 enum nft_registers dreg) {
100 int r;
101
102 r = nfnl_add_open_expr_container(m, "meta");
103 if (r < 0)
104 return r;
105
106 r = sd_netlink_message_append_u32(m, NFTA_META_KEY, htobe32(key));
107 if (r < 0)
108 return r;
109
110 r = sd_netlink_message_append_u32(m, NFTA_META_DREG, htobe32(dreg));
111 if (r < 0)
112 return r;
113
114 r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
115 if (r < 0)
116 return r;
117
118 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
119}
120
121static int nfnl_add_expr_payload(sd_netlink_message *m, enum nft_payload_bases pb,
122 uint32_t offset, uint32_t len, enum nft_registers dreg) {
123 int r;
124
125 r = nfnl_add_open_expr_container(m, "payload");
126 if (r < 0)
127 return r;
128
129 r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_DREG, htobe32(dreg));
130 if (r < 0)
131 return r;
132 r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_BASE, htobe32(pb));
133 if (r < 0)
134 return r;
135 r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_OFFSET, htobe32(offset));
136 if (r < 0)
137 return r;
138 r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_LEN, htobe32(len));
139 if (r < 0)
140 return r;
141
142 r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
143 if (r < 0)
144 return r;
145 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
146}
147
148static int nfnl_add_expr_lookup_set_data(sd_netlink_message *m, const char *set_name,
149 enum nft_registers sreg) {
150 int r;
151
152 r = nfnl_add_open_expr_container(m, "lookup");
153 if (r < 0)
154 return r;
155
156 r = sd_netlink_message_append_string(m, NFTA_LOOKUP_SET, set_name);
157 if (r < 0)
158 return r;
159
160 return sd_netlink_message_append_u32(m, NFTA_LOOKUP_SREG, htobe32(sreg));
161}
162
163static int nfnl_add_expr_lookup_set(sd_netlink_message *m, const char *set_name,
164 enum nft_registers sreg) {
165 int r;
166
167 r = nfnl_add_expr_lookup_set_data(m, set_name, sreg);
168 if (r < 0)
169 return r;
170
171 r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
172 if (r < 0)
173 return r;
174 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
175}
176
177static int nfnl_add_expr_lookup_map(sd_netlink_message *m, const char *set_name,
178 enum nft_registers sreg, enum nft_registers dreg) {
179 int r;
180
181 r = nfnl_add_expr_lookup_set_data(m, set_name, sreg);
182 if (r < 0)
183 return r;
184
185 r = sd_netlink_message_append_u32(m, NFTA_LOOKUP_DREG, htobe32(dreg));
186 if (r < 0)
187 return r;
188
189 r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
190 if (r < 0)
191 return r;
192
193 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
194}
195
196static int nfnl_add_expr_data(sd_netlink_message *m, int attr, const void *data, uint32_t dlen) {
197 int r;
198
199 r = sd_netlink_message_open_container(m, attr);
200 if (r < 0)
201 return r;
202 r = sd_netlink_message_append_data(m, NFTA_DATA_VALUE, data, dlen);
203 if (r < 0)
204 return r;
205
206 return sd_netlink_message_close_container(m); /* attr */
207}
208
209static int nfnl_add_expr_cmp_data(sd_netlink_message *m, const void *data, uint32_t dlen) {
210 return nfnl_add_expr_data(m, NFTA_CMP_DATA, data, dlen);
211}
212
213static int nfnl_add_expr_cmp(sd_netlink_message *m, enum nft_cmp_ops cmp_op,
214 enum nft_registers sreg, const void *data, uint32_t dlen) {
215 int r;
216
217 r = nfnl_add_open_expr_container(m, "cmp");
218 if (r < 0)
219 return r;
220
221 r = sd_netlink_message_append_u32(m, NFTA_CMP_OP, htobe32(cmp_op));
222 if (r < 0)
223 return r;
224 r = sd_netlink_message_append_u32(m, NFTA_CMP_SREG, htobe32(sreg));
225 if (r < 0)
226 return r;
227
228 r = nfnl_add_expr_cmp_data(m, data, dlen);
229 if (r < 0)
230 return r;
231
232 r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
233 if (r < 0)
234 return r;
235 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
236}
237
238static int nfnl_add_expr_bitwise(sd_netlink_message *m,
239 enum nft_registers sreg,
240 enum nft_registers dreg,
241 const void *and,
242 const void *xor, uint32_t len) {
243 int r;
244
245 r = nfnl_add_open_expr_container(m, "bitwise");
246 if (r < 0)
247 return r;
248
249 r = sd_netlink_message_append_u32(m, NFTA_BITWISE_SREG, htobe32(sreg));
250 if (r < 0)
251 return r;
252 r = sd_netlink_message_append_u32(m, NFTA_BITWISE_DREG, htobe32(dreg));
253 if (r < 0)
254 return r;
255 r = sd_netlink_message_append_u32(m, NFTA_BITWISE_LEN, htobe32(len));
256 if (r < 0)
257 return r;
258
259 r = nfnl_add_expr_data(m, NFTA_BITWISE_MASK, and, len);
260 if (r < 0)
261 return r;
262
263 r = nfnl_add_expr_data(m, NFTA_BITWISE_XOR, xor, len);
264 if (r < 0)
265 return r;
266
267 r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
268 if (r < 0)
269 return r;
270 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
271}
272
273static int nfnl_add_expr_dnat(sd_netlink_message *m,
274 int family,
275 enum nft_registers areg,
276 enum nft_registers preg) {
277 int r;
278
279 r = nfnl_add_open_expr_container(m, "nat");
280 if (r < 0)
281 return r;
282
283 r = sd_netlink_message_append_u32(m, NFTA_NAT_TYPE, htobe32(NFT_NAT_DNAT));
284 if (r < 0)
285 return r;
286
287 r = sd_netlink_message_append_u32(m, NFTA_NAT_FAMILY, htobe32(family));
288 if (r < 0)
289 return r;
290
291 r = sd_netlink_message_append_u32(m, NFTA_NAT_REG_ADDR_MIN, htobe32(areg));
292 if (r < 0)
293 return r;
294 r = sd_netlink_message_append_u32(m, NFTA_NAT_REG_PROTO_MIN, htobe32(preg));
295 if (r < 0)
296 return r;
297 r = sd_netlink_message_close_container(m);
298 if (r < 0)
299 return r;
300
301 return sd_netlink_message_close_container(m);
302}
303
304static int nfnl_add_expr_masq(sd_netlink_message *m) {
305 int r;
306
307 r = sd_netlink_message_open_array(m, NFTA_LIST_ELEM);
308 if (r < 0)
309 return r;
310
311 r = sd_netlink_message_append_string(m, NFTA_EXPR_NAME, "masq");
312 if (r < 0)
313 return r;
314
315 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
316}
317
715a70e7
FW
318static int sd_nfnl_message_new_masq_rule(sd_netlink *nfnl, sd_netlink_message **ret, int family,
319 const char *chain) {
320 _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
321 int r;
322
45861042
YW
323 /* -t nat -A POSTROUTING -p protocol -s source/pflen -o out_interface -d destination/pflen -j MASQUERADE */
324
715a70e7
FW
325 r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
326 if (r < 0)
327 return r;
328
329 r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
330 if (r < 0)
331 return r;
332
0e544221
FW
333 /* 1st statement: ip saddr @masq_saddr. Place iph->saddr in reg1, resp. ipv6 in reg1..reg4. */
334 if (family == AF_INET)
335 r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct iphdr, saddr),
336 sizeof(uint32_t), NFT_REG32_01);
337 else
338 r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct ip6_hdr, ip6_src.s6_addr),
339 sizeof(struct in6_addr), NFT_REG32_01);
715a70e7
FW
340 if (r < 0)
341 return r;
342
343 /* 1st statement: use reg1 content to make lookup in @masq_saddr set. */
344 r = nfnl_add_expr_lookup_set(m, NFT_SYSTEMD_MASQ_SET_NAME, NFT_REG32_01);
345 if (r < 0)
346 return r;
347
348 /* 2nd statement: masq. Only executed by kernel if the previous lookup was successful. */
349 r = nfnl_add_expr_masq(m);
350 if (r < 0)
351 return r;
352
353 r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
354 if (r < 0)
355 return r;
356 *ret = TAKE_PTR(m);
357 return 0;
358}
359
715a70e7
FW
360static int sd_nfnl_message_new_dnat_rule_pre(sd_netlink *nfnl, sd_netlink_message **ret, int family,
361 const char *chain) {
362 _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
363 enum nft_registers proto_reg;
364 uint32_t local = RTN_LOCAL;
365 int r;
366
45861042
YW
367 /* -t nat -A PREROUTING -p protocol --dport local_port -i in_interface -s source/pflen
368 * -d destination/pflen -j DNAT --to-destination remote_addr:remote_port */
369
715a70e7
FW
370 r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
371 if (r < 0)
372 return r;
373
374 r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
375 if (r < 0)
376 return r;
377
378 /* 1st statement: fib daddr type local */
379 r = nfnl_add_expr_fib(m, NFTA_FIB_F_DADDR, NFT_FIB_RESULT_ADDRTYPE, NFT_REG32_01);
380 if (r < 0)
381 return r;
382
383 /* 1st statement (cont.): compare RTN_LOCAL */
384 r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &local, sizeof(local));
385 if (r < 0)
386 return r;
387
388 /* 2nd statement: lookup local port in map, fetch address:dport to map to */
389 r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01);
390 if (r < 0)
391 return r;
392
393 r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET,
394 sizeof(uint16_t), NFT_REG32_02);
395 if (r < 0)
396 return r;
397
398 /* 3rd statement: lookup 'l4proto . dport', e.g. 'tcp . 22' as key and
399 * store address and port for the dnat mapping in REG1/REG2.
400 */
401 r = nfnl_add_expr_lookup_map(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01);
402 if (r < 0)
403 return r;
404
0e544221 405 proto_reg = family == AF_INET ? NFT_REG32_02 : NFT_REG32_05;
715a70e7
FW
406 r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, proto_reg);
407 if (r < 0)
408 return r;
409
410 r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
411 if (r < 0)
412 return r;
413 *ret = TAKE_PTR(m);
414 return 0;
415}
416
417static int sd_nfnl_message_new_dnat_rule_out(sd_netlink *nfnl, sd_netlink_message **ret,
418 int family, const char *chain) {
0e544221 419 static const uint32_t zero = 0, one = 1;
715a70e7 420
715a70e7
FW
421 _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
422 enum nft_registers proto_reg;
423 int r;
424
425 r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
426 if (r < 0)
427 return r;
428
429 r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
430 if (r < 0)
431 return r;
432
0e544221
FW
433 /* 1st statement: exclude 127.0.0.1/8: ip daddr != 127.0.0.1/8, resp. avoid ::1 */
434 if (family == AF_INET) {
435 uint32_t lonet = htobe32(UINT32_C(0x7F000000)), lomask = htobe32(UINT32_C(0xff000000));
715a70e7 436
0e544221
FW
437 r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct iphdr, daddr),
438 sizeof(lonet), NFT_REG32_01);
439 if (r < 0)
440 return r;
441 /* 1st statement (cont.): bitops/prefix */
442 r = nfnl_add_expr_bitwise(m, NFT_REG32_01, NFT_REG32_01, &lomask, &zero, sizeof(lomask));
443 if (r < 0)
444 return r;
715a70e7 445
0e544221
FW
446 /* 1st statement (cont.): compare reg1 with 127/8 */
447 r = nfnl_add_expr_cmp(m, NFT_CMP_NEQ, NFT_REG32_01, &lonet, sizeof(lonet));
448 } else {
449 struct in6_addr loaddr = IN6ADDR_LOOPBACK_INIT;
715a70e7 450
0e544221
FW
451 r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct ip6_hdr, ip6_dst.s6_addr),
452 sizeof(loaddr), NFT_REG32_01);
453 if (r < 0)
454 return r;
455
456 r = nfnl_add_expr_cmp(m, NFT_CMP_NEQ, NFT_REG32_01, &loaddr, sizeof(loaddr));
457 }
715a70e7
FW
458 if (r < 0)
459 return r;
460
461 /* 2nd statement: meta oif lo */
462 r = nfnl_add_expr_meta(m, NFT_META_OIF, NFT_REG32_01);
463 if (r < 0)
464 return r;
465
466 /* 2nd statement (cont.): compare to lo ifindex (1) */
467 r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &one, sizeof(one));
468 if (r < 0)
469 return r;
470
471 /* 3rd statement: meta l4proto . th dport dnat ip . port to map @map_port_ipport */
472 r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01);
473 if (r < 0)
474 return r;
475
476 /* 3rd statement (cont): store the port number in reg2 */
477 r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET,
478 sizeof(uint16_t), NFT_REG32_02);
479 if (r < 0)
480 return r;
481
482 /* 3rd statement (cont): use reg1 and reg2 and retrieve
483 * the new destination ip and port number.
484 *
485 * reg1 and reg2 are clobbered and will then contain the new
486 * address/port number.
487 */
488 r = nfnl_add_expr_lookup_map(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01);
489 if (r < 0)
490 return r;
491
492 /* 4th statement: dnat connection to address/port retrieved by the
45861042 493 * preceding expression. */
0e544221 494 proto_reg = family == AF_INET ? NFT_REG32_02 : NFT_REG32_05;
715a70e7
FW
495 r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, proto_reg);
496 if (r < 0)
497 return r;
498
499 r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
500 if (r < 0)
501 return r;
502 *ret = TAKE_PTR(m);
503 return 0;
504}
505
506static int nft_new_set(struct sd_netlink *nfnl,
507 sd_netlink_message **ret,
508 int family, const char *set_name,
509 uint32_t set_id,
510 uint32_t flags, uint32_t type, uint32_t klen) {
511 _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
512 int r;
513
514 r = sd_nfnl_nft_message_new_set(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name, set_id, klen);
515 if (r < 0)
516 return r;
517
518 if (flags != 0) {
519 r = sd_netlink_message_append_u32(m, NFTA_SET_FLAGS, htobe32(flags));
520 if (r < 0)
521 return r;
522 }
523
524 r = sd_netlink_message_append_u32(m, NFTA_SET_KEY_TYPE, htobe32(type));
525 if (r < 0)
526 return r;
527
528 *ret = TAKE_PTR(m);
529 return r;
530}
531
532static int nft_new_map(struct sd_netlink *nfnl,
533 sd_netlink_message **ret,
534 int family, const char *set_name, uint32_t set_id,
535 uint32_t flags, uint32_t type, uint32_t klen, uint32_t dtype, uint32_t dlen) {
536 _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
537 int r;
538
539 r = nft_new_set(nfnl, &m, family, set_name, set_id, flags | NFT_SET_MAP, type, klen);
540 if (r < 0)
541 return r;
542
543 r = sd_netlink_message_append_u32(m, NFTA_SET_DATA_TYPE, htobe32(dtype));
544 if (r < 0)
545 return r;
546
547 r = sd_netlink_message_append_u32(m, NFTA_SET_DATA_LEN, htobe32(dlen));
548 if (r < 0)
549 return r;
550 *ret = TAKE_PTR(m);
551 return 0;
552}
553
554static int nft_add_element(sd_netlink *nfnl, sd_netlink_message **ret,
555 int family, const char *set_name,
556 const void *key, uint32_t klen,
557 const void *data, uint32_t dlen) {
558 _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
559 int r;
560
561 /*
562 * Ideally there would be an API that provides:
563 *
564 * 1) a init function to add the main ruleset skeleton
565 * 2) a function that populates the sets with all known address/port pairs to s/dnat for
566 * 3) a function that can remove address/port pairs again.
567 *
568 * At this time, the existing API is used which is built on a
569 * 'add/delete a rule' paradigm.
570 *
571 * This replicated here and each element gets added to the set
572 * one-by-one.
573 */
574 r = sd_nfnl_nft_message_new_setelems_begin(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name);
575 if (r < 0)
576 return r;
577
578 r = sd_nfnl_nft_message_add_setelem(m, 0, key, klen, data, dlen);
579 if (r < 0)
580 return r;
581
582 /* could theoretically append more set elements to add here */
583 r = sd_nfnl_nft_message_add_setelem_end(m);
584 if (r < 0)
585 return r;
586 *ret = TAKE_PTR(m);
587 return 0;
588}
589
590static int nft_del_element(sd_netlink *nfnl,
591 sd_netlink_message **ret, int family, const char *set_name,
592 const void *key, uint32_t klen,
593 const void *data, uint32_t dlen) {
594 _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
595 int r;
596
597 r = sd_nfnl_nft_message_del_setelems_begin(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name);
598 if (r < 0)
599 return r;
600
601 r = sd_nfnl_nft_message_add_setelem(m, 0, key, klen, data, dlen);
602 if (r < 0)
603 return r;
604
605 r = sd_nfnl_nft_message_add_setelem_end(m);
606 if (r < 0)
607 return r;
608 *ret = TAKE_PTR(m);
609 return 0;
610}
611
/* This is needed so 'nft' userspace tool can properly format the contents
 * of the set/map when someone uses 'nft' to inspect their content.
 *
 * The values cannot be changed, they are part of the nft tool type identifier ABI.
 */
#define TYPE_BITS 6

enum nft_key_types {
        TYPE_IPADDR = 7,
        TYPE_IP6ADDR = 8,
        TYPE_INET_PROTOCOL = 12,
        TYPE_INET_SERVICE = 13,
};

/* Combines two type identifiers into the identifier of their concatenation 'a . b': 'a' is
 * shifted left by TYPE_BITS and 'b' is OR-ed into the result. */
static uint32_t concat_types2(enum nft_key_types a, enum nft_key_types b) {
        return ((uint32_t) a << TYPE_BITS) | (uint32_t) b;
}
634
635/* enough space to hold netlink messages for table skeleton */
636#define NFT_INIT_MSGS 16
637static int fw_nftables_init_family(sd_netlink *nfnl, int family) {
638 sd_netlink_message *batch[NFT_INIT_MSGS] = {};
0e544221 639 size_t msgcnt = 0, i, ip_type_size;
715a70e7 640 uint32_t set_id = 0;
0e544221
FW
641 int ip_type, r;
642
643 assert(IN_SET(family, AF_INET, AF_INET6));
715a70e7
FW
644
645 r = sd_nfnl_message_batch_begin(nfnl, &batch[msgcnt]);
646 if (r < 0)
647 goto out_unref;
648
649 msgcnt++;
650 assert(msgcnt < NFT_INIT_MSGS);
651 /* Set F_EXCL so table add fails if the table already exists. */
652 r = sd_nfnl_nft_message_new_table(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME, NLM_F_EXCL | NLM_F_ACK);
653 if (r < 0)
654 goto out_unref;
655
656 msgcnt++;
657 assert(msgcnt < NFT_INIT_MSGS);
658
659 r = sd_nfnl_nft_message_new_basechain(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME,
660 "prerouting", "nat",
661 NF_INET_PRE_ROUTING, NF_IP_PRI_NAT_DST + 1);
662 if (r < 0)
663 goto out_unref;
664
665 msgcnt++;
666 assert(msgcnt < NFT_INIT_MSGS);
667 r = sd_nfnl_nft_message_new_basechain(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME,
668 "output", "nat",
669 NF_INET_LOCAL_OUT, NF_IP_PRI_NAT_DST + 1);
670 if (r < 0)
671 goto out_unref;
672
673 msgcnt++;
674 assert(msgcnt < NFT_INIT_MSGS);
675 r = sd_nfnl_nft_message_new_basechain(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_TABLE_NAME,
676 "postrouting", "nat",
677 NF_INET_POST_ROUTING, NF_IP_PRI_NAT_SRC + 1);
678 if (r < 0)
679 goto out_unref;
680
0e544221
FW
681 if (family == AF_INET) {
682 ip_type_size = sizeof(uint32_t);
683 ip_type = TYPE_IPADDR;
684 } else {
685 assert(family == AF_INET6);
686 ip_type_size = sizeof(struct in6_addr);
687 ip_type = TYPE_IP6ADDR;
688 }
715a70e7
FW
689 msgcnt++;
690 assert(msgcnt < NFT_INIT_MSGS);
691 /* set to store ip address ranges we should masquerade for */
692 r = nft_new_set(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_MASQ_SET_NAME, ++set_id, NFT_SET_INTERVAL, ip_type, ip_type_size);
693 if (r < 0)
694 goto out_unref;
695
696 /*
697 * map to store ip address:port pair to dnat to. elements in concatenation
698 * are rounded up to 4 bytes.
699 *
700 * Example: ip protocol . tcp daddr is sizeof(uint32_t) + sizeof(uint32_t), not
701 * sizeof(uint8_t) + sizeof(uint16_t).
702 */
703 msgcnt++;
704 assert(msgcnt < NFT_INIT_MSGS);
705 r = nft_new_map(nfnl, &batch[msgcnt], family, NFT_SYSTEMD_DNAT_MAP_NAME, ++set_id, 0,
706 concat_types2(TYPE_INET_PROTOCOL, TYPE_INET_SERVICE), sizeof(uint32_t) * 2,
707 concat_types2(ip_type, TYPE_INET_SERVICE), ip_type_size + sizeof(uint32_t));
708 if (r < 0)
709 goto out_unref;
710
711 msgcnt++;
712 assert(msgcnt < NFT_INIT_MSGS);
713 r = sd_nfnl_message_new_dnat_rule_pre(nfnl, &batch[msgcnt], family, "prerouting");
714 if (r < 0)
715 goto out_unref;
716
717 msgcnt++;
718 assert(msgcnt < NFT_INIT_MSGS);
719 r = sd_nfnl_message_new_dnat_rule_out(nfnl, &batch[msgcnt], family, "output");
720 if (r < 0)
721 goto out_unref;
722
723 msgcnt++;
724 r = sd_nfnl_message_new_masq_rule(nfnl, &batch[msgcnt], family, "postrouting");
725 if (r < 0)
726 goto out_unref;
727
728 msgcnt++;
729 assert(msgcnt < NFT_INIT_MSGS);
730 r = sd_nfnl_message_batch_end(nfnl, &batch[msgcnt]);
731 if (r < 0)
732 goto out_unref;
733
734 msgcnt++;
735 assert(msgcnt <= NFT_INIT_MSGS);
736 r = nfnl_netlink_sendv(nfnl, batch, msgcnt);
737 if (r == -EEXIST)
738 r = 0;
739
740out_unref:
741 for (i = 0; i < msgcnt; i++)
742 sd_netlink_message_unref(batch[i]);
743
744 return r;
745}
746
747int fw_nftables_init(FirewallContext *ctx) {
748 _cleanup_(sd_netlink_unrefp) sd_netlink *nfnl = NULL;
749 int r;
750
751 r = sd_nfnl_socket_open(&nfnl);
752 if (r < 0)
753 return r;
754
755 r = fw_nftables_init_family(nfnl, AF_INET);
756 if (r < 0)
757 return r;
758
0c4363a0
YW
759 if (socket_ipv6_is_supported()) {
760 r = fw_nftables_init_family(nfnl, AF_INET6);
761 if (r < 0)
762 log_debug_errno(r, "Failed to init ipv6 NAT: %m");
763 }
0e544221 764
715a70e7
FW
765 ctx->nfnl = TAKE_PTR(nfnl);
766 return 0;
767}
768
769void fw_nftables_exit(FirewallContext *ctx) {
770 ctx->nfnl = sd_netlink_unref(ctx->nfnl);
771}
772
773static int nft_message_add_setelem_iprange(sd_netlink_message *m,
774 const union in_addr_union *source,
775 unsigned int prefixlen) {
776 uint32_t mask, start, end;
777 unsigned int nplen;
778 int r;
779
780 assert(prefixlen <= 32);
781 nplen = 32 - prefixlen;
782
783 mask = (1U << nplen) - 1U;
784 mask = htobe32(~mask);
785 start = source->in.s_addr & mask;
786
787 r = sd_nfnl_nft_message_add_setelem(m, 0, &start, sizeof(start), NULL, 0);
788 if (r < 0)
789 return r;
790
791 r = sd_nfnl_nft_message_add_setelem_end(m);
792 if (r < 0)
793 return r;
794
795 end = be32toh(start) + (1U << nplen);
796 if (end < be32toh(start))
797 end = 0U;
798 end = htobe32(end);
799
800 r = sd_nfnl_nft_message_add_setelem(m, 1, &end, sizeof(end), NULL, 0);
801 if (r < 0)
802 return r;
803
804 r = sd_netlink_message_append_u32(m, NFTA_SET_ELEM_FLAGS, htobe32(NFT_SET_ELEM_INTERVAL_END));
805 if (r < 0)
806 return r;
807
808 r = sd_nfnl_nft_message_add_setelem_end(m);
809 if (r < 0)
810 return r;
811
812 return 0;
813}
814
99975074
YW
815static int nft_message_add_setelem_ip6range(
816 sd_netlink_message *m,
817 const union in_addr_union *source,
818 unsigned int prefixlen) {
0e544221 819
99975074 820 union in_addr_union start, end;
0e544221
FW
821 int r;
822
99975074
YW
823 r = in_addr_prefix_range(AF_INET6, source, prefixlen, &start, &end);
824 if (r < 0)
825 return r;
0e544221 826
99975074 827 r = sd_nfnl_nft_message_add_setelem(m, 0, &start.in6, sizeof(start.in6), NULL, 0);
0e544221
FW
828 if (r < 0)
829 return r;
830
831 r = sd_nfnl_nft_message_add_setelem_end(m);
832 if (r < 0)
833 return r;
834
99975074 835 r = sd_nfnl_nft_message_add_setelem(m, 1, &end.in6, sizeof(end.in6), NULL, 0);
0e544221
FW
836 if (r < 0)
837 return r;
838
839 r = sd_netlink_message_append_u32(m, NFTA_SET_ELEM_FLAGS, htobe32(NFT_SET_ELEM_INTERVAL_END));
840 if (r < 0)
841 return r;
842
843 return sd_nfnl_nft_message_add_setelem_end(m);
844}
845
715a70e7
FW
846#define NFT_MASQ_MSGS 3
847
5ee7c719 848static int fw_nftables_add_masquerade_internal(
715a70e7
FW
849 FirewallContext *ctx,
850 bool add,
851 int af,
852 const union in_addr_union *source,
853 unsigned int source_prefixlen) {
5ee7c719 854
715a70e7
FW
855 sd_netlink_message *transaction[NFT_MASQ_MSGS] = {};
856 size_t tsize;
857 int r;
858
859 if (!source || source_prefixlen == 0)
860 return -EINVAL;
861
0e544221
FW
862 if (af == AF_INET6 && source_prefixlen < 8)
863 return -EINVAL;
5ee7c719 864
715a70e7
FW
865 r = sd_nfnl_message_batch_begin(ctx->nfnl, &transaction[0]);
866 if (r < 0)
867 return r;
868 tsize = 1;
869 if (add)
870 r = sd_nfnl_nft_message_new_setelems_begin(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME);
871 else
872 r = sd_nfnl_nft_message_del_setelems_begin(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME);
715a70e7
FW
873 if (r < 0)
874 goto out_unref;
875
0e544221
FW
876 if (af == AF_INET)
877 r = nft_message_add_setelem_iprange(transaction[tsize], source, source_prefixlen);
878 else
879 r = nft_message_add_setelem_ip6range(transaction[tsize], source, source_prefixlen);
715a70e7
FW
880 if (r < 0)
881 goto out_unref;
882
883 ++tsize;
884 assert(tsize < NFT_MASQ_MSGS);
885 r = sd_nfnl_message_batch_end(ctx->nfnl, &transaction[tsize]);
886 if (r < 0)
887 return r;
5ee7c719 888
715a70e7
FW
889 ++tsize;
890 r = nfnl_netlink_sendv(ctx->nfnl, transaction, tsize);
891
892out_unref:
893 while (tsize > 0)
894 sd_netlink_message_unref(transaction[--tsize]);
895 return r < 0 ? r : 0;
896}
897
5ee7c719
YW
898int fw_nftables_add_masquerade(
899 FirewallContext *ctx,
900 bool add,
901 int af,
902 const union in_addr_union *source,
903 unsigned int source_prefixlen) {
904
905 int r;
906
0c4363a0
YW
907 if (!socket_ipv6_is_supported() && af == AF_INET6)
908 return -EOPNOTSUPP;
909
5ee7c719
YW
910 r = fw_nftables_add_masquerade_internal(ctx, add, af, source, source_prefixlen);
911 if (r != -ENOENT)
912 return r;
913
914 /* When someone runs 'nft flush ruleset' in the same net namespace this will also tear down the
915 * systemd nat table.
916 *
917 * Unlike iptables -t nat -F (which will remove all rules added by the systemd iptables
918 * backend, iptables has builtin chains that cannot be deleted -- the next add operation will
919 * 'just work'.
920 *
921 * In the nftables case, everything gets removed. The next add operation will yield -ENOENT.
922 *
923 * If we see -ENOENT on add, replay the initial table setup. If that works, re-do the add
924 * operation.
925 *
926 * Note that this doesn't protect against external sabotage such as a
927 * 'while true; nft flush ruleset; done'. There is nothing that could be done about that short
928 * of extending the kernel to allow tables to be owned by stystemd-networkd and making them
929 * non-deleteable except by the 'owning process'. */
930
931 r = fw_nftables_init_family(ctx->nfnl, af);
932 if (r < 0)
933 return r;
934
935 return fw_nftables_add_masquerade_internal(ctx, add, af, source, source_prefixlen);
936}
937
715a70e7
FW
938#define NFT_DNAT_MSGS 4
939
5ee7c719 940static int fw_nftables_add_local_dnat_internal(
715a70e7
FW
941 FirewallContext *ctx,
942 bool add,
943 int af,
944 int protocol,
945 uint16_t local_port,
946 const union in_addr_union *remote,
947 uint16_t remote_port,
948 const union in_addr_union *previous_remote) {
5ee7c719 949
715a70e7 950 sd_netlink_message *transaction[NFT_DNAT_MSGS] = {};
175bc863 951 static bool ipv6_supported = true;
5ee7c719 952 uint32_t data[5], key[2], dlen;
715a70e7
FW
953 size_t tsize;
954 int r;
955
956 assert(add || !previous_remote);
957
175bc863
YW
958 if (!ipv6_supported && af == AF_INET6)
959 return -EOPNOTSUPP;
960
715a70e7
FW
961 if (!IN_SET(protocol, IPPROTO_TCP, IPPROTO_UDP))
962 return -EPROTONOSUPPORT;
963
964 if (local_port <= 0)
965 return -EINVAL;
966
967 key[0] = protocol;
968 key[1] = htobe16(local_port);
969
970 if (!remote)
971 return -EOPNOTSUPP;
972
973 if (remote_port <= 0)
974 return -EINVAL;
975
0e544221
FW
976 if (af == AF_INET) {
977 dlen = 8;
978 data[1] = htobe16(remote_port);
979 } else {
980 assert(af == AF_INET6);
981 dlen = sizeof(data);
982 data[4] = htobe16(remote_port);
983 }
715a70e7
FW
984
985 r = sd_nfnl_message_batch_begin(ctx->nfnl, &transaction[0]);
986 if (r < 0)
987 return r;
988
989 tsize = 1;
990 /* If a previous remote is set, remove its entry */
0e544221
FW
991 if (add && previous_remote && !in_addr_equal(af, previous_remote, remote)) {
992 if (af == AF_INET)
993 data[0] = previous_remote->in.s_addr;
994 else
995 memcpy(data, &previous_remote->in6, sizeof(previous_remote->in6));
715a70e7 996
0e544221 997 r = nft_del_element(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, dlen);
715a70e7
FW
998 if (r < 0)
999 goto out_unref;
1000
1001 tsize++;
1002 }
1003
0e544221
FW
1004 if (af == AF_INET)
1005 data[0] = remote->in.s_addr;
1006 else
1007 memcpy(data, &remote->in6, sizeof(remote->in6));
715a70e7
FW
1008
1009 assert(tsize < NFT_DNAT_MSGS);
1010 if (add)
84af90ba 1011 r = nft_add_element(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, dlen);
715a70e7 1012 else
84af90ba
YW
1013 r = nft_del_element(ctx->nfnl, &transaction[tsize], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, dlen);
1014 if (r < 0)
1015 goto out_unref;
715a70e7
FW
1016
1017 tsize++;
1018 assert(tsize < NFT_DNAT_MSGS);
1019
1020 r = sd_nfnl_message_batch_end(ctx->nfnl, &transaction[tsize]);
1021 if (r < 0)
1022 goto out_unref;
1023
1024 tsize++;
1025 assert(tsize <= NFT_DNAT_MSGS);
175bc863 1026
715a70e7 1027 r = nfnl_netlink_sendv(ctx->nfnl, transaction, tsize);
175bc863
YW
1028 if (r == -EOVERFLOW && af == AF_INET6) {
1029 /* The current implementation of DNAT in systemd requires kernel's
1030 * fdb9c405e35bdc6e305b9b4e20ebc141ed14fc81 (v5.8), and the older kernel returns
1031 * -EOVERFLOW. Let's treat the error as -EOPNOTSUPP. */
1032 log_debug_errno(r, "The current implementation of IPv6 DNAT in systemd requires kernel 5.8 or newer, ignoring: %m");
1033 ipv6_supported = false;
1034 r = -EOPNOTSUPP;
1035 }
715a70e7
FW
1036
1037out_unref:
1038 while (tsize > 0)
1039 sd_netlink_message_unref(transaction[--tsize]);
5ee7c719 1040
715a70e7
FW
1041 return r < 0 ? r : 0;
1042}
5ee7c719
YW
1043
1044int fw_nftables_add_local_dnat(
1045 FirewallContext *ctx,
1046 bool add,
1047 int af,
1048 int protocol,
1049 uint16_t local_port,
1050 const union in_addr_union *remote,
1051 uint16_t remote_port,
1052 const union in_addr_union *previous_remote) {
1053
1054 int r;
1055
0c4363a0
YW
1056 if (!socket_ipv6_is_supported() && af == AF_INET6)
1057 return -EOPNOTSUPP;
1058
5ee7c719
YW
1059 r = fw_nftables_add_local_dnat_internal(ctx, add, af, protocol, local_port, remote, remote_port, previous_remote);
1060 if (r != -ENOENT)
1061 return r;
1062
1063 /* See comment in fw_nftables_add_masquerade(). */
1064 r = fw_nftables_init_family(ctx->nfnl, af);
1065 if (r < 0)
1066 return r;
1067
1068 /* table created anew; previous address already gone */
1069 return fw_nftables_add_local_dnat_internal(ctx, add, af, protocol, local_port, remote, remote_port, NULL);
1070}