]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/shared/firewall-util-nft.c
66ea8ee0bdbdaa081109b52d777e409c01a00b46
[thirdparty/systemd.git] / src / shared / firewall-util-nft.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <arpa/inet.h>
4 #include <endian.h>
5 #include <errno.h>
6 #include <stddef.h>
7 #include <string.h>
8 #include <linux/netfilter/nf_tables.h>
9 #include <linux/netfilter/nf_nat.h>
10 #include <linux/netfilter_ipv4.h>
11 #include <netinet/ip.h>
12 #include <netinet/ip6.h>
13
14 #include "sd-netlink.h"
15
16 #include "alloc-util.h"
17 #include "firewall-util.h"
18 #include "firewall-util-private.h"
19 #include "in-addr-util.h"
20 #include "macro.h"
21 #include "netlink-internal.h"
22 #include "netlink-util.h"
23 #include "socket-util.h"
24 #include "time-util.h"
25
/* Name of the map consulted by the DNAT rules; elements are added to/removed from it by
 * fw_nftables_add_local_dnat(). */
#define NFT_SYSTEMD_DNAT_MAP_NAME "map_port_ipport"
/* Name of the nftables table that holds every chain, set, map and rule created by this file. */
#define NFT_SYSTEMD_TABLE_NAME "io.systemd.nat"
/* Name of the set consulted by the masquerade rule; it is filled by fw_nftables_add_masquerade(). */
#define NFT_SYSTEMD_MASQ_SET_NAME "masq_saddr"

/* Timeout handed to sd_netlink_read() while waiting for the reply to each batched message. */
#define NFNL_DEFAULT_TIMEOUT_USECS (1ULL * USEC_PER_SEC)

/* Byte offset into the transport header that is used when loading the 16-bit destination port. */
#define UDP_DPORT_OFFSET 2
33
34 static sd_netlink_message **netlink_message_unref_many(sd_netlink_message **m) {
35 if (!m)
36 return NULL;
37
38 /* This does not free array. The end of the array must be NULL. */
39
40 for (sd_netlink_message **p = m; *p; p++)
41 *p = sd_netlink_message_unref(*p);
42
43 return m;
44 }
45
46 DEFINE_TRIVIAL_CLEANUP_FUNC(sd_netlink_message**, netlink_message_unref_many);
47
48 static int nfnl_netlink_sendv(
49 sd_netlink *nfnl,
50 sd_netlink_message *messages[static 1],
51 size_t msgcount) {
52
53 _cleanup_free_ uint32_t *serial = NULL;
54 int r;
55
56 assert(nfnl);
57 assert(messages);
58 assert(msgcount > 0);
59
60 r = sd_netlink_sendv(nfnl, messages, msgcount, &serial);
61 if (r < 0)
62 return r;
63
64 r = 0;
65 for (size_t i = 1; i < msgcount - 1; i++) {
66 int tmp;
67
68 /* If message is an error, this returns embedded errno */
69 tmp = sd_netlink_read(nfnl, serial[i], NFNL_DEFAULT_TIMEOUT_USECS, NULL);
70 if (tmp < 0 && r == 0)
71 r = tmp;
72 }
73
74 return r;
75 }
76
77 static int nfnl_add_open_expr_container(sd_netlink_message *m, const char *name) {
78 int r;
79
80 assert(m);
81 assert(name);
82
83 r = sd_netlink_message_open_array(m, NFTA_LIST_ELEM);
84 if (r < 0)
85 return r;
86
87 return sd_netlink_message_open_container_union(m, NFTA_EXPR_DATA, name);
88 }
89
90 static int nfnl_add_expr_fib(
91 sd_netlink_message *m,
92 uint32_t nft_fib_flags,
93 enum nft_fib_result result,
94 enum nft_registers dreg) {
95
96 int r;
97
98 assert(m);
99
100 r = nfnl_add_open_expr_container(m, "fib");
101 if (r < 0)
102 return r;
103
104 r = sd_netlink_message_append_u32(m, NFTA_FIB_FLAGS, htobe32(nft_fib_flags));
105 if (r < 0)
106 return r;
107
108 r = sd_netlink_message_append_u32(m, NFTA_FIB_RESULT, htobe32(result));
109 if (r < 0)
110 return r;
111
112 r = sd_netlink_message_append_u32(m, NFTA_FIB_DREG, htobe32(dreg));
113 if (r < 0)
114 return r;
115
116 r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
117 if (r < 0)
118 return r;
119
120 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
121 }
122
123 static int nfnl_add_expr_meta(
124 sd_netlink_message *m,
125 enum nft_meta_keys key,
126 enum nft_registers dreg) {
127
128 int r;
129
130 assert(m);
131
132 r = nfnl_add_open_expr_container(m, "meta");
133 if (r < 0)
134 return r;
135
136 r = sd_netlink_message_append_u32(m, NFTA_META_KEY, htobe32(key));
137 if (r < 0)
138 return r;
139
140 r = sd_netlink_message_append_u32(m, NFTA_META_DREG, htobe32(dreg));
141 if (r < 0)
142 return r;
143
144 r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
145 if (r < 0)
146 return r;
147
148 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
149 }
150
151 static int nfnl_add_expr_payload(
152 sd_netlink_message *m,
153 enum nft_payload_bases pb,
154 uint32_t offset,
155 uint32_t len,
156 enum nft_registers dreg) {
157
158 int r;
159
160 assert(m);
161
162 r = nfnl_add_open_expr_container(m, "payload");
163 if (r < 0)
164 return r;
165
166 r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_DREG, htobe32(dreg));
167 if (r < 0)
168 return r;
169
170 r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_BASE, htobe32(pb));
171 if (r < 0)
172 return r;
173
174 r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_OFFSET, htobe32(offset));
175 if (r < 0)
176 return r;
177
178 r = sd_netlink_message_append_u32(m, NFTA_PAYLOAD_LEN, htobe32(len));
179 if (r < 0)
180 return r;
181
182 r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
183 if (r < 0)
184 return r;
185
186 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
187 }
188
189 static int nfnl_add_expr_lookup_set_data(
190 sd_netlink_message *m,
191 const char *set_name,
192 enum nft_registers sreg) {
193
194 int r;
195
196 assert(m);
197 assert(set_name);
198
199 r = nfnl_add_open_expr_container(m, "lookup");
200 if (r < 0)
201 return r;
202
203 r = sd_netlink_message_append_string(m, NFTA_LOOKUP_SET, set_name);
204 if (r < 0)
205 return r;
206
207 return sd_netlink_message_append_u32(m, NFTA_LOOKUP_SREG, htobe32(sreg));
208 }
209
210 static int nfnl_add_expr_lookup_set(
211 sd_netlink_message *m,
212 const char *set_name,
213 enum nft_registers sreg) {
214
215 int r;
216
217 assert(m);
218 assert(set_name);
219
220 r = nfnl_add_expr_lookup_set_data(m, set_name, sreg);
221 if (r < 0)
222 return r;
223
224 r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
225 if (r < 0)
226 return r;
227
228 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
229 }
230
231 static int nfnl_add_expr_lookup_map(
232 sd_netlink_message *m,
233 const char *set_name,
234 enum nft_registers sreg,
235 enum nft_registers dreg) {
236
237 int r;
238
239 assert(m);
240 assert(set_name);
241
242 r = nfnl_add_expr_lookup_set_data(m, set_name, sreg);
243 if (r < 0)
244 return r;
245
246 r = sd_netlink_message_append_u32(m, NFTA_LOOKUP_DREG, htobe32(dreg));
247 if (r < 0)
248 return r;
249
250 r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
251 if (r < 0)
252 return r;
253
254 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
255 }
256
257 static int nfnl_add_expr_cmp(
258 sd_netlink_message *m,
259 enum nft_cmp_ops cmp_op,
260 enum nft_registers sreg,
261 const void *data,
262 uint32_t dlen) {
263
264 int r;
265
266 assert(m);
267 assert(data);
268
269 r = nfnl_add_open_expr_container(m, "cmp");
270 if (r < 0)
271 return r;
272
273 r = sd_netlink_message_append_u32(m, NFTA_CMP_OP, htobe32(cmp_op));
274 if (r < 0)
275 return r;
276
277 r = sd_netlink_message_append_u32(m, NFTA_CMP_SREG, htobe32(sreg));
278 if (r < 0)
279 return r;
280
281 r = sd_netlink_message_append_container_data(m, NFTA_CMP_DATA, NFTA_DATA_VALUE, data, dlen);
282 if (r < 0)
283 return r;
284
285 r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
286 if (r < 0)
287 return r;
288
289 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
290 }
291
292 static int nfnl_add_expr_bitwise(
293 sd_netlink_message *m,
294 enum nft_registers sreg,
295 enum nft_registers dreg,
296 const void *and,
297 const void *xor,
298 uint32_t len) {
299
300 int r;
301
302 assert(m);
303 assert(and);
304 assert(xor);
305
306 r = nfnl_add_open_expr_container(m, "bitwise");
307 if (r < 0)
308 return r;
309
310 r = sd_netlink_message_append_u32(m, NFTA_BITWISE_SREG, htobe32(sreg));
311 if (r < 0)
312 return r;
313
314 r = sd_netlink_message_append_u32(m, NFTA_BITWISE_DREG, htobe32(dreg));
315 if (r < 0)
316 return r;
317
318 r = sd_netlink_message_append_u32(m, NFTA_BITWISE_LEN, htobe32(len));
319 if (r < 0)
320 return r;
321
322 r = sd_netlink_message_append_container_data(m, NFTA_BITWISE_MASK, NFTA_DATA_VALUE, and, len);
323 if (r < 0)
324 return r;
325
326 r = sd_netlink_message_append_container_data(m, NFTA_BITWISE_XOR, NFTA_DATA_VALUE, xor, len);
327 if (r < 0)
328 return r;
329
330 r = sd_netlink_message_close_container(m); /* NFTA_EXPR_DATA */
331 if (r < 0)
332 return r;
333
334 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
335 }
336
337 static int nfnl_add_expr_dnat(
338 sd_netlink_message *m,
339 int family,
340 enum nft_registers areg,
341 enum nft_registers preg) {
342
343 int r;
344
345 assert(m);
346
347 r = nfnl_add_open_expr_container(m, "nat");
348 if (r < 0)
349 return r;
350
351 r = sd_netlink_message_append_u32(m, NFTA_NAT_TYPE, htobe32(NFT_NAT_DNAT));
352 if (r < 0)
353 return r;
354
355 r = sd_netlink_message_append_u32(m, NFTA_NAT_FAMILY, htobe32(family));
356 if (r < 0)
357 return r;
358
359 r = sd_netlink_message_append_u32(m, NFTA_NAT_REG_ADDR_MIN, htobe32(areg));
360 if (r < 0)
361 return r;
362
363 r = sd_netlink_message_append_u32(m, NFTA_NAT_REG_PROTO_MIN, htobe32(preg));
364 if (r < 0)
365 return r;
366
367 r = sd_netlink_message_close_container(m);
368 if (r < 0)
369 return r;
370
371 return sd_netlink_message_close_container(m);
372 }
373
374 static int nfnl_add_expr_masq(sd_netlink_message *m) {
375 int r;
376
377 r = sd_netlink_message_open_array(m, NFTA_LIST_ELEM);
378 if (r < 0)
379 return r;
380
381 r = sd_netlink_message_append_string(m, NFTA_EXPR_NAME, "masq");
382 if (r < 0)
383 return r;
384
385 return sd_netlink_message_close_container(m); /* NFTA_LIST_ELEM */
386 }
387
388 static int sd_nfnl_message_new_masq_rule(
389 sd_netlink *nfnl,
390 sd_netlink_message **ret,
391 int family,
392 const char *chain) {
393
394 _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
395 int r;
396
397 /* -t nat -A POSTROUTING -p protocol -s source/pflen -o out_interface -d destination/pflen -j MASQUERADE */
398
399 assert(nfnl);
400 assert(ret);
401 assert(IN_SET(family, AF_INET, AF_INET6));
402 assert(chain);
403
404 r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
405 if (r < 0)
406 return r;
407
408 r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
409 if (r < 0)
410 return r;
411
412 /* 1st statement: ip saddr @masq_saddr. Place iph->saddr in reg1, resp. ipv6 in reg1..reg4. */
413 if (family == AF_INET)
414 r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct iphdr, saddr),
415 sizeof(uint32_t), NFT_REG32_01);
416 else
417 r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct ip6_hdr, ip6_src.s6_addr),
418 sizeof(struct in6_addr), NFT_REG32_01);
419 if (r < 0)
420 return r;
421
422 /* 1st statement: use reg1 content to make lookup in @masq_saddr set. */
423 r = nfnl_add_expr_lookup_set(m, NFT_SYSTEMD_MASQ_SET_NAME, NFT_REG32_01);
424 if (r < 0)
425 return r;
426
427 /* 2nd statement: masq. Only executed by kernel if the previous lookup was successful. */
428 r = nfnl_add_expr_masq(m);
429 if (r < 0)
430 return r;
431
432 r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
433 if (r < 0)
434 return r;
435
436 *ret = TAKE_PTR(m);
437 return 0;
438 }
439
440 static int sd_nfnl_message_new_dnat_rule_pre(
441 sd_netlink *nfnl,
442 sd_netlink_message **ret,
443 int family,
444 const char *chain) {
445
446 _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
447 enum nft_registers proto_reg;
448 uint32_t local = RTN_LOCAL;
449 int r;
450
451 /* -t nat -A PREROUTING -p protocol --dport local_port -i in_interface -s source/pflen
452 * -d destination/pflen -j DNAT --to-destination remote_addr:remote_port */
453
454 assert(nfnl);
455 assert(ret);
456 assert(IN_SET(family, AF_INET, AF_INET6));
457 assert(chain);
458
459 r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
460 if (r < 0)
461 return r;
462
463 r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
464 if (r < 0)
465 return r;
466
467 /* 1st statement: fib daddr type local */
468 r = nfnl_add_expr_fib(m, NFTA_FIB_F_DADDR, NFT_FIB_RESULT_ADDRTYPE, NFT_REG32_01);
469 if (r < 0)
470 return r;
471
472 /* 1st statement (cont.): compare RTN_LOCAL */
473 r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &local, sizeof(local));
474 if (r < 0)
475 return r;
476
477 /* 2nd statement: lookup local port in map, fetch address:dport to map to */
478 r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01);
479 if (r < 0)
480 return r;
481
482 r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET,
483 sizeof(uint16_t), NFT_REG32_02);
484 if (r < 0)
485 return r;
486
487 /* 3rd statement: lookup 'l4proto . dport', e.g. 'tcp . 22' as key and
488 * store address and port for the dnat mapping in REG1/REG2. */
489 r = nfnl_add_expr_lookup_map(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01);
490 if (r < 0)
491 return r;
492
493 proto_reg = family == AF_INET ? NFT_REG32_02 : NFT_REG32_05;
494 r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, proto_reg);
495 if (r < 0)
496 return r;
497
498 r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
499 if (r < 0)
500 return r;
501
502 *ret = TAKE_PTR(m);
503 return 0;
504 }
505
506 static int sd_nfnl_message_new_dnat_rule_out(
507 sd_netlink *nfnl,
508 sd_netlink_message **ret,
509 int family,
510 const char *chain) {
511
512 static const uint32_t zero = 0, one = 1;
513 _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
514 enum nft_registers proto_reg;
515 int r;
516
517 assert(nfnl);
518 assert(ret);
519 assert(IN_SET(family, AF_INET, AF_INET6));
520 assert(chain);
521
522 r = sd_nfnl_nft_message_new_rule(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, chain);
523 if (r < 0)
524 return r;
525
526 r = sd_netlink_message_open_container(m, NFTA_RULE_EXPRESSIONS);
527 if (r < 0)
528 return r;
529
530 /* 1st statement: exclude 127.0.0.1/8: ip daddr != 127.0.0.1/8, resp. avoid ::1 */
531 if (family == AF_INET) {
532 uint32_t lonet = htobe32(UINT32_C(0x7F000000)), lomask = htobe32(UINT32_C(0xff000000));
533
534 r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct iphdr, daddr),
535 sizeof(lonet), NFT_REG32_01);
536 if (r < 0)
537 return r;
538 /* 1st statement (cont.): bitops/prefix */
539 r = nfnl_add_expr_bitwise(m, NFT_REG32_01, NFT_REG32_01, &lomask, &zero, sizeof(lomask));
540 if (r < 0)
541 return r;
542
543 /* 1st statement (cont.): compare reg1 with 127/8 */
544 r = nfnl_add_expr_cmp(m, NFT_CMP_NEQ, NFT_REG32_01, &lonet, sizeof(lonet));
545 } else {
546 struct in6_addr loaddr = IN6ADDR_LOOPBACK_INIT;
547
548 r = nfnl_add_expr_payload(m, NFT_PAYLOAD_NETWORK_HEADER, offsetof(struct ip6_hdr, ip6_dst.s6_addr),
549 sizeof(loaddr), NFT_REG32_01);
550 if (r < 0)
551 return r;
552
553 r = nfnl_add_expr_cmp(m, NFT_CMP_NEQ, NFT_REG32_01, &loaddr, sizeof(loaddr));
554 }
555 if (r < 0)
556 return r;
557
558 /* 2nd statement: meta oif lo */
559 r = nfnl_add_expr_meta(m, NFT_META_OIF, NFT_REG32_01);
560 if (r < 0)
561 return r;
562
563 /* 2nd statement (cont.): compare to lo ifindex (1) */
564 r = nfnl_add_expr_cmp(m, NFT_CMP_EQ, NFT_REG32_01, &one, sizeof(one));
565 if (r < 0)
566 return r;
567
568 /* 3rd statement: meta l4proto . th dport dnat ip . port to map @map_port_ipport */
569 r = nfnl_add_expr_meta(m, NFT_META_L4PROTO, NFT_REG32_01);
570 if (r < 0)
571 return r;
572
573 /* 3rd statement (cont): store the port number in reg2 */
574 r = nfnl_add_expr_payload(m, NFT_PAYLOAD_TRANSPORT_HEADER, UDP_DPORT_OFFSET,
575 sizeof(uint16_t), NFT_REG32_02);
576 if (r < 0)
577 return r;
578
579 /* 3rd statement (cont): use reg1 and reg2 and retrieve
580 * the new destination ip and port number.
581 *
582 * reg1 and reg2 are clobbered and will then contain the new
583 * address/port number. */
584 r = nfnl_add_expr_lookup_map(m, NFT_SYSTEMD_DNAT_MAP_NAME, NFT_REG32_01, NFT_REG32_01);
585 if (r < 0)
586 return r;
587
588 /* 4th statement: dnat connection to address/port retrieved by the
589 * preceding expression. */
590 proto_reg = family == AF_INET ? NFT_REG32_02 : NFT_REG32_05;
591 r = nfnl_add_expr_dnat(m, family, NFT_REG32_01, proto_reg);
592 if (r < 0)
593 return r;
594
595 r = sd_netlink_message_close_container(m); /* NFTA_RULE_EXPRESSIONS */
596 if (r < 0)
597 return r;
598
599 *ret = TAKE_PTR(m);
600 return 0;
601 }
602
603 static int nft_new_set(
604 struct sd_netlink *nfnl,
605 sd_netlink_message **ret,
606 int family,
607 const char *set_name,
608 uint32_t set_id,
609 uint32_t flags,
610 uint32_t type,
611 uint32_t klen) {
612
613 _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
614 int r;
615
616 assert(nfnl);
617 assert(ret);
618 assert(IN_SET(family, AF_INET, AF_INET6));
619 assert(set_name);
620
621 r = sd_nfnl_nft_message_new_set(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name, set_id, klen);
622 if (r < 0)
623 return r;
624
625 if (flags != 0) {
626 r = sd_netlink_message_append_u32(m, NFTA_SET_FLAGS, htobe32(flags));
627 if (r < 0)
628 return r;
629 }
630
631 r = sd_netlink_message_append_u32(m, NFTA_SET_KEY_TYPE, htobe32(type));
632 if (r < 0)
633 return r;
634
635 *ret = TAKE_PTR(m);
636 return r;
637 }
638
639 static int nft_new_map(
640 struct sd_netlink *nfnl,
641 sd_netlink_message **ret,
642 int family,
643 const char *set_name,
644 uint32_t set_id,
645 uint32_t flags,
646 uint32_t type,
647 uint32_t klen,
648 uint32_t dtype,
649 uint32_t dlen) {
650
651 _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
652 int r;
653
654 assert(nfnl);
655 assert(ret);
656 assert(IN_SET(family, AF_INET, AF_INET6));
657 assert(set_name);
658
659 r = nft_new_set(nfnl, &m, family, set_name, set_id, flags | NFT_SET_MAP, type, klen);
660 if (r < 0)
661 return r;
662
663 r = sd_netlink_message_append_u32(m, NFTA_SET_DATA_TYPE, htobe32(dtype));
664 if (r < 0)
665 return r;
666
667 r = sd_netlink_message_append_u32(m, NFTA_SET_DATA_LEN, htobe32(dlen));
668 if (r < 0)
669 return r;
670
671 *ret = TAKE_PTR(m);
672 return 0;
673 }
674
675 static int nft_add_element(
676 sd_netlink *nfnl,
677 sd_netlink_message **ret,
678 int family,
679 const char *set_name,
680 const void *key,
681 uint32_t klen,
682 const void *data,
683 uint32_t dlen) {
684
685 _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
686 int r;
687
688 assert(nfnl);
689 assert(ret);
690 assert(IN_SET(family, AF_INET, AF_INET6));
691 assert(set_name);
692 assert(key);
693 assert(data);
694
695 /*
696 * Ideally there would be an API that provides:
697 *
698 * 1) an init function to add the main ruleset skeleton
699 * 2) a function that populates the sets with all known address/port pairs to s/dnat for
700 * 3) a function that can remove address/port pairs again.
701 *
702 * At this time, the existing API is used which is built on a
703 * 'add/delete a rule' paradigm.
704 *
705 * This replicated here and each element gets added to the set
706 * one-by-one.
707 */
708 r = sd_nfnl_nft_message_new_setelems_begin(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name);
709 if (r < 0)
710 return r;
711
712 r = sd_nfnl_nft_message_add_setelem(m, 0, key, klen, data, dlen);
713 if (r < 0)
714 return r;
715
716 /* could theoretically append more set elements to add here */
717 r = sd_nfnl_nft_message_add_setelem_end(m);
718 if (r < 0)
719 return r;
720
721 *ret = TAKE_PTR(m);
722 return 0;
723 }
724
725 static int nft_del_element(
726 sd_netlink *nfnl,
727 sd_netlink_message **ret,
728 int family,
729 const char *set_name,
730 const void *key,
731 uint32_t klen,
732 const void *data,
733 uint32_t dlen) {
734
735 _cleanup_(sd_netlink_message_unrefp) sd_netlink_message *m = NULL;
736 int r;
737
738 assert(nfnl);
739 assert(ret);
740 assert(IN_SET(family, AF_INET, AF_INET6));
741 assert(set_name);
742 assert(key);
743 assert(data);
744
745 r = sd_nfnl_nft_message_del_setelems_begin(nfnl, &m, family, NFT_SYSTEMD_TABLE_NAME, set_name);
746 if (r < 0)
747 return r;
748
749 r = sd_nfnl_nft_message_add_setelem(m, 0, key, klen, data, dlen);
750 if (r < 0)
751 return r;
752
753 r = sd_nfnl_nft_message_add_setelem_end(m);
754 if (r < 0)
755 return r;
756
757 *ret = TAKE_PTR(m);
758 return 0;
759 }
760
/* Type identifiers attached to the sets/maps created below. They let the 'nft' userspace tool
 * format the contents properly when somebody inspects the set/map with it.
 *
 * None of these values may be changed: they are part of the type identifier ABI of the nft tool. */
#define TYPE_BITS 6

enum nft_key_types {
        TYPE_IPADDR = 7,
        TYPE_IP6ADDR = 8,
        TYPE_INET_PROTOCOL = 12,
        TYPE_INET_SERVICE = 13,
};

/* Combines two type identifiers into the identifier of the concatenation 'a . b': 'a' is moved up by
 * TYPE_BITS bits and 'b' takes the low bits. */
static uint32_t concat_types2(enum nft_key_types a, enum nft_key_types b) {
        return ((uint32_t) a << TYPE_BITS) | (uint32_t) b;
}
782
783 static int fw_nftables_init_family(sd_netlink *nfnl, int family) {
784 sd_netlink_message *messages[12] = {};
785 _unused_ _cleanup_(netlink_message_unref_manyp) sd_netlink_message **unref = messages;
786 size_t msgcnt = 0, ip_type_size;
787 uint32_t set_id = 0;
788 int ip_type, r;
789
790 assert(nfnl);
791 assert(IN_SET(family, AF_INET, AF_INET6));
792
793 r = sd_nfnl_message_batch_begin(nfnl, &messages[msgcnt++]);
794 if (r < 0)
795 return r;
796
797 /* Set F_EXCL so table add fails if the table already exists. */
798 r = sd_nfnl_nft_message_new_table(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_TABLE_NAME);
799 if (r < 0)
800 return r;
801
802 r = sd_nfnl_nft_message_new_basechain(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_TABLE_NAME,
803 "prerouting", "nat",
804 NF_INET_PRE_ROUTING, NF_IP_PRI_NAT_DST + 1);
805 if (r < 0)
806 return r;
807
808 r = sd_nfnl_nft_message_new_basechain(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_TABLE_NAME,
809 "output", "nat",
810 NF_INET_LOCAL_OUT, NF_IP_PRI_NAT_DST + 1);
811 if (r < 0)
812 return r;
813
814 r = sd_nfnl_nft_message_new_basechain(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_TABLE_NAME,
815 "postrouting", "nat",
816 NF_INET_POST_ROUTING, NF_IP_PRI_NAT_SRC + 1);
817 if (r < 0)
818 return r;
819
820 if (family == AF_INET) {
821 ip_type_size = sizeof(uint32_t);
822 ip_type = TYPE_IPADDR;
823 } else {
824 assert(family == AF_INET6);
825 ip_type_size = sizeof(struct in6_addr);
826 ip_type = TYPE_IP6ADDR;
827 }
828 /* set to store ip address ranges we should masquerade for */
829 r = nft_new_set(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_MASQ_SET_NAME, ++set_id, NFT_SET_INTERVAL, ip_type, ip_type_size);
830 if (r < 0)
831 return r;
832
833 /*
834 * map to store ip address:port pair to dnat to. elements in concatenation
835 * are rounded up to 4 bytes.
836 *
837 * Example: ip protocol . tcp daddr is sizeof(uint32_t) + sizeof(uint32_t), not
838 * sizeof(uint8_t) + sizeof(uint16_t).
839 */
840 r = nft_new_map(nfnl, &messages[msgcnt++], family, NFT_SYSTEMD_DNAT_MAP_NAME, ++set_id, 0,
841 concat_types2(TYPE_INET_PROTOCOL, TYPE_INET_SERVICE), sizeof(uint32_t) * 2,
842 concat_types2(ip_type, TYPE_INET_SERVICE), ip_type_size + sizeof(uint32_t));
843 if (r < 0)
844 return r;
845
846 r = sd_nfnl_message_new_dnat_rule_pre(nfnl, &messages[msgcnt++], family, "prerouting");
847 if (r < 0)
848 return r;
849
850 r = sd_nfnl_message_new_dnat_rule_out(nfnl, &messages[msgcnt++], family, "output");
851 if (r < 0)
852 return r;
853
854 r = sd_nfnl_message_new_masq_rule(nfnl, &messages[msgcnt++], family, "postrouting");
855 if (r < 0)
856 return r;
857
858 r = sd_nfnl_message_batch_end(nfnl, &messages[msgcnt++]);
859 if (r < 0)
860 return r;
861
862 assert(msgcnt < ELEMENTSOF(messages));
863 r = nfnl_netlink_sendv(nfnl, messages, msgcnt);
864 if (r < 0 && r != -EEXIST)
865 return r;
866
867 return 0;
868 }
869
870 int fw_nftables_init(FirewallContext *ctx) {
871 _cleanup_(sd_netlink_unrefp) sd_netlink *nfnl = NULL;
872 int r;
873
874 assert(ctx);
875 assert(!ctx->nfnl);
876
877 r = sd_nfnl_socket_open(&nfnl);
878 if (r < 0)
879 return r;
880
881 r = fw_nftables_init_family(nfnl, AF_INET);
882 if (r < 0)
883 return r;
884
885 if (socket_ipv6_is_supported()) {
886 r = fw_nftables_init_family(nfnl, AF_INET6);
887 if (r < 0)
888 log_debug_errno(r, "Failed to init ipv6 NAT: %m");
889 }
890
891 ctx->nfnl = TAKE_PTR(nfnl);
892 return 0;
893 }
894
895 void fw_nftables_exit(FirewallContext *ctx) {
896 assert(ctx);
897
898 ctx->nfnl = sd_netlink_unref(ctx->nfnl);
899 }
900
901 static int nft_message_add_setelem_iprange(
902 sd_netlink_message *m,
903 const union in_addr_union *source,
904 unsigned int prefixlen) {
905
906 uint32_t mask, start, end;
907 unsigned int nplen;
908 int r;
909
910 assert(m);
911 assert(source);
912 assert(prefixlen <= 32);
913
914 nplen = 32 - prefixlen;
915
916 mask = (1U << nplen) - 1U;
917 mask = htobe32(~mask);
918 start = source->in.s_addr & mask;
919
920 r = sd_nfnl_nft_message_add_setelem(m, 0, &start, sizeof(start), NULL, 0);
921 if (r < 0)
922 return r;
923
924 r = sd_nfnl_nft_message_add_setelem_end(m);
925 if (r < 0)
926 return r;
927
928 end = be32toh(start) + (1U << nplen);
929 if (end < be32toh(start))
930 end = 0U;
931 end = htobe32(end);
932
933 r = sd_nfnl_nft_message_add_setelem(m, 1, &end, sizeof(end), NULL, 0);
934 if (r < 0)
935 return r;
936
937 r = sd_netlink_message_append_u32(m, NFTA_SET_ELEM_FLAGS, htobe32(NFT_SET_ELEM_INTERVAL_END));
938 if (r < 0)
939 return r;
940
941 return sd_nfnl_nft_message_add_setelem_end(m);
942 }
943
944 static int nft_message_add_setelem_ip6range(
945 sd_netlink_message *m,
946 const union in_addr_union *source,
947 unsigned int prefixlen) {
948
949 union in_addr_union start, end;
950 int r;
951
952 assert(m);
953 assert(source);
954
955 r = in_addr_prefix_range(AF_INET6, source, prefixlen, &start, &end);
956 if (r < 0)
957 return r;
958
959 r = sd_nfnl_nft_message_add_setelem(m, 0, &start.in6, sizeof(start.in6), NULL, 0);
960 if (r < 0)
961 return r;
962
963 r = sd_nfnl_nft_message_add_setelem_end(m);
964 if (r < 0)
965 return r;
966
967 r = sd_nfnl_nft_message_add_setelem(m, 1, &end.in6, sizeof(end.in6), NULL, 0);
968 if (r < 0)
969 return r;
970
971 r = sd_netlink_message_append_u32(m, NFTA_SET_ELEM_FLAGS, htobe32(NFT_SET_ELEM_INTERVAL_END));
972 if (r < 0)
973 return r;
974
975 return sd_nfnl_nft_message_add_setelem_end(m);
976 }
977
978 static int fw_nftables_add_masquerade_internal(
979 sd_netlink *nfnl,
980 bool add,
981 int af,
982 const union in_addr_union *source,
983 unsigned int source_prefixlen) {
984
985 sd_netlink_message *messages[4] = {};
986 _unused_ _cleanup_(netlink_message_unref_manyp) sd_netlink_message **unref = messages;
987 size_t msgcnt = 0;
988 int r;
989
990 assert(nfnl);
991 assert(IN_SET(af, AF_INET, AF_INET6));
992
993 if (!source || source_prefixlen == 0)
994 return -EINVAL;
995
996 if (af == AF_INET6 && source_prefixlen < 8)
997 return -EINVAL;
998
999 r = sd_nfnl_message_batch_begin(nfnl, &messages[msgcnt++]);
1000 if (r < 0)
1001 return r;
1002
1003 if (add)
1004 r = sd_nfnl_nft_message_new_setelems_begin(nfnl, &messages[msgcnt++], af, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME);
1005 else
1006 r = sd_nfnl_nft_message_del_setelems_begin(nfnl, &messages[msgcnt++], af, NFT_SYSTEMD_TABLE_NAME, NFT_SYSTEMD_MASQ_SET_NAME);
1007 if (r < 0)
1008 return r;
1009
1010 if (af == AF_INET)
1011 r = nft_message_add_setelem_iprange(messages[msgcnt-1], source, source_prefixlen);
1012 else
1013 r = nft_message_add_setelem_ip6range(messages[msgcnt-1], source, source_prefixlen);
1014 if (r < 0)
1015 return r;
1016
1017 r = sd_nfnl_message_batch_end(nfnl, &messages[msgcnt++]);
1018 if (r < 0)
1019 return r;
1020
1021 assert(msgcnt < ELEMENTSOF(messages));
1022 r = nfnl_netlink_sendv(nfnl, messages, msgcnt);
1023 if (r < 0)
1024 return r;
1025
1026 return 0;
1027 }
1028
1029 int fw_nftables_add_masquerade(
1030 FirewallContext *ctx,
1031 bool add,
1032 int af,
1033 const union in_addr_union *source,
1034 unsigned int source_prefixlen) {
1035
1036 int r;
1037
1038 assert(ctx);
1039 assert(ctx->nfnl);
1040 assert(IN_SET(af, AF_INET, AF_INET6));
1041
1042 if (!socket_ipv6_is_supported() && af == AF_INET6)
1043 return -EOPNOTSUPP;
1044
1045 r = fw_nftables_add_masquerade_internal(ctx->nfnl, add, af, source, source_prefixlen);
1046 if (r != -ENOENT)
1047 return r;
1048
1049 /* When someone runs 'nft flush ruleset' in the same net namespace this will also tear down the
1050 * systemd nat table.
1051 *
1052 * Unlike iptables -t nat -F (which will remove all rules added by the systemd iptables
1053 * backend, iptables has builtin chains that cannot be deleted -- the next add operation will
1054 * 'just work'.
1055 *
1056 * In the nftables case, everything gets removed. The next add operation will yield -ENOENT.
1057 *
1058 * If we see -ENOENT on add, replay the initial table setup. If that works, re-do the add
1059 * operation.
1060 *
1061 * Note that this doesn't protect against external sabotage such as a
1062 * 'while true; nft flush ruleset; done'. There is nothing that could be done about that short
1063 * of extending the kernel to allow tables to be owned by stystemd-networkd and making them
1064 * non-deleteable except by the 'owning process'. */
1065
1066 r = fw_nftables_init_family(ctx->nfnl, af);
1067 if (r < 0)
1068 return r;
1069
1070 return fw_nftables_add_masquerade_internal(ctx->nfnl, add, af, source, source_prefixlen);
1071 }
1072
1073 static int fw_nftables_add_local_dnat_internal(
1074 sd_netlink *nfnl,
1075 bool add,
1076 int af,
1077 int protocol,
1078 uint16_t local_port,
1079 const union in_addr_union *remote,
1080 uint16_t remote_port,
1081 const union in_addr_union *previous_remote) {
1082
1083 sd_netlink_message *messages[5] = {};
1084 _unused_ _cleanup_(netlink_message_unref_manyp) sd_netlink_message **unref = messages;
1085 static bool ipv6_supported = true;
1086 uint32_t data[5], key[2], dlen;
1087 size_t msgcnt = 0;
1088 int r;
1089
1090 assert(nfnl);
1091 assert(add || !previous_remote);
1092 assert(IN_SET(af, AF_INET, AF_INET6));
1093
1094 if (!ipv6_supported && af == AF_INET6)
1095 return -EOPNOTSUPP;
1096
1097 if (!IN_SET(protocol, IPPROTO_TCP, IPPROTO_UDP))
1098 return -EPROTONOSUPPORT;
1099
1100 if (local_port <= 0)
1101 return -EINVAL;
1102
1103 key[0] = protocol;
1104 key[1] = htobe16(local_port);
1105
1106 if (!remote)
1107 return -EOPNOTSUPP;
1108
1109 if (remote_port <= 0)
1110 return -EINVAL;
1111
1112 if (af == AF_INET) {
1113 dlen = 8;
1114 data[1] = htobe16(remote_port);
1115 } else {
1116 assert(af == AF_INET6);
1117 dlen = sizeof(data);
1118 data[4] = htobe16(remote_port);
1119 }
1120
1121 r = sd_nfnl_message_batch_begin(nfnl, &messages[msgcnt++]);
1122 if (r < 0)
1123 return r;
1124
1125 /* If a previous remote is set, remove its entry */
1126 if (add && previous_remote && !in_addr_equal(af, previous_remote, remote)) {
1127 if (af == AF_INET)
1128 data[0] = previous_remote->in.s_addr;
1129 else
1130 memcpy(data, &previous_remote->in6, sizeof(previous_remote->in6));
1131
1132 r = nft_del_element(nfnl, &messages[msgcnt++], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, dlen);
1133 if (r < 0)
1134 return r;
1135 }
1136
1137 if (af == AF_INET)
1138 data[0] = remote->in.s_addr;
1139 else
1140 memcpy(data, &remote->in6, sizeof(remote->in6));
1141
1142 if (add)
1143 r = nft_add_element(nfnl, &messages[msgcnt++], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, dlen);
1144 else
1145 r = nft_del_element(nfnl, &messages[msgcnt++], af, NFT_SYSTEMD_DNAT_MAP_NAME, key, sizeof(key), data, dlen);
1146 if (r < 0)
1147 return r;
1148
1149 r = sd_nfnl_message_batch_end(nfnl, &messages[msgcnt++]);
1150 if (r < 0)
1151 return r;
1152
1153 assert(msgcnt < ELEMENTSOF(messages));
1154 r = nfnl_netlink_sendv(nfnl, messages, msgcnt);
1155 if (r == -EOVERFLOW && af == AF_INET6) {
1156 /* The current implementation of DNAT in systemd requires kernel's
1157 * fdb9c405e35bdc6e305b9b4e20ebc141ed14fc81 (v5.8), and the older kernel returns
1158 * -EOVERFLOW. Let's treat the error as -EOPNOTSUPP. */
1159 log_debug_errno(r, "The current implementation of IPv6 DNAT in systemd requires kernel 5.8 or newer, ignoring: %m");
1160 ipv6_supported = false;
1161 return -EOPNOTSUPP;
1162 }
1163 if (r < 0)
1164 return r;
1165
1166 return 0;
1167 }
1168
1169 int fw_nftables_add_local_dnat(
1170 FirewallContext *ctx,
1171 bool add,
1172 int af,
1173 int protocol,
1174 uint16_t local_port,
1175 const union in_addr_union *remote,
1176 uint16_t remote_port,
1177 const union in_addr_union *previous_remote) {
1178
1179 int r;
1180
1181 assert(ctx);
1182 assert(ctx->nfnl);
1183 assert(IN_SET(af, AF_INET, AF_INET6));
1184
1185 if (!socket_ipv6_is_supported() && af == AF_INET6)
1186 return -EOPNOTSUPP;
1187
1188 r = fw_nftables_add_local_dnat_internal(ctx->nfnl, add, af, protocol, local_port, remote, remote_port, previous_remote);
1189 if (r != -ENOENT)
1190 return r;
1191
1192 /* See comment in fw_nftables_add_masquerade(). */
1193 r = fw_nftables_init_family(ctx->nfnl, af);
1194 if (r < 0)
1195 return r;
1196
1197 /* table created anew; previous address already gone */
1198 return fw_nftables_add_local_dnat_internal(ctx->nfnl, add, af, protocol, local_port, remote, remote_port, NULL);
1199 }