1 /*
2 * BIRD -- Linux Netlink Interface
3 *
4 * (c) 1999--2000 Martin Mares <mj@ucw.cz>
5 *
6 * Can be freely distributed and used under the terms of the GNU GPL.
7 */
8
9 #include <alloca.h>
10 #include <stdio.h>
11 #include <unistd.h>
12 #include <fcntl.h>
13 #include <sys/socket.h>
14 #include <sys/uio.h>
15 #include <errno.h>
16
17 #undef LOCAL_DEBUG
18
19 #include "nest/bird.h"
20 #include "nest/route.h"
21 #include "nest/protocol.h"
22 #include "nest/iface.h"
23 #include "lib/alloca.h"
24 #include "sysdep/unix/unix.h"
25 #include "sysdep/unix/krt.h"
26 #include "lib/socket.h"
27 #include "lib/string.h"
28 #include "lib/hash.h"
29 #include "conf/conf.h"
30
31 #include <asm/types.h>
32 #include <linux/if.h>
33 #include <linux/netlink.h>
34 #include <linux/rtnetlink.h>
35
36 #ifdef HAVE_MPLS_KERNEL
37 #include <linux/lwtunnel.h>
38 #endif
39
40 #ifndef MSG_TRUNC /* Hack: Several versions of glibc miss this one :( */
41 #define MSG_TRUNC 0x20
42 #endif
43
44 #ifndef IFA_FLAGS
45 #define IFA_FLAGS 8
46 #endif
47
48 #ifndef IFF_LOWER_UP
49 #define IFF_LOWER_UP 0x10000
50 #endif
51
52 #ifndef RTA_TABLE
53 #define RTA_TABLE 15
54 #endif
55
56 #ifndef RTA_VIA
57 #define RTA_VIA 18
58 #endif
59
60 #ifndef RTA_NEWDST
61 #define RTA_NEWDST 19
62 #endif
63
64 #ifndef RTA_ENCAP_TYPE
65 #define RTA_ENCAP_TYPE 21
66 #endif
67
68 #ifndef RTA_ENCAP
69 #define RTA_ENCAP 22
70 #endif
71
72 #ifndef NETLINK_GET_STRICT_CHK
73 #define NETLINK_GET_STRICT_CHK 12
74 #endif
75
76 #define krt_ipv4(p) ((p)->af == AF_INET)
77
78 const int rt_default_ecmp = 16;
79
80 struct nl_parse_state
81 {
82 struct krt_proto *proto;
83 struct linpool *pool;
84 int scan;
85
86 u32 rta_flow;
87 };
88
89 /*
90 * Synchronous Netlink interface
91 */
92
93 struct nl_sock
94 {
95 int fd;
96 u32 seq;
97 byte *rx_buffer; /* Receive buffer */
98 struct nlmsghdr *last_hdr; /* Recently received packet */
99 uint last_size;
100 };
101
102 #define NL_RX_SIZE 32768
103
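/*
 * Netlink operation codes, passed to nl_send_route() as extra nlmsg_flags.
 * NL_OP_DELETE is zero, which also makes the message an RTM_DELROUTE
 * instead of an RTM_NEWROUTE.
 */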
104 #define NL_OP_DELETE 0
105 #define NL_OP_ADD (NLM_F_CREATE|NLM_F_EXCL)
106 #define NL_OP_REPLACE (NLM_F_CREATE|NLM_F_REPLACE)
107 #define NL_OP_APPEND (NLM_F_CREATE|NLM_F_APPEND)
108
109 static linpool *nl_linpool;
110
111 static struct nl_sock nl_scan = {.fd = -1}; /* Netlink socket for synchronous scan */
112 static struct nl_sock nl_req = {.fd = -1}; /* Netlink socket for requests */
113
114 static void
115 nl_open_sock(struct nl_sock *nl)
116 {
117 if (nl->fd < 0)
118 {
119 nl->fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
120 if (nl->fd < 0)
121 die("Unable to open rtnetlink socket: %m");
122 nl->seq = (u32) (current_time() TO_S); /* Or perhaps random_u32() ? */
123 nl->rx_buffer = xmalloc(NL_RX_SIZE);
124 nl->last_hdr = NULL;
125 nl->last_size = 0;
126 }
127 }
128
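/*
 * Enable strict checking of dump requests (NETLINK_GET_STRICT_CHK), so that
 * the kernel filters RTM_GETROUTE dumps by the requested table. Returns -1
 * when SOL_NETLINK is not available at build time.
 */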
129 static int
130 nl_set_strict_dump(struct nl_sock *nl UNUSED, int strict UNUSED)
131 {
132 #ifdef SOL_NETLINK
133 return setsockopt(nl->fd, SOL_NETLINK, NETLINK_GET_STRICT_CHK, &strict, sizeof(strict));
134 #else
135 return -1;
136 #endif
137 }
138
139 static void
140 nl_set_rcvbuf(int fd, uint val)
141 {
142 if (setsockopt(fd, SOL_SOCKET, SO_RCVBUFFORCE, &val, sizeof(val)) < 0)
143 log(L_WARN "KRT: Cannot set netlink rx buffer size to %u: %m", val);
144 }
145
146 static uint
147 nl_cfg_rx_buffer_size(struct config *cfg)
148 {
149 uint bufsize = 0;
150
151 struct proto_config *pc;
152 WALK_LIST(pc, cfg->protos)
153 if ((pc->protocol == &proto_unix_kernel) && !pc->disabled)
154 bufsize = MAX(bufsize, ((struct krt_config *) pc)->sys.netlink_rx_buffer);
155
156 return bufsize;
157 }
158
159
160 static void
161 nl_open(void)
162 {
163 if ((nl_scan.fd >= 0) && (nl_req.fd >= 0))
164 return;
165
166 nl_open_sock(&nl_scan);
167 nl_open_sock(&nl_req);
168
169 if (nl_set_strict_dump(&nl_scan, 1) < 0)
170 {
171 log(L_WARN "KRT: Netlink strict checking failed, will scan all tables at once");
172 krt_use_shared_scan();
173 }
174 }
175
176 static void
177 nl_send(struct nl_sock *nl, struct nlmsghdr *nh)
178 {
179 struct sockaddr_nl sa;
180
181 memset(&sa, 0, sizeof(sa));
182 sa.nl_family = AF_NETLINK;
183 nh->nlmsg_pid = 0;
184 nh->nlmsg_seq = ++(nl->seq);
185 nh->nlmsg_len = NLMSG_ALIGN(nh->nlmsg_len);
186 if (sendto(nl->fd, nh, nh->nlmsg_len, 0, (struct sockaddr *)&sa, sizeof(sa)) < 0)
187 die("rtnetlink sendto: %m");
188 nl->last_hdr = NULL;
189 }
190
191 static void
192 nl_request_dump_link(void)
193 {
194 struct {
195 struct nlmsghdr nh;
196 struct ifinfomsg ifi;
197 } req = {
198 .nh.nlmsg_type = RTM_GETLINK,
199 .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)),
200 .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
201 .nh.nlmsg_seq = ++(nl_scan.seq),
202 .ifi.ifi_family = AF_UNSPEC,
203 };
204
205 send(nl_scan.fd, &req, sizeof(req), 0);
206 nl_scan.last_hdr = NULL;
207 }
208
209 static void
210 nl_request_dump_addr(int af)
211 {
212 struct {
213 struct nlmsghdr nh;
214 struct ifaddrmsg ifa;
215 } req = {
216 .nh.nlmsg_type = RTM_GETADDR,
217 .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)),
218 .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
219 .nh.nlmsg_seq = ++(nl_scan.seq),
220 .ifa.ifa_family = af,
221 };
222
223 send(nl_scan.fd, &req, sizeof(req), 0);
224 nl_scan.last_hdr = NULL;
225 }
226
227 static void
228 nl_request_dump_route(int af, int table_id)
229 {
230 struct {
231 struct nlmsghdr nh;
232 struct rtmsg rtm;
233 struct rtattr rta;
234 u32 table_id;
235 } req = {
236 .nh.nlmsg_type = RTM_GETROUTE,
237 .nh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)),
238 .nh.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP,
239 .nh.nlmsg_seq = ++(nl_scan.seq),
240 .rtm.rtm_family = af,
241 };
242
243 if (table_id < 256)
244 req.rtm.rtm_table = table_id;
245 else
246 {
247 req.rta.rta_type = RTA_TABLE;
248 req.rta.rta_len = RTA_LENGTH(4);
249 req.table_id = table_id;
250 req.nh.nlmsg_len = NLMSG_ALIGN(req.nh.nlmsg_len) + req.rta.rta_len;
251 }
252
253 send(nl_scan.fd, &req, req.nh.nlmsg_len, 0);
254 nl_scan.last_hdr = NULL;
255 }
256
257
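/*
 * nl_get_reply() returns the next message from the given netlink socket,
 * receiving a new datagram into rx_buffer when the previous one has been
 * consumed. Packets not sent by the kernel or not matching the expected
 * sequence number are skipped.
 */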
258 static struct nlmsghdr *
259 nl_get_reply(struct nl_sock *nl)
260 {
261 for(;;)
262 {
263 if (!nl->last_hdr)
264 {
265 struct iovec iov = { nl->rx_buffer, NL_RX_SIZE };
266 struct sockaddr_nl sa;
267 struct msghdr m = {
268 .msg_name = &sa,
269 .msg_namelen = sizeof(sa),
270 .msg_iov = &iov,
271 .msg_iovlen = 1,
272 };
273 int x = recvmsg(nl->fd, &m, 0);
274 if (x < 0)
275 die("nl_get_reply: %m");
276 if (sa.nl_pid) /* It isn't from the kernel */
277 {
278 DBG("Non-kernel packet\n");
279 continue;
280 }
281 nl->last_size = x;
282 nl->last_hdr = (void *) nl->rx_buffer;
283 if (m.msg_flags & MSG_TRUNC)
284 bug("nl_get_reply: got truncated reply which should be impossible");
285 }
286 if (NLMSG_OK(nl->last_hdr, nl->last_size))
287 {
288 struct nlmsghdr *h = nl->last_hdr;
289 nl->last_hdr = NLMSG_NEXT(h, nl->last_size);
290 if (h->nlmsg_seq != nl->seq)
291 {
292 log(L_WARN "nl_get_reply: Ignoring out of sequence netlink packet (%x != %x)",
293 h->nlmsg_seq, nl->seq);
294 continue;
295 }
296 return h;
297 }
298 if (nl->last_size)
299 log(L_WARN "nl_get_reply: Found packet remnant of size %d", nl->last_size);
300 nl->last_hdr = NULL;
301 }
302 }
303
304 static struct tbf rl_netlink_err = TBF_DEFAULT_LOG_LIMITS;
305
306 static int
307 nl_error(struct nlmsghdr *h, int ignore_esrch)
308 {
309 struct nlmsgerr *e;
310 int ec;
311
312 if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr)))
313 {
314 log(L_WARN "Netlink: Truncated error message received");
315 return ENOBUFS;
316 }
317 e = (struct nlmsgerr *) NLMSG_DATA(h);
318 ec = -e->error;
319 if (ec && !(ignore_esrch && (ec == ESRCH)))
320 log_rl(&rl_netlink_err, L_WARN "Netlink: %s", strerror(ec));
321 return ec;
322 }
323
324 static struct nlmsghdr *
325 nl_get_scan(void)
326 {
327 struct nlmsghdr *h = nl_get_reply(&nl_scan);
328
329 if (h->nlmsg_type == NLMSG_DONE)
330 return NULL;
331 if (h->nlmsg_type == NLMSG_ERROR)
332 {
333 nl_error(h, 0);
334 return NULL;
335 }
336 return h;
337 }
338
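/*
 * nl_exchange() sends one request on the nl_req socket and waits for its
 * NLMSG_ERROR acknowledgement. Returns 0 on success, -1 on error; ESRCH
 * can be ignored, which is used when deleting possibly missing routes.
 */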
339 static int
340 nl_exchange(struct nlmsghdr *pkt, int ignore_esrch)
341 {
342 struct nlmsghdr *h;
343
344 nl_send(&nl_req, pkt);
345 for(;;)
346 {
347 h = nl_get_reply(&nl_req);
348 if (h->nlmsg_type == NLMSG_ERROR)
349 break;
350 log(L_WARN "nl_exchange: Unexpected reply received");
351 }
352 return nl_error(h, ignore_esrch) ? -1 : 0;
353 }
354
355 /*
356 * Netlink attributes
357 */
358
359 static int nl_attr_len;
360
361 static void *
362 nl_checkin(struct nlmsghdr *h, int lsize)
363 {
364 nl_attr_len = h->nlmsg_len - NLMSG_LENGTH(lsize);
365 if (nl_attr_len < 0)
366 {
367 log(L_ERR "nl_checkin: underrun by %d bytes", -nl_attr_len);
368 return NULL;
369 }
370 return NLMSG_DATA(h);
371 }
372
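/*
 * The *_attr_want tables below describe, for every attribute type we may
 * receive, whether it is of interest (defined) and, optionally, the exact
 * payload size to enforce (checksize/size).
 */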
373 struct nl_want_attrs {
374 u8 defined:1;
375 u8 checksize:1;
376 u8 size;
377 };
378
379
380 #define BIRD_IFLA_MAX (IFLA_WIRELESS+1)
381
382 static struct nl_want_attrs ifla_attr_want[BIRD_IFLA_MAX] = {
383 [IFLA_IFNAME] = { 1, 0, 0 },
384 [IFLA_MTU] = { 1, 1, sizeof(u32) },
385 [IFLA_MASTER] = { 1, 1, sizeof(u32) },
386 [IFLA_WIRELESS] = { 1, 0, 0 },
387 };
388
389
390 #define BIRD_IFA_MAX (IFA_FLAGS+1)
391
392 static struct nl_want_attrs ifa_attr_want4[BIRD_IFA_MAX] = {
393 [IFA_ADDRESS] = { 1, 1, sizeof(ip4_addr) },
394 [IFA_LOCAL] = { 1, 1, sizeof(ip4_addr) },
395 [IFA_BROADCAST] = { 1, 1, sizeof(ip4_addr) },
396 [IFA_FLAGS] = { 1, 1, sizeof(u32) },
397 };
398
399 static struct nl_want_attrs ifa_attr_want6[BIRD_IFA_MAX] = {
400 [IFA_ADDRESS] = { 1, 1, sizeof(ip6_addr) },
401 [IFA_LOCAL] = { 1, 1, sizeof(ip6_addr) },
402 [IFA_FLAGS] = { 1, 1, sizeof(u32) },
403 };
404
405
406 #define BIRD_RTA_MAX (RTA_ENCAP+1)
407
408 static struct nl_want_attrs nexthop_attr_want4[BIRD_RTA_MAX] = {
409 [RTA_GATEWAY] = { 1, 1, sizeof(ip4_addr) },
410 [RTA_VIA] = { 1, 0, 0 },
411 [RTA_FLOW] = { 1, 1, sizeof(u32) },
412 [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
413 [RTA_ENCAP] = { 1, 0, 0 },
414 };
415
416 static struct nl_want_attrs nexthop_attr_want6[BIRD_RTA_MAX] = {
417 [RTA_GATEWAY] = { 1, 1, sizeof(ip6_addr) },
418 [RTA_VIA] = { 1, 0, 0 },
419 [RTA_FLOW] = { 1, 1, sizeof(u32) },
420 [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
421 [RTA_ENCAP] = { 1, 0, 0 },
422 };
423
424 #ifdef HAVE_MPLS_KERNEL
425 static struct nl_want_attrs nexthop_attr_want_mpls[BIRD_RTA_MAX] = {
426 [RTA_VIA] = { 1, 0, 0 },
427 [RTA_NEWDST] = { 1, 0, 0 },
428 };
429
430 static struct nl_want_attrs encap_mpls_want[BIRD_RTA_MAX] = {
431 [RTA_DST] = { 1, 0, 0 },
432 };
433 #endif
434
435 static struct nl_want_attrs rtm_attr_want4[BIRD_RTA_MAX] = {
436 [RTA_DST] = { 1, 1, sizeof(ip4_addr) },
437 [RTA_OIF] = { 1, 1, sizeof(u32) },
438 [RTA_GATEWAY] = { 1, 1, sizeof(ip4_addr) },
439 [RTA_PRIORITY] = { 1, 1, sizeof(u32) },
440 [RTA_PREFSRC] = { 1, 1, sizeof(ip4_addr) },
441 [RTA_METRICS] = { 1, 0, 0 },
442 [RTA_MULTIPATH] = { 1, 0, 0 },
443 [RTA_FLOW] = { 1, 1, sizeof(u32) },
444 [RTA_TABLE] = { 1, 1, sizeof(u32) },
445 [RTA_VIA] = { 1, 0, 0 },
446 [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
447 [RTA_ENCAP] = { 1, 0, 0 },
448 };
449
450 static struct nl_want_attrs rtm_attr_want6[BIRD_RTA_MAX] = {
451 [RTA_DST] = { 1, 1, sizeof(ip6_addr) },
452 [RTA_SRC] = { 1, 1, sizeof(ip6_addr) },
453 [RTA_IIF] = { 1, 1, sizeof(u32) },
454 [RTA_OIF] = { 1, 1, sizeof(u32) },
455 [RTA_GATEWAY] = { 1, 1, sizeof(ip6_addr) },
456 [RTA_PRIORITY] = { 1, 1, sizeof(u32) },
457 [RTA_PREFSRC] = { 1, 1, sizeof(ip6_addr) },
458 [RTA_METRICS] = { 1, 0, 0 },
459 [RTA_MULTIPATH] = { 1, 0, 0 },
460 [RTA_FLOW] = { 1, 1, sizeof(u32) },
461 [RTA_TABLE] = { 1, 1, sizeof(u32) },
462 [RTA_VIA] = { 1, 0, 0 },
463 [RTA_ENCAP_TYPE]= { 1, 1, sizeof(u16) },
464 [RTA_ENCAP] = { 1, 0, 0 },
465 };
466
467 #ifdef HAVE_MPLS_KERNEL
468 static struct nl_want_attrs rtm_attr_want_mpls[BIRD_RTA_MAX] = {
469 [RTA_DST] = { 1, 1, sizeof(u32) },
470 [RTA_IIF] = { 1, 1, sizeof(u32) },
471 [RTA_OIF] = { 1, 1, sizeof(u32) },
472 [RTA_PRIORITY] = { 1, 1, sizeof(u32) },
473 [RTA_METRICS] = { 1, 0, 0 },
474 [RTA_MULTIPATH] = { 1, 0, 0 },
475 [RTA_FLOW] = { 1, 1, sizeof(u32) },
476 [RTA_TABLE] = { 1, 1, sizeof(u32) },
477 [RTA_VIA] = { 1, 0, 0 },
478 [RTA_NEWDST] = { 1, 0, 0 },
479 };
480 #endif
481
482
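/*
 * nl_parse_attrs() walks a block of rtattrs and stores pointers to the
 * wanted ones into k[], indexed by rta_type. Attributes marked with
 * checksize must have exactly the expected payload length; any trailing
 * remnant is reported as an error.
 */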
483 static int
484 nl_parse_attrs(struct rtattr *a, struct nl_want_attrs *want, struct rtattr **k, int ksize)
485 {
486 int max = ksize / sizeof(struct rtattr *);
487 bzero(k, ksize);
488
489 for ( ; RTA_OK(a, nl_attr_len); a = RTA_NEXT(a, nl_attr_len))
490 {
491 if ((a->rta_type >= max) || !want[a->rta_type].defined)
492 continue;
493
494 if (want[a->rta_type].checksize && (RTA_PAYLOAD(a) != want[a->rta_type].size))
495 {
496 log(L_ERR "nl_parse_attrs: Malformed attribute received");
497 return 0;
498 }
499
500 k[a->rta_type] = a;
501 }
502
503 if (nl_attr_len)
504 {
505 log(L_ERR "nl_parse_attrs: remnant of size %d", nl_attr_len);
506 return 0;
507 }
508
509 return 1;
510 }
511
512 static inline u16 rta_get_u16(struct rtattr *a)
513 { return *(u16 *) RTA_DATA(a); }
514
515 static inline u32 rta_get_u32(struct rtattr *a)
516 { return *(u32 *) RTA_DATA(a); }
517
518 static inline ip4_addr rta_get_ip4(struct rtattr *a)
519 { return ip4_ntoh(*(ip4_addr *) RTA_DATA(a)); }
520
521 static inline ip6_addr rta_get_ip6(struct rtattr *a)
522 { return ip6_ntoh(*(ip6_addr *) RTA_DATA(a)); }
523
524 static inline ip_addr rta_get_ipa(struct rtattr *a)
525 {
526 if (RTA_PAYLOAD(a) == sizeof(ip4_addr))
527 return ipa_from_ip4(rta_get_ip4(a));
528 else
529 return ipa_from_ip6(rta_get_ip6(a));
530 }
531
532 #ifdef HAVE_MPLS_KERNEL
533 static inline ip_addr rta_get_via(struct rtattr *a)
534 {
535 struct rtvia *v = RTA_DATA(a);
536 switch(v->rtvia_family) {
537 case AF_INET: return ipa_from_ip4(ip4_ntoh(*(ip4_addr *) v->rtvia_addr));
538 case AF_INET6: return ipa_from_ip6(ip6_ntoh(*(ip6_addr *) v->rtvia_addr));
539 }
540 return IPA_NONE;
541 }
542
543 static u32 rta_mpls_stack[MPLS_MAX_LABEL_STACK];
544 static inline int rta_get_mpls(struct rtattr *a, u32 *stack)
545 {
546 if (!a)
547 return 0;
548
549 if (RTA_PAYLOAD(a) % 4)
550 log(L_WARN "KRT: Strange length of received MPLS stack: %u", RTA_PAYLOAD(a));
551
552 int labels = mpls_get(RTA_DATA(a), RTA_PAYLOAD(a) & ~0x3, stack);
553
554 if (labels < 0)
555 {
556 log(L_WARN "KRT: Too long MPLS stack received, ignoring");
557 labels = 0;
558 }
559
560 return labels;
561 }
562 #endif
563
564 struct rtattr *
565 nl_add_attr(struct nlmsghdr *h, uint bufsize, uint code, const void *data, uint dlen)
566 {
567 uint pos = NLMSG_ALIGN(h->nlmsg_len);
568 uint len = RTA_LENGTH(dlen);
569
570 if (pos + len > bufsize)
571 bug("nl_add_attr: packet buffer overflow");
572
573 struct rtattr *a = (struct rtattr *)((char *)h + pos);
574 a->rta_type = code;
575 a->rta_len = len;
576 h->nlmsg_len = pos + len;
577
578 if (dlen > 0)
579 memcpy(RTA_DATA(a), data, dlen);
580
581 return a;
582 }
583
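/*
 * nl_open_attr() / nl_close_attr() implement nested attributes: the
 * attribute is opened with an empty payload and its rta_len is fixed up
 * after the nested contents (e.g. RTA_MULTIPATH or RTA_ENCAP) have been
 * appended.
 */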
584 static inline struct rtattr *
585 nl_open_attr(struct nlmsghdr *h, uint bufsize, uint code)
586 {
587 return nl_add_attr(h, bufsize, code, NULL, 0);
588 }
589
590 static inline void
591 nl_close_attr(struct nlmsghdr *h, struct rtattr *a)
592 {
593 a->rta_len = (void *)h + NLMSG_ALIGN(h->nlmsg_len) - (void *)a;
594 }
595
596 static inline void
597 nl_add_attr_u16(struct nlmsghdr *h, uint bufsize, int code, u16 data)
598 {
599 nl_add_attr(h, bufsize, code, &data, 2);
600 }
601
602 static inline void
603 nl_add_attr_u32(struct nlmsghdr *h, uint bufsize, int code, u32 data)
604 {
605 nl_add_attr(h, bufsize, code, &data, 4);
606 }
607
608 static inline void
609 nl_add_attr_ip4(struct nlmsghdr *h, uint bufsize, int code, ip4_addr ip4)
610 {
611 ip4 = ip4_hton(ip4);
612 nl_add_attr(h, bufsize, code, &ip4, sizeof(ip4));
613 }
614
615 static inline void
616 nl_add_attr_ip6(struct nlmsghdr *h, uint bufsize, int code, ip6_addr ip6)
617 {
618 ip6 = ip6_hton(ip6);
619 nl_add_attr(h, bufsize, code, &ip6, sizeof(ip6));
620 }
621
622 static inline void
623 nl_add_attr_ipa(struct nlmsghdr *h, uint bufsize, int code, ip_addr ipa)
624 {
625 if (ipa_is_ip4(ipa))
626 nl_add_attr_ip4(h, bufsize, code, ipa_to_ip4(ipa));
627 else
628 nl_add_attr_ip6(h, bufsize, code, ipa_to_ip6(ipa));
629 }
630
631 #ifdef HAVE_MPLS_KERNEL
632 static inline void
633 nl_add_attr_mpls(struct nlmsghdr *h, uint bufsize, int code, int len, u32 *stack)
634 {
635 char buf[len*4];
636 mpls_put(buf, len, stack);
637 nl_add_attr(h, bufsize, code, buf, len*4);
638 }
639
640 static inline void
641 nl_add_attr_mpls_encap(struct nlmsghdr *h, uint bufsize, int len, u32 *stack)
642 {
643 nl_add_attr_u16(h, bufsize, RTA_ENCAP_TYPE, LWTUNNEL_ENCAP_MPLS);
644
645 struct rtattr *nest = nl_open_attr(h, bufsize, RTA_ENCAP);
646 nl_add_attr_mpls(h, bufsize, RTA_DST, len, stack);
647 nl_close_attr(h, nest);
648 }
649
650 static inline void
651 nl_add_attr_via(struct nlmsghdr *h, uint bufsize, ip_addr ipa)
652 {
653 struct rtvia *via = alloca(sizeof(struct rtvia) + 16);
654
655 if (ipa_is_ip4(ipa))
656 {
657 via->rtvia_family = AF_INET;
658 put_ip4(via->rtvia_addr, ipa_to_ip4(ipa));
659 nl_add_attr(h, bufsize, RTA_VIA, via, sizeof(struct rtvia) + 4);
660 }
661 else
662 {
663 via->rtvia_family = AF_INET6;
664 put_ip6(via->rtvia_addr, ipa_to_ip6(ipa));
665 nl_add_attr(h, bufsize, RTA_VIA, via, sizeof(struct rtvia) + 16);
666 }
667 }
668 #endif
669
670 static inline struct rtnexthop *
671 nl_open_nexthop(struct nlmsghdr *h, uint bufsize)
672 {
673 uint pos = NLMSG_ALIGN(h->nlmsg_len);
674 uint len = RTNH_LENGTH(0);
675
676 if (pos + len > bufsize)
677 bug("nl_open_nexthop: packet buffer overflow");
678
679 h->nlmsg_len = pos + len;
680
681 return (void *)h + pos;
682 }
683
684 static inline void
685 nl_close_nexthop(struct nlmsghdr *h, struct rtnexthop *nh)
686 {
687 nh->rtnh_len = (void *)h + NLMSG_ALIGN(h->nlmsg_len) - (void *)nh;
688 }
689
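/*
 * nl_add_nexthop() encodes per-nexthop attributes. With HAVE_MPLS_KERNEL,
 * a label stack is emitted as RTA_NEWDST for AF_MPLS routes or as an MPLS
 * RTA_ENCAP otherwise, and the gateway becomes RTA_GATEWAY when its family
 * matches the route, or RTA_VIA for a cross-family next hop.
 */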
690 static inline void
691 nl_add_nexthop(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af UNUSED)
692 {
693 #ifdef HAVE_MPLS_KERNEL
694 if (nh->labels > 0)
695 if (af == AF_MPLS)
696 nl_add_attr_mpls(h, bufsize, RTA_NEWDST, nh->labels, nh->label);
697 else
698 nl_add_attr_mpls_encap(h, bufsize, nh->labels, nh->label);
699
700 if (ipa_nonzero(nh->gw))
701 {
702 if (af == (ipa_is_ip4(nh->gw) ? AF_INET : AF_INET6))
703 nl_add_attr_ipa(h, bufsize, RTA_GATEWAY, nh->gw);
704 else
705 nl_add_attr_via(h, bufsize, nh->gw);
706 }
707 #else
708
709 if (ipa_nonzero(nh->gw))
710 nl_add_attr_ipa(h, bufsize, RTA_GATEWAY, nh->gw);
711 #endif
712 }
713
714 static void
715 nl_add_multipath(struct nlmsghdr *h, uint bufsize, struct nexthop *nh, int af, ea_list *eattrs)
716 {
717 struct rtattr *a = nl_open_attr(h, bufsize, RTA_MULTIPATH);
718 eattr *flow = ea_find(eattrs, EA_KRT_REALM);
719
720 for (; nh; nh = nh->next)
721 {
722 struct rtnexthop *rtnh = nl_open_nexthop(h, bufsize);
723
724 rtnh->rtnh_flags = 0;
725 rtnh->rtnh_hops = nh->weight;
726 rtnh->rtnh_ifindex = nh->iface->index;
727
728 nl_add_nexthop(h, bufsize, nh, af);
729
730 if (nh->flags & RNF_ONLINK)
731 rtnh->rtnh_flags |= RTNH_F_ONLINK;
732
733 /* Our KRT_REALM is per-route, but kernel RTA_FLOW is per-nexthop.
734 Therefore, we need to attach the same attribute to each nexthop. */
735 if (flow)
736 nl_add_attr_u32(h, bufsize, RTA_FLOW, flow->u.data);
737
738 nl_close_nexthop(h, rtnh);
739 }
740
741 nl_close_attr(h, a);
742 }
743
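/*
 * nl_parse_multipath() walks the rtnexthop entries inside RTA_MULTIPATH,
 * resolves the outgoing interface and neighbor of each one, decodes any
 * MPLS labels, and returns the nexthop list sorted to satisfy the nest
 * invariant. Returns NULL for malformed or unusable entries.
 */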
744 static struct nexthop *
745 nl_parse_multipath(struct nl_parse_state *s, struct krt_proto *p, const net_addr *n, struct rtattr *ra, int af, int krt_src)
746 {
747 struct rtattr *a[BIRD_RTA_MAX];
748 struct rtnexthop *nh = RTA_DATA(ra);
749 struct nexthop *rv, *first, **last;
750 unsigned len = RTA_PAYLOAD(ra);
751
752 first = NULL;
753 last = &first;
754
755 while (len)
756 {
757 /* Use RTNH_OK(nh,len) ?? */
758 if ((len < sizeof(*nh)) || (len < nh->rtnh_len))
759 goto err;
760
761 if ((nh->rtnh_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD))
762 goto next;
763
764 *last = rv = lp_allocz(s->pool, NEXTHOP_MAX_SIZE);
765 last = &(rv->next);
766
767 rv->weight = nh->rtnh_hops;
768 rv->iface = if_find_by_index(nh->rtnh_ifindex);
769 if (!rv->iface)
770 {
771 log(L_ERR "KRT: Received route %N with unknown ifindex %u", n, nh->rtnh_ifindex);
772 return NULL;
773 }
774
775 /* Nonexistent RTNH_PAYLOAD ?? */
776 nl_attr_len = nh->rtnh_len - RTNH_LENGTH(0);
777 switch (af)
778 {
779 case AF_INET:
780 if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want4, a, sizeof(a)))
781 goto err;
782 break;
783
784 case AF_INET6:
785 if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want6, a, sizeof(a)))
786 goto err;
787 break;
788
789 #ifdef HAVE_MPLS_KERNEL
790 case AF_MPLS:
791 if (!nl_parse_attrs(RTNH_DATA(nh), nexthop_attr_want_mpls, a, sizeof(a)))
792 goto err;
793
794 if (a[RTA_NEWDST])
795 rv->labels = rta_get_mpls(a[RTA_NEWDST], rv->label);
796
797 break;
798 #endif
799
800 default:
801 goto err;
802 }
803
804 if (a[RTA_GATEWAY])
805 rv->gw = rta_get_ipa(a[RTA_GATEWAY]);
806
807 if (a[RTA_FLOW])
808 s->rta_flow = rta_get_u32(a[RTA_FLOW]);
809
810 #ifdef HAVE_MPLS_KERNEL
811 if (a[RTA_VIA])
812 rv->gw = rta_get_via(a[RTA_VIA]);
813 #endif
814
815 if (ipa_nonzero(rv->gw))
816 {
817 if (nh->rtnh_flags & RTNH_F_ONLINK)
818 rv->flags |= RNF_ONLINK;
819
820 neighbor *nbr;
821 nbr = neigh_find(&p->p, rv->gw, rv->iface,
822 (rv->flags & RNF_ONLINK) ? NEF_ONLINK : 0);
823 if (!nbr || (nbr->scope == SCOPE_HOST))
824 {
825 log(L_ERR "KRT: Received route %N with strange next-hop %I", n, rv->gw);
826 return NULL;
827 }
828 }
829
830 #ifdef HAVE_MPLS_KERNEL
831 if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE])
832 {
833 if (rta_get_u16(a[RTA_ENCAP_TYPE]) != LWTUNNEL_ENCAP_MPLS)
834 {
835 log(L_WARN "KRT: Received route %N with unknown encapsulation method %d",
836 n, rta_get_u16(a[RTA_ENCAP_TYPE]));
837 return NULL;
838 }
839
840 struct rtattr *enca[BIRD_RTA_MAX];
841 nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
842 nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
843 rv->labels = rta_get_mpls(enca[RTA_DST], rv->label);
844 }
845 #endif
846
847 next:
848 len -= NLMSG_ALIGN(nh->rtnh_len);
849 nh = RTNH_NEXT(nh);
850 }
851
852 /* Ensure nexthops are sorted to satisfy nest invariant */
853 if (!nexthop_is_sorted(first))
854 first = nexthop_sort(first);
855
856 return first;
857
858 err:
859 log(L_ERR "KRT: Received strange multipath route %N", n);
860 return NULL;
861 }
862
863 static void
864 nl_add_metrics(struct nlmsghdr *h, uint bufsize, u32 *metrics, int max)
865 {
866 struct rtattr *a = nl_open_attr(h, bufsize, RTA_METRICS);
867 int t;
868
869 for (t = 1; t < max; t++)
870 if (metrics[0] & (1 << t))
871 nl_add_attr_u32(h, bufsize, t, metrics[t]);
872
873 nl_close_attr(h, a);
874 }
875
876 static int
877 nl_parse_metrics(struct rtattr *hdr, u32 *metrics, int max)
878 {
879 struct rtattr *a = RTA_DATA(hdr);
880 int len = RTA_PAYLOAD(hdr);
881
882 metrics[0] = 0;
883 for (; RTA_OK(a, len); a = RTA_NEXT(a, len))
884 {
885 if (a->rta_type == RTA_UNSPEC)
886 continue;
887
888 if (a->rta_type >= max)
889 continue;
890
891 if (RTA_PAYLOAD(a) != 4)
892 return -1;
893
894 metrics[0] |= 1 << a->rta_type;
895 metrics[a->rta_type] = rta_get_u32(a);
896 }
897
898 if (len > 0)
899 return -1;
900
901 return 0;
902 }
903
904
905 /*
906 * Scanning of interfaces
907 */
908
909 static void
910 nl_parse_link(struct nlmsghdr *h, int scan)
911 {
912 struct ifinfomsg *i;
913 struct rtattr *a[BIRD_IFLA_MAX];
914 int new = h->nlmsg_type == RTM_NEWLINK;
915 struct iface f = {};
916 struct iface *ifi;
917 char *name;
918 u32 mtu, master = 0;
919 uint fl;
920
921 if (!(i = nl_checkin(h, sizeof(*i))) || !nl_parse_attrs(IFLA_RTA(i), ifla_attr_want, a, sizeof(a)))
922 return;
923 if (!a[IFLA_IFNAME] || (RTA_PAYLOAD(a[IFLA_IFNAME]) < 2) || !a[IFLA_MTU])
924 {
925 /*
926 * IFLA_IFNAME and IFLA_MTU are in fact required, but a message with
927 * IFLA_WIRELESS set may arrive without them (e.g. without IFLA_IFNAME).
928 * We silently ignore all such messages carrying IFLA_WIRELESS.
929 */
930
931 if (a[IFLA_WIRELESS])
932 return;
933
934 log(L_ERR "KIF: Malformed message received");
935 return;
936 }
937
938 name = RTA_DATA(a[IFLA_IFNAME]);
939 mtu = rta_get_u32(a[IFLA_MTU]);
940
941 if (a[IFLA_MASTER])
942 master = rta_get_u32(a[IFLA_MASTER]);
943
944 ifi = if_find_by_index(i->ifi_index);
945 if (!new)
946 {
947 DBG("KIF: IF%d(%s) goes down\n", i->ifi_index, name);
948 if (!ifi)
949 return;
950
951 if_delete(ifi);
952 }
953 else
954 {
955 DBG("KIF: IF%d(%s) goes up (mtu=%d,flg=%x)\n", i->ifi_index, name, mtu, i->ifi_flags);
956 if (ifi && strncmp(ifi->name, name, sizeof(ifi->name)-1))
957 if_delete(ifi);
958
959 strncpy(f.name, name, sizeof(f.name)-1);
960 f.index = i->ifi_index;
961 f.mtu = mtu;
962
963 f.master_index = master;
964 f.master = if_find_by_index(master);
965
966 fl = i->ifi_flags;
967 if (fl & IFF_UP)
968 f.flags |= IF_ADMIN_UP;
969 if (fl & IFF_LOWER_UP)
970 f.flags |= IF_LINK_UP;
971 if (fl & IFF_LOOPBACK) /* Loopback */
972 f.flags |= IF_MULTIACCESS | IF_LOOPBACK | IF_IGNORE;
973 else if (fl & IFF_POINTOPOINT) /* PtP */
974 f.flags |= IF_MULTICAST;
975 else if (fl & IFF_BROADCAST) /* Broadcast */
976 f.flags |= IF_MULTIACCESS | IF_BROADCAST | IF_MULTICAST;
977 else
978 f.flags |= IF_MULTIACCESS; /* NBMA */
979
980 if (fl & IFF_MULTICAST)
981 f.flags |= IF_MULTICAST;
982
983 ifi = if_update(&f);
984
985 if (!scan)
986 if_end_partial_update(ifi);
987 }
988 }
989
990 static void
991 nl_parse_addr4(struct ifaddrmsg *i, int scan, int new)
992 {
993 struct rtattr *a[BIRD_IFA_MAX];
994 struct iface *ifi;
995 u32 ifa_flags;
996 int scope;
997
998 if (!nl_parse_attrs(IFA_RTA(i), ifa_attr_want4, a, sizeof(a)))
999 return;
1000
1001 if (!a[IFA_LOCAL])
1002 {
1003 log(L_ERR "KIF: Malformed message received (missing IFA_LOCAL)");
1004 return;
1005 }
1006 if (!a[IFA_ADDRESS])
1007 {
1008 log(L_ERR "KIF: Malformed message received (missing IFA_ADDRESS)");
1009 return;
1010 }
1011
1012 ifi = if_find_by_index(i->ifa_index);
1013 if (!ifi)
1014 {
1015 log(L_ERR "KIF: Received address message for unknown interface %d", i->ifa_index);
1016 return;
1017 }
1018
1019 if (a[IFA_FLAGS])
1020 ifa_flags = rta_get_u32(a[IFA_FLAGS]);
1021 else
1022 ifa_flags = i->ifa_flags;
1023
1024 struct ifa ifa;
1025 bzero(&ifa, sizeof(ifa));
1026 ifa.iface = ifi;
1027 if (ifa_flags & IFA_F_SECONDARY)
1028 ifa.flags |= IA_SECONDARY;
1029
1030 ifa.ip = rta_get_ipa(a[IFA_LOCAL]);
1031
1032 if (i->ifa_prefixlen > IP4_MAX_PREFIX_LENGTH)
1033 {
1034 log(L_ERR "KIF: Invalid prefix length for interface %s: %d", ifi->name, i->ifa_prefixlen);
1035 new = 0;
1036 }
1037 if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH)
1038 {
1039 ifa.brd = rta_get_ipa(a[IFA_ADDRESS]);
1040 net_fill_ip4(&ifa.prefix, rta_get_ip4(a[IFA_ADDRESS]), i->ifa_prefixlen);
1041
1042 /* It is either a host address or a peer address */
1043 if (ipa_equal(ifa.ip, ifa.brd))
1044 ifa.flags |= IA_HOST;
1045 else
1046 {
1047 ifa.flags |= IA_PEER;
1048 ifa.opposite = ifa.brd;
1049 }
1050 }
1051 else
1052 {
1053 net_fill_ip4(&ifa.prefix, ipa_to_ip4(ifa.ip), i->ifa_prefixlen);
1054 net_normalize(&ifa.prefix);
1055
1056 if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH - 1)
1057 ifa.opposite = ipa_opposite_m1(ifa.ip);
1058
1059 if (i->ifa_prefixlen == IP4_MAX_PREFIX_LENGTH - 2)
1060 ifa.opposite = ipa_opposite_m2(ifa.ip);
1061
1062 if (ifi->flags & IF_BROADCAST)
1063 {
1064 /* If kernel offers us a broadcast address, we trust it */
1065 if (a[IFA_BROADCAST])
1066 ifa.brd = ipa_from_ip4(rta_get_ip4(a[IFA_BROADCAST]));
1067 /* Otherwise we create one (except for /31) */
1068 else if (i->ifa_prefixlen < (IP4_MAX_PREFIX_LENGTH - 1))
1069 ifa.brd = ipa_from_ip4(ip4_or(ipa_to_ip4(ifa.ip),
1070 ip4_not(ip4_mkmask(i->ifa_prefixlen))));
1071 }
1072 }
1073
1074 scope = ipa_classify(ifa.ip);
1075 if (scope < 0)
1076 {
1077 log(L_ERR "KIF: Invalid interface address %I for %s", ifa.ip, ifi->name);
1078 return;
1079 }
1080 ifa.scope = scope & IADDR_SCOPE_MASK;
1081
1082 DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n",
1083 ifi->index, ifi->name,
1084 new ? "added" : "removed",
1085 ifa.ip, ifa.flags, &ifa.prefix, ifa.brd, ifa.opposite);
1086
1087 if (new)
1088 ifa_update(&ifa);
1089 else
1090 ifa_delete(&ifa);
1091
1092 if (!scan)
1093 if_end_partial_update(ifi);
1094 }
1095
1096 static void
1097 nl_parse_addr6(struct ifaddrmsg *i, int scan, int new)
1098 {
1099 struct rtattr *a[BIRD_IFA_MAX];
1100 struct iface *ifi;
1101 u32 ifa_flags;
1102 int scope;
1103
1104 if (!nl_parse_attrs(IFA_RTA(i), ifa_attr_want6, a, sizeof(a)))
1105 return;
1106
1107 if (!a[IFA_ADDRESS])
1108 {
1109 log(L_ERR "KIF: Malformed message received (missing IFA_ADDRESS)");
1110 return;
1111 }
1112
1113 ifi = if_find_by_index(i->ifa_index);
1114 if (!ifi)
1115 {
1116 log(L_ERR "KIF: Received address message for unknown interface %d", i->ifa_index);
1117 return;
1118 }
1119
1120 if (a[IFA_FLAGS])
1121 ifa_flags = rta_get_u32(a[IFA_FLAGS]);
1122 else
1123 ifa_flags = i->ifa_flags;
1124
1125 struct ifa ifa;
1126 bzero(&ifa, sizeof(ifa));
1127 ifa.iface = ifi;
1128 if (ifa_flags & IFA_F_SECONDARY)
1129 ifa.flags |= IA_SECONDARY;
1130
1131 /* Ignore tentative addresses silently */
1132 if (ifa_flags & IFA_F_TENTATIVE)
1133 return;
1134
1135 /* IFA_LOCAL can be unset for IPv6 interfaces */
1136 ifa.ip = rta_get_ipa(a[IFA_LOCAL] ? : a[IFA_ADDRESS]);
1137
1138 if (i->ifa_prefixlen > IP6_MAX_PREFIX_LENGTH)
1139 {
1140 log(L_ERR "KIF: Invalid prefix length for interface %s: %d", ifi->name, i->ifa_prefixlen);
1141 new = 0;
1142 }
1143 if (i->ifa_prefixlen == IP6_MAX_PREFIX_LENGTH)
1144 {
1145 ifa.brd = rta_get_ipa(a[IFA_ADDRESS]);
1146 net_fill_ip6(&ifa.prefix, rta_get_ip6(a[IFA_ADDRESS]), i->ifa_prefixlen);
1147
1148 /* It is either a host address or a peer address */
1149 if (ipa_equal(ifa.ip, ifa.brd))
1150 ifa.flags |= IA_HOST;
1151 else
1152 {
1153 ifa.flags |= IA_PEER;
1154 ifa.opposite = ifa.brd;
1155 }
1156 }
1157 else
1158 {
1159 net_fill_ip6(&ifa.prefix, ipa_to_ip6(ifa.ip), i->ifa_prefixlen);
1160 net_normalize(&ifa.prefix);
1161
1162 if (i->ifa_prefixlen == IP6_MAX_PREFIX_LENGTH - 1)
1163 ifa.opposite = ipa_opposite_m1(ifa.ip);
1164 }
1165
1166 scope = ipa_classify(ifa.ip);
1167 if (scope < 0)
1168 {
1169 log(L_ERR "KIF: Invalid interface address %I for %s", ifa.ip, ifi->name);
1170 return;
1171 }
1172 ifa.scope = scope & IADDR_SCOPE_MASK;
1173
1174 DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n",
1175 ifi->index, ifi->name,
1176 new ? "added" : "removed",
1177 ifa.ip, ifa.flags, &ifa.prefix, ifa.brd, ifa.opposite);
1178
1179 if (new)
1180 ifa_update(&ifa);
1181 else
1182 ifa_delete(&ifa);
1183
1184 if (!scan)
1185 if_end_partial_update(ifi);
1186 }
1187
1188 static void
1189 nl_parse_addr(struct nlmsghdr *h, int scan)
1190 {
1191 struct ifaddrmsg *i;
1192
1193 if (!(i = nl_checkin(h, sizeof(*i))))
1194 return;
1195
1196 int new = (h->nlmsg_type == RTM_NEWADDR);
1197
1198 switch (i->ifa_family)
1199 {
1200 case AF_INET:
1201 return nl_parse_addr4(i, scan, new);
1202
1203 case AF_INET6:
1204 return nl_parse_addr6(i, scan, new);
1205 }
1206 }
1207
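/*
 * kif_do_scan() dumps all links, re-resolves master interfaces for slave
 * devices, then dumps IPv4 and IPv6 addresses, all within one interface
 * update transaction.
 */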
1208 void
1209 kif_do_scan(struct kif_proto *p UNUSED)
1210 {
1211 struct nlmsghdr *h;
1212
1213 if_start_update();
1214
1215 nl_request_dump_link();
1216 while (h = nl_get_scan())
1217 if (h->nlmsg_type == RTM_NEWLINK || h->nlmsg_type == RTM_DELLINK)
1218 nl_parse_link(h, 1);
1219 else
1220 log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1221
1222 /* Re-resolve master interface for slaves */
1223 struct iface *i;
1224 WALK_LIST(i, iface_list)
1225 if (i->master_index)
1226 {
1227 struct iface f = {
1228 .flags = i->flags,
1229 .mtu = i->mtu,
1230 .index = i->index,
1231 .master_index = i->master_index,
1232 .master = if_find_by_index(i->master_index)
1233 };
1234
1235 if (f.master != i->master)
1236 {
1237 memcpy(f.name, i->name, sizeof(f.name));
1238 if_update(&f);
1239 }
1240 }
1241
1242 nl_request_dump_addr(AF_INET);
1243 while (h = nl_get_scan())
1244 if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR)
1245 nl_parse_addr(h, 1);
1246 else
1247 log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1248
1249 nl_request_dump_addr(AF_INET6);
1250 while (h = nl_get_scan())
1251 if (h->nlmsg_type == RTM_NEWADDR || h->nlmsg_type == RTM_DELADDR)
1252 nl_parse_addr(h, 1);
1253 else
1254 log(L_DEBUG "nl_scan_ifaces: Unknown packet received (type=%d)", h->nlmsg_type);
1255
1256 if_end_update();
1257 }
1258
1259 /*
1260 * Routes
1261 */
1262
1263 static inline u32
1264 krt_table_id(struct krt_proto *p)
1265 {
1266 return KRT_CF->sys.table_id;
1267 }
1268
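/*
 * nl_table_map maps (address family, kernel table id) to the krt protocol
 * instance serving that table, so routes received in nl_parse_route() can
 * be dispatched to the proper protocol.
 */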
1269 static HASH(struct krt_proto) nl_table_map;
1270
1271 #define RTH_KEY(p) p->af, krt_table_id(p)
1272 #define RTH_NEXT(p) p->sys.hash_next
1273 #define RTH_EQ(a1,i1,a2,i2) a1 == a2 && i1 == i2
1274 #define RTH_FN(a,i) a ^ u32_hash(i)
1275
1276 #define RTH_REHASH rth_rehash
1277 #define RTH_PARAMS /8, *2, 2, 2, 6, 20
1278
1279 HASH_DEFINE_REHASH_FN(RTH, struct krt_proto)
1280
1281 int
1282 krt_capable(rte *e)
1283 {
1284 rta *a = e->attrs;
1285
1286 switch (a->dest)
1287 {
1288 case RTD_UNICAST:
1289 case RTD_BLACKHOLE:
1290 case RTD_UNREACHABLE:
1291 case RTD_PROHIBIT:
1292 return 1;
1293
1294 default:
1295 return 0;
1296 }
1297 }
1298
1299 static inline int
1300 nh_bufsize(struct nexthop *nh)
1301 {
1302 int rv = 0;
1303 for (; nh != NULL; nh = nh->next)
1304 rv += RTNH_LENGTH(RTA_LENGTH(sizeof(ip_addr)));
1305 return rv;
1306 }
1307
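/*
 * nl_send_route() builds an RTM_NEWROUTE/RTM_DELROUTE message in a stack
 * buffer sized with the help of nh_bufsize(), fills in table, metric,
 * scope, prefsrc, realm and nexthop attributes as needed, and sends it
 * synchronously through nl_exchange().
 */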
1308 static int
1309 nl_send_route(struct krt_proto *p, rte *e, int op)
1310 {
1311 eattr *ea;
1312 net *net = e->net;
1313 rta *a = e->attrs;
1314 ea_list *eattrs = a->eattrs;
1315 int bufsize = 128 + KRT_METRICS_MAX*8 + nh_bufsize(&(a->nh));
1316 u32 priority = 0;
1317
1318 struct {
1319 struct nlmsghdr h;
1320 struct rtmsg r;
1321 char buf[0];
1322 } *r;
1323
1324 int rsize = sizeof(*r) + bufsize;
1325 r = alloca(rsize);
1326
1327 DBG("nl_send_route(%N,op=%x)\n", net->n.addr, op);
1328
1329 bzero(&r->h, sizeof(r->h));
1330 bzero(&r->r, sizeof(r->r));
1331 r->h.nlmsg_type = op ? RTM_NEWROUTE : RTM_DELROUTE;
1332 r->h.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
1333 r->h.nlmsg_flags = op | NLM_F_REQUEST | NLM_F_ACK;
1334
1335 r->r.rtm_family = p->af;
1336 r->r.rtm_dst_len = net_pxlen(net->n.addr);
1337 r->r.rtm_protocol = RTPROT_BIRD;
1338 r->r.rtm_scope = RT_SCOPE_NOWHERE;
1339 #ifdef HAVE_MPLS_KERNEL
1340 if (p->af == AF_MPLS)
1341 {
1342 /*
1343 * Kernel MPLS code is a bit picky. We must:
1344 * 1) Always set RT_SCOPE_UNIVERSE and RTN_UNICAST (even for RTM_DELROUTE)
1345 * 2) Never use RTA_PRIORITY
1346 */
1347
1348 u32 label = net_mpls(net->n.addr);
1349 nl_add_attr_mpls(&r->h, rsize, RTA_DST, 1, &label);
1350 r->r.rtm_scope = RT_SCOPE_UNIVERSE;
1351 r->r.rtm_type = RTN_UNICAST;
1352 }
1353 else
1354 #endif
1355 {
1356 nl_add_attr_ipa(&r->h, rsize, RTA_DST, net_prefix(net->n.addr));
1357
1358 /* Add source address for IPv6 SADR routes */
1359 if (net->n.addr->type == NET_IP6_SADR)
1360 {
1361 net_addr_ip6_sadr *a = (void *) &net->n.addr;
1362 nl_add_attr_ip6(&r->h, rsize, RTA_SRC, a->src_prefix);
1363 r->r.rtm_src_len = a->src_pxlen;
1364 }
1365 }
1366
1367 /*
1368 * Strange behavior for RTM_DELROUTE:
1369 * 1) rtm_family is ignored in IPv6, works for IPv4
1370 * 2) not setting RTA_PRIORITY is different from setting default value (on IPv6)
1371 * 3) not setting RTA_PRIORITY is equivalent to setting 0, which is wildcard
1372 */
1373
1374 if (krt_table_id(p) < 256)
1375 r->r.rtm_table = krt_table_id(p);
1376 else
1377 nl_add_attr_u32(&r->h, rsize, RTA_TABLE, krt_table_id(p));
1378
1379 if (p->af == AF_MPLS)
1380 priority = 0;
1381 else if (a->source == RTS_DUMMY)
1382 priority = e->u.krt.metric;
1383 else if (KRT_CF->sys.metric)
1384 priority = KRT_CF->sys.metric;
1385 else if ((op != NL_OP_DELETE) && (ea = ea_find(eattrs, EA_KRT_METRIC)))
1386 priority = ea->u.data;
1387
1388 if (priority)
1389 nl_add_attr_u32(&r->h, rsize, RTA_PRIORITY, priority);
1390
1391 /* For route delete, we do not specify remaining route attributes */
1392 if (op == NL_OP_DELETE)
1393 goto done;
1394
1395 /* Default scope is LINK for device routes, UNIVERSE otherwise */
1396 if (p->af == AF_MPLS)
1397 r->r.rtm_scope = RT_SCOPE_UNIVERSE;
1398 else if (ea = ea_find(eattrs, EA_KRT_SCOPE))
1399 r->r.rtm_scope = ea->u.data;
1400 else if (a->dest == RTD_UNICAST && ipa_zero(a->nh.gw))
1401 r->r.rtm_scope = RT_SCOPE_LINK;
1402 else
1403 r->r.rtm_scope = RT_SCOPE_UNIVERSE;
1404
1405 if (ea = ea_find(eattrs, EA_KRT_PREFSRC))
1406 nl_add_attr_ipa(&r->h, rsize, RTA_PREFSRC, *(ip_addr *)ea->u.ptr->data);
1407
1408 if (ea = ea_find(eattrs, EA_KRT_REALM))
1409 nl_add_attr_u32(&r->h, rsize, RTA_FLOW, ea->u.data);
1410
1411
1412 u32 metrics[KRT_METRICS_MAX];
1413 metrics[0] = 0;
1414
1415 struct ea_walk_state ews = { .eattrs = eattrs };
1416 while (ea = ea_walk(&ews, EA_KRT_METRICS, KRT_METRICS_MAX))
1417 {
1418 int id = ea->id - EA_KRT_METRICS;
1419 metrics[0] |= 1 << id;
1420 metrics[id] = ea->u.data;
1421 }
1422
1423 if (metrics[0])
1424 nl_add_metrics(&r->h, rsize, metrics, KRT_METRICS_MAX);
1425
1426 switch (a->dest)
1427 {
1428 case RTD_UNICAST:
1429 r->r.rtm_type = RTN_UNICAST;
1430 struct nexthop *nh = &(a->nh);
1431 if (nh->next)
1432 nl_add_multipath(&r->h, rsize, nh, p->af, eattrs);
1433 else
1434 {
1435 nl_add_attr_u32(&r->h, rsize, RTA_OIF, nh->iface->index);
1436 nl_add_nexthop(&r->h, rsize, nh, p->af);
1437
1438 if (nh->flags & RNF_ONLINK)
1439 r->r.rtm_flags |= RTNH_F_ONLINK;
1440 }
1441 break;
1442 case RTD_BLACKHOLE:
1443 r->r.rtm_type = RTN_BLACKHOLE;
1444 break;
1445 case RTD_UNREACHABLE:
1446 r->r.rtm_type = RTN_UNREACHABLE;
1447 break;
1448 case RTD_PROHIBIT:
1449 r->r.rtm_type = RTN_PROHIBIT;
1450 break;
1451 case RTD_NONE:
1452 break;
1453 default:
1454 bug("krt_capable inconsistent with nl_send_route");
1455 }
1456
1457 done:
1458 /* Ignore missing for DELETE */
1459 return nl_exchange(&r->h, (op == NL_OP_DELETE));
1460 }
1461
1462 void
1463 krt_replace_rte(struct krt_proto *p, net *n UNUSED, rte *new, rte *old)
1464 {
1465 int err = 0;
1466
1467 if (old && new)
1468 {
1469 err = nl_send_route(p, new, NL_OP_REPLACE);
1470 }
1471 else
1472 {
1473 if (old)
1474 nl_send_route(p, old, NL_OP_DELETE);
1475
1476 if (new)
1477 err = nl_send_route(p, new, NL_OP_ADD);
1478 }
1479
1480 if (new)
1481 {
1482 if (err < 0)
1483 bmap_clear(&p->sync_map, new->id);
1484 else
1485 bmap_set(&p->sync_map, new->id);
1486 }
1487 }
1488
1489
1490 #define SKIP0(ARG, ...) do { DBG("KRT: Ignoring route - " ARG, ##__VA_ARGS__); return; } while(0)
1491 #define SKIP(ARG, ...) do { DBG("KRT: Ignoring route %N - " ARG, &dst, ##__VA_ARGS__); return; } while(0)
1492
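/*
 * nl_parse_route() converts one RTM_NEWROUTE/RTM_DELROUTE message into a
 * BIRD route: it decodes the destination (and SADR source prefix), looks
 * up the owning krt protocol by table id, filters out routes we do not
 * handle, builds the rta including nexthops and kernel attributes, and
 * passes the result to krt_got_route() or krt_got_route_async().
 */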
1493 static void
1494 nl_parse_route(struct nl_parse_state *s, struct nlmsghdr *h)
1495 {
1496 struct krt_proto *p;
1497 struct rtmsg *i;
1498 struct rtattr *a[BIRD_RTA_MAX];
1499 int new = h->nlmsg_type == RTM_NEWROUTE;
1500
1501 net_addr dst, src = {};
1502 u32 oif = ~0;
1503 u32 table_id;
1504 u32 priority = 0;
1505 u32 def_scope = RT_SCOPE_UNIVERSE;
1506 int krt_src;
1507
1508 if (!(i = nl_checkin(h, sizeof(*i))))
1509 return;
1510
1511 switch (i->rtm_family)
1512 {
1513 case AF_INET:
1514 if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want4, a, sizeof(a)))
1515 return;
1516
1517 if (a[RTA_DST])
1518 net_fill_ip4(&dst, rta_get_ip4(a[RTA_DST]), i->rtm_dst_len);
1519 else
1520 net_fill_ip4(&dst, IP4_NONE, 0);
1521 break;
1522
1523 case AF_INET6:
1524 if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want6, a, sizeof(a)))
1525 return;
1526
1527 if (a[RTA_DST])
1528 net_fill_ip6(&dst, rta_get_ip6(a[RTA_DST]), i->rtm_dst_len);
1529 else
1530 net_fill_ip6(&dst, IP6_NONE, 0);
1531
1532 if (a[RTA_SRC])
1533 net_fill_ip6(&src, rta_get_ip6(a[RTA_SRC]), i->rtm_src_len);
1534 else
1535 net_fill_ip6(&src, IP6_NONE, 0);
1536 break;
1537
1538 #ifdef HAVE_MPLS_KERNEL
1539 case AF_MPLS:
1540 if (!nl_parse_attrs(RTM_RTA(i), rtm_attr_want_mpls, a, sizeof(a)))
1541 return;
1542
1543 if (!a[RTA_DST])
1544 SKIP0("MPLS route without RTA_DST\n");
1545
1546 if (rta_get_mpls(a[RTA_DST], rta_mpls_stack) != 1)
1547 SKIP0("MPLS route with multi-label RTA_DST\n");
1548
1549 net_fill_mpls(&dst, rta_mpls_stack[0]);
1550 break;
1551 #endif
1552
1553 default:
1554 return;
1555 }
1556
1557 if (a[RTA_OIF])
1558 oif = rta_get_u32(a[RTA_OIF]);
1559
1560 if (a[RTA_TABLE])
1561 table_id = rta_get_u32(a[RTA_TABLE]);
1562 else
1563 table_id = i->rtm_table;
1564
1565 if (i->rtm_flags & RTM_F_CLONED)
1566 SKIP("cloned\n");
1567
1568 /* Do we know this table? */
1569 p = HASH_FIND(nl_table_map, RTH, i->rtm_family, table_id);
1570 if (!p)
1571 SKIP("unknown table %u\n", table_id);
1572
1573 if (a[RTA_SRC] && (p->p.net_type != NET_IP6_SADR))
1574 SKIP("src prefix for non-SADR channel\n");
1575
1576 if (a[RTA_IIF])
1577 SKIP("IIF set\n");
1578
1579 if (i->rtm_tos != 0) /* We don't support TOS */
1580 SKIP("TOS %02x\n", i->rtm_tos);
1581
1582 if (s->scan && !new)
1583 SKIP("RTM_DELROUTE in scan\n");
1584
1585 if (a[RTA_PRIORITY])
1586 priority = rta_get_u32(a[RTA_PRIORITY]);
1587
1588 int c = net_classify(&dst);
1589 if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
1590 SKIP("strange class/scope\n");
1591
1592 switch (i->rtm_protocol)
1593 {
1594 case RTPROT_UNSPEC:
1595 SKIP("proto unspec\n");
1596
1597 case RTPROT_REDIRECT:
1598 krt_src = KRT_SRC_REDIRECT;
1599 break;
1600
1601 case RTPROT_KERNEL:
1602 krt_src = KRT_SRC_KERNEL;
1603 return;
1604
1605 case RTPROT_BIRD:
1606 if (!s->scan)
1607 SKIP("echo\n");
1608 krt_src = KRT_SRC_BIRD;
1609 break;
1610
1611 case RTPROT_BOOT:
1612 default:
1613 krt_src = KRT_SRC_ALIEN;
1614 }
1615
1616 net_addr *n = &dst;
1617 if (p->p.net_type == NET_IP6_SADR)
1618 {
1619 n = alloca(sizeof(net_addr_ip6_sadr));
1620 net_fill_ip6_sadr(n, net6_prefix(&dst), net6_pxlen(&dst),
1621 net6_prefix(&src), net6_pxlen(&src));
1622 }
1623
1624 net *net = net_get(p->p.main_channel->table, n);
1625
1626 rta *ra = lp_allocz(s->pool, RTA_MAX_SIZE);
1627 ra->src = p->p.main_source;
1628 ra->source = RTS_INHERIT;
1629 ra->scope = SCOPE_UNIVERSE;
1630
1631 if (a[RTA_FLOW])
1632 s->rta_flow = rta_get_u32(a[RTA_FLOW]);
1633 else
1634 s->rta_flow = 0;
1635
1636 switch (i->rtm_type)
1637 {
1638 case RTN_UNICAST:
1639 ra->dest = RTD_UNICAST;
1640
1641 if (a[RTA_MULTIPATH])
1642 {
1643 struct nexthop *nh = nl_parse_multipath(s, p, n, a[RTA_MULTIPATH], i->rtm_family, krt_src);
1644 if (!nh)
1645 SKIP("strange RTA_MULTIPATH\n");
1646
1647 nexthop_link(ra, nh);
1648 break;
1649 }
1650
1651 if ((i->rtm_flags & RTNH_F_DEAD) && (krt_src != KRT_SRC_BIRD))
1652 SKIP("ignore RTNH_F_DEAD\n");
1653
1654 ra->nh.iface = if_find_by_index(oif);
1655 if (!ra->nh.iface)
1656 {
1657 log(L_ERR "KRT: Received route %N with unknown ifindex %u", net->n.addr, oif);
1658 return;
1659 }
1660
1661 if (a[RTA_GATEWAY])
1662 ra->nh.gw = rta_get_ipa(a[RTA_GATEWAY]);
1663
1664 #ifdef HAVE_MPLS_KERNEL
1665 if (a[RTA_VIA])
1666 ra->nh.gw = rta_get_via(a[RTA_VIA]);
1667 #endif
1668
1669 if (ipa_nonzero(ra->nh.gw))
1670 {
1671 /* Silently skip strange 6to4 routes */
1672 const net_addr_ip6 sit = NET_ADDR_IP6(IP6_NONE, 96);
1673 if ((i->rtm_family == AF_INET6) && ipa_in_netX(ra->nh.gw, (net_addr *) &sit))
1674 return;
1675
1676 if (i->rtm_flags & RTNH_F_ONLINK)
1677 ra->nh.flags |= RNF_ONLINK;
1678
1679 neighbor *nbr;
1680 nbr = neigh_find(&p->p, ra->nh.gw, ra->nh.iface,
1681 (ra->nh.flags & RNF_ONLINK) ? NEF_ONLINK : 0);
1682 if (!nbr || (nbr->scope == SCOPE_HOST))
1683 {
1684 log(L_ERR "KRT: Received route %N with strange next-hop %I", net->n.addr,
1685 ra->nh.gw);
1686 return;
1687 }
1688 }
1689
1690 break;
1691 case RTN_BLACKHOLE:
1692 ra->dest = RTD_BLACKHOLE;
1693 break;
1694 case RTN_UNREACHABLE:
1695 ra->dest = RTD_UNREACHABLE;
1696 break;
1697 case RTN_PROHIBIT:
1698 ra->dest = RTD_PROHIBIT;
1699 break;
1700 /* FIXME: What about RTN_THROW? */
1701 default:
1702 SKIP("type %d\n", i->rtm_type);
1703 return;
1704 }
1705
1706 #ifdef HAVE_MPLS_KERNEL
1707 if ((i->rtm_family == AF_MPLS) && a[RTA_NEWDST] && !ra->nh.next)
1708 ra->nh.labels = rta_get_mpls(a[RTA_NEWDST], ra->nh.label);
1709
1710 if (a[RTA_ENCAP] && a[RTA_ENCAP_TYPE] && !ra->nh.next)
1711 {
1712 switch (rta_get_u16(a[RTA_ENCAP_TYPE]))
1713 {
1714 case LWTUNNEL_ENCAP_MPLS:
1715 {
1716 struct rtattr *enca[BIRD_RTA_MAX];
1717 nl_attr_len = RTA_PAYLOAD(a[RTA_ENCAP]);
1718 nl_parse_attrs(RTA_DATA(a[RTA_ENCAP]), encap_mpls_want, enca, sizeof(enca));
1719 ra->nh.labels = rta_get_mpls(enca[RTA_DST], ra->nh.label);
1720 break;
1721 }
1722 default:
1723 SKIP("unknown encapsulation method %d\n", rta_get_u16(a[RTA_ENCAP_TYPE]));
1724 break;
1725 }
1726 }
1727 #endif
1728
1729 if (i->rtm_scope != def_scope)
1730 {
1731 ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1732 ea->next = ra->eattrs;
1733 ra->eattrs = ea;
1734 ea->flags = EALF_SORTED;
1735 ea->count = 1;
1736 ea->attrs[0].id = EA_KRT_SCOPE;
1737 ea->attrs[0].flags = 0;
1738 ea->attrs[0].type = EAF_TYPE_INT;
1739 ea->attrs[0].u.data = i->rtm_scope;
1740 }
1741
1742 if (a[RTA_PREFSRC])
1743 {
1744 ip_addr ps = rta_get_ipa(a[RTA_PREFSRC]);
1745
1746 ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1747 ea->next = ra->eattrs;
1748 ra->eattrs = ea;
1749 ea->flags = EALF_SORTED;
1750 ea->count = 1;
1751 ea->attrs[0].id = EA_KRT_PREFSRC;
1752 ea->attrs[0].flags = 0;
1753 ea->attrs[0].type = EAF_TYPE_IP_ADDRESS;
1754
1755 struct adata *ad = lp_alloc(s->pool, sizeof(struct adata) + sizeof(ps));
1756 ad->length = sizeof(ps);
1757 memcpy(ad->data, &ps, sizeof(ps));
1758
1759 ea->attrs[0].u.ptr = ad;
1760 }
1761
1762 /* Can be set per-route or per-nexthop */
1763 if (s->rta_flow)
1764 {
1765 ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + sizeof(eattr));
1766 ea->next = ra->eattrs;
1767 ra->eattrs = ea;
1768 ea->flags = EALF_SORTED;
1769 ea->count = 1;
1770 ea->attrs[0].id = EA_KRT_REALM;
1771 ea->attrs[0].flags = 0;
1772 ea->attrs[0].type = EAF_TYPE_INT;
1773 ea->attrs[0].u.data = s->rta_flow;
1774 }
1775
1776 if (a[RTA_METRICS])
1777 {
1778 u32 metrics[KRT_METRICS_MAX];
1779 ea_list *ea = lp_alloc(s->pool, sizeof(ea_list) + KRT_METRICS_MAX * sizeof(eattr));
1780 int t, n = 0;
1781
1782 if (nl_parse_metrics(a[RTA_METRICS], metrics, ARRAY_SIZE(metrics)) < 0)
1783 {
1784 log(L_ERR "KRT: Received route %N with strange RTA_METRICS attribute", net->n.addr);
1785 return;
1786 }
1787
1788 for (t = 1; t < KRT_METRICS_MAX; t++)
1789 if (metrics[0] & (1 << t))
1790 {
1791 ea->attrs[n].id = EA_CODE(PROTOCOL_KERNEL, KRT_METRICS_OFFSET + t);
1792 ea->attrs[n].flags = 0;
1793 ea->attrs[n].type = EAF_TYPE_INT; /* FIXME: Some are EAF_TYPE_BITFIELD */
1794 ea->attrs[n].u.data = metrics[t];
1795 n++;
1796 }
1797
1798 if (n > 0)
1799 {
1800 ea->next = ra->eattrs;
1801 ea->flags = EALF_SORTED;
1802 ea->count = n;
1803 ra->eattrs = ea;
1804 }
1805 }
1806
1807 rte *e = rte_get_temp(ra);
1808 e->net = net;
1809 e->u.krt.src = krt_src;
1810 e->u.krt.proto = i->rtm_protocol;
1811 e->u.krt.seen = 0;
1812 e->u.krt.best = 0;
1813 e->u.krt.metric = priority;
1814
1815 if (s->scan)
1816 krt_got_route(p, e);
1817 else
1818 krt_got_route_async(p, e, new);
1819
1820 lp_flush(s->pool);
1821 }
1822
1823 void
1824 krt_do_scan(struct krt_proto *p)
1825 {
1826 struct nl_parse_state s = {
1827 .proto = p,
1828 .pool = nl_linpool,
1829 .scan = 1,
1830 };
1831
1832 /* Table-specific scan or shared scan */
1833 if (p)
1834 nl_request_dump_route(p->af, krt_table_id(p));
1835 else
1836 nl_request_dump_route(AF_UNSPEC, 0);
1837
1838 struct nlmsghdr *h;
1839 while (h = nl_get_scan())
1840 {
1841 if (h->nlmsg_type == RTM_NEWROUTE || h->nlmsg_type == RTM_DELROUTE)
1842 nl_parse_route(&s, h);
1843 else
1844 log(L_DEBUG "nl_scan_fire: Unknown packet received (type=%d)", h->nlmsg_type);
1845 }
1846 }
1847
1848 /*
1849 * Asynchronous Netlink interface
1850 */
1851
1852 static sock *nl_async_sk; /* BIRD socket for asynchronous notifications */
1853 static byte *nl_async_rx_buffer; /* Receive buffer */
1854 static uint nl_async_bufsize; /* Kernel rx buffer size for the netlink socket */
1855 static struct config *nl_last_config; /* For tracking changes to nl_async_bufsize */
1856
1857 static void
1858 nl_async_msg(struct nlmsghdr *h)
1859 {
1860 struct nl_parse_state s = {
1861 .proto = NULL,
1862 .pool = nl_linpool,
1863 .scan = 0,
1864 };
1865
1866 switch (h->nlmsg_type)
1867 {
1868 case RTM_NEWROUTE:
1869 case RTM_DELROUTE:
1870 DBG("KRT: Received async route notification (%d)\n", h->nlmsg_type);
1871 nl_parse_route(&s, h);
1872 break;
1873 case RTM_NEWLINK:
1874 case RTM_DELLINK:
1875 DBG("KRT: Received async link notification (%d)\n", h->nlmsg_type);
1876 if (kif_proto)
1877 nl_parse_link(h, 0);
1878 break;
1879 case RTM_NEWADDR:
1880 case RTM_DELADDR:
1881 DBG("KRT: Received async address notification (%d)\n", h->nlmsg_type);
1882 if (kif_proto)
1883 nl_parse_addr(h, 0);
1884 break;
1885 default:
1886 DBG("KRT: Received unknown async notification (%d)\n", h->nlmsg_type);
1887 }
1888 }
1889
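/*
 * nl_async_hook() drains pending notifications from the asynchronous
 * netlink socket. ENOBUFS means the kernel dropped messages because the
 * receive buffer overflowed; the changes are picked up by the next
 * periodic scan.
 */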
1890 static int
1891 nl_async_hook(sock *sk, uint size UNUSED)
1892 {
1893 struct iovec iov = { nl_async_rx_buffer, NL_RX_SIZE };
1894 struct sockaddr_nl sa;
1895 struct msghdr m = {
1896 .msg_name = &sa,
1897 .msg_namelen = sizeof(sa),
1898 .msg_iov = &iov,
1899 .msg_iovlen = 1,
1900 };
1901 struct nlmsghdr *h;
1902 int x;
1903 uint len;
1904
1905 x = recvmsg(sk->fd, &m, 0);
1906 if (x < 0)
1907 {
1908 if (errno == ENOBUFS)
1909 {
1910 /*
1911 * Netlink reports that some packets have been thrown away.
1912 * One day we might react to this by requesting a route table
1913 * scan in the near future.
1914 */
1915 log(L_WARN "Kernel dropped some netlink messages, will resync on next scan.");
1916 return 1; /* More data are likely to be ready */
1917 }
1918 else if (errno != EWOULDBLOCK)
1919 log(L_ERR "Netlink recvmsg: %m");
1920 return 0;
1921 }
1922 if (sa.nl_pid) /* It isn't from the kernel */
1923 {
1924 DBG("Non-kernel packet\n");
1925 return 1;
1926 }
1927 h = (void *) nl_async_rx_buffer;
1928 len = x;
1929 if (m.msg_flags & MSG_TRUNC)
1930 {
1931 log(L_WARN "Netlink got truncated asynchronous message");
1932 return 1;
1933 }
1934 while (NLMSG_OK(h, len))
1935 {
1936 nl_async_msg(h);
1937 h = NLMSG_NEXT(h, len);
1938 }
1939 if (len)
1940 log(L_WARN "nl_async_hook: Found packet remnant of size %d", len);
1941 return 1;
1942 }
1943
1944 static void
1945 nl_async_err_hook(sock *sk, int e UNUSED)
1946 {
1947 nl_async_hook(sk, 0);
1948 }
1949
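/*
 * nl_open_async() opens the multicast netlink socket used for asynchronous
 * link, address and route notifications and registers it with the BIRD
 * event loop as an SK_MAGIC socket.
 */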
1950 static void
1951 nl_open_async(void)
1952 {
1953 sock *sk;
1954 struct sockaddr_nl sa;
1955 int fd;
1956
1957 if (nl_async_sk)
1958 return;
1959
1960 DBG("KRT: Opening async netlink socket\n");
1961
1962 fd = socket(PF_NETLINK, SOCK_RAW, NETLINK_ROUTE);
1963 if (fd < 0)
1964 {
1965 log(L_ERR "Unable to open asynchronous rtnetlink socket: %m");
1966 return;
1967 }
1968
1969 bzero(&sa, sizeof(sa));
1970 sa.nl_family = AF_NETLINK;
1971 sa.nl_groups = RTMGRP_LINK |
1972 RTMGRP_IPV4_IFADDR | RTMGRP_IPV4_ROUTE |
1973 RTMGRP_IPV6_IFADDR | RTMGRP_IPV6_ROUTE;
1974
1975 if (bind(fd, (struct sockaddr *) &sa, sizeof(sa)) < 0)
1976 {
1977 log(L_ERR "Unable to bind asynchronous rtnetlink socket: %m");
1978 close(fd);
1979 return;
1980 }
1981
1982 nl_async_rx_buffer = xmalloc(NL_RX_SIZE);
1983
1984 sk = nl_async_sk = sk_new(krt_pool);
1985 sk->type = SK_MAGIC;
1986 sk->rx_hook = nl_async_hook;
1987 sk->err_hook = nl_async_err_hook;
1988 sk->fd = fd;
1989 if (sk_open(sk) < 0)
1990 bug("Netlink: sk_open failed");
1991 }
1992
1993 static void
1994 nl_update_async_bufsize(void)
1995 {
1996 /* No async socket */
1997 if (!nl_async_sk)
1998 return;
1999
2000 /* Already reconfigured */
2001 if (nl_last_config == config)
2002 return;
2003
2004 /* Update netlink buffer size */
2005 uint bufsize = nl_cfg_rx_buffer_size(config);
2006 if (bufsize && (bufsize != nl_async_bufsize))
2007 {
2008 /* Log message for reconfigurations only */
2009 if (nl_last_config)
2010 log(L_INFO "KRT: Changing netlink rx buffer size to %u", bufsize);
2011
2012 nl_set_rcvbuf(nl_async_sk->fd, bufsize);
2013 nl_async_bufsize = bufsize;
2014 }
2015
2016 nl_last_config = config;
2017 }
2018
2019
2020 /*
2021 * Interface to the UNIX krt module
2022 */
2023
2024 void
2025 krt_sys_io_init(void)
2026 {
2027 nl_linpool = lp_new_default(krt_pool);
2028 HASH_INIT(nl_table_map, krt_pool, 6);
2029 }
2030
2031 int
2032 krt_sys_start(struct krt_proto *p)
2033 {
2034 struct krt_proto *old = HASH_FIND(nl_table_map, RTH, p->af, krt_table_id(p));
2035
2036 if (old)
2037 {
2038 log(L_ERR "%s: Kernel table %u already registered by %s",
2039 p->p.name, krt_table_id(p), old->p.name);
2040 return 0;
2041 }
2042
2043 HASH_INSERT2(nl_table_map, RTH, krt_pool, p);
2044
2045 nl_open();
2046 nl_open_async();
2047 nl_update_async_bufsize();
2048
2049 return 1;
2050 }
2051
2052 void
2053 krt_sys_shutdown(struct krt_proto *p)
2054 {
2055 nl_update_async_bufsize();
2056
2057 HASH_REMOVE2(nl_table_map, RTH, krt_pool, p);
2058 }
2059
2060 int
2061 krt_sys_reconfigure(struct krt_proto *p UNUSED, struct krt_config *n, struct krt_config *o)
2062 {
2063 nl_update_async_bufsize();
2064
2065 return (n->sys.table_id == o->sys.table_id) && (n->sys.metric == o->sys.metric);
2066 }
2067
2068 void
2069 krt_sys_init_config(struct krt_config *cf)
2070 {
2071 cf->sys.table_id = RT_TABLE_MAIN;
2072 cf->sys.metric = 32;
2073 }
2074
2075 void
2076 krt_sys_copy_config(struct krt_config *d, struct krt_config *s)
2077 {
2078 d->sys.table_id = s->sys.table_id;
2079 d->sys.metric = s->sys.metric;
2080 }
2081
2082 static const char *krt_metrics_names[KRT_METRICS_MAX] = {
2083 NULL, "lock", "mtu", "window", "rtt", "rttvar", "sstresh", "cwnd", "advmss",
2084 "reordering", "hoplimit", "initcwnd", "features", "rto_min", "initrwnd", "quickack"
2085 };
2086
2087 static const char *krt_features_names[KRT_FEATURES_MAX] = {
2088 "ecn", NULL, NULL, "allfrag"
2089 };
2090
2091 int
2092 krt_sys_get_attr(const eattr *a, byte *buf, int buflen UNUSED)
2093 {
2094 switch (a->id)
2095 {
2096 case EA_KRT_PREFSRC:
2097 bsprintf(buf, "prefsrc");
2098 return GA_NAME;
2099
2100 case EA_KRT_REALM:
2101 bsprintf(buf, "realm");
2102 return GA_NAME;
2103
2104 case EA_KRT_SCOPE:
2105 bsprintf(buf, "scope");
2106 return GA_NAME;
2107
2108 case EA_KRT_LOCK:
2109 buf += bsprintf(buf, "lock:");
2110 ea_format_bitfield(a, buf, buflen, krt_metrics_names, 2, KRT_METRICS_MAX);
2111 return GA_FULL;
2112
2113 case EA_KRT_FEATURES:
2114 buf += bsprintf(buf, "features:");
2115 ea_format_bitfield(a, buf, buflen, krt_features_names, 0, KRT_FEATURES_MAX);
2116 return GA_FULL;
2117
2118 default:;
2119 int id = (int)EA_ID(a->id) - KRT_METRICS_OFFSET;
2120 if (id > 0 && id < KRT_METRICS_MAX)
2121 {
2122 bsprintf(buf, "%s", krt_metrics_names[id]);
2123 return GA_NAME;
2124 }
2125
2126 return GA_UNKNOWN;
2127 }
2128 }
2129
2130
2131
2132 void
2133 kif_sys_start(struct kif_proto *p UNUSED)
2134 {
2135 nl_open();
2136 nl_open_async();
2137 }
2138
2139 void
2140 kif_sys_shutdown(struct kif_proto *p UNUSED)
2141 {
2142 }
2143
2144 int
2145 kif_update_sysdep_addr(struct iface *i UNUSED)
2146 {
2147 return 0;
2148 }