2 * BIRD -- Linux Netlink Interface
4 * (c) 1999--2000 Martin Mares <mj@ucw.cz>
6 * Can be freely distributed and used under the terms of the GNU GPL.
13 #include <sys/socket.h>
19 #include "nest/bird.h"
20 #include "nest/route.h"
21 #include "nest/protocol.h"
22 #include "nest/iface.h"
23 #include "lib/alloca.h"
24 #include "sysdep/unix/unix.h"
25 #include "sysdep/unix/krt.h"
26 #include "lib/socket.h"
27 #include "lib/string.h"
29 #include "conf/conf.h"
31 #include <asm/types.h>
33 #include <linux/netlink.h>
34 #include <linux/rtnetlink.h>
36 #ifdef HAVE_MPLS_KERNEL
37 #include <linux/lwtunnel.h>
40 #ifndef MSG_TRUNC /* Hack: Several versions of glibc miss this one :( */
41 #define MSG_TRUNC 0x20
49 #define IFF_LOWER_UP 0x10000
64 #ifndef RTA_ENCAP_TYPE
65 #define RTA_ENCAP_TYPE 21
72 #define krt_ipv4(p) ((p)->af == AF_INET)
73 #define krt_ecmp6(p) ((p)->af == AF_INET6)
75 const int rt_default_ecmp
= 16;
 * Structure nl_parse_state keeps the state of received route processing.
 * Ideally, we could just parse received Netlink messages independently and
 * immediately propagate received routes to the rest of BIRD, but older Linux
 * kernels (before version 4.11) represent and announce IPv6 ECMP routes not as
 * one route with multiple next hops (like RTA_MULTIPATH in IPv4 ECMP), but as a
 * sequence of routes with the same prefix. More recent kernels work as with IPv4.
 * Therefore, BIRD keeps the currently processed route in the nl_parse_state
 * structure and postpones its propagation until we expect it to be final; i.e.,
 * when a non-matching route is received or when the scan ends. When another
 * matching route is received, it is merged with the already processed route to
 * form an ECMP route. Note that merging is done only for IPv6 (merge == 1), but
 * the postponing is done in both cases (for simplicity). All IPv4 routes or IPv6
 * routes with RTA_MULTIPATH set are just considered non-matching.
 * This is ignored for asynchronous notifications (every notification is handled
 * as a separate route). It is not an issue for our routes, as we ignore such
 * notifications anyway. But importing alien IPv6 ECMP routes does not work
 * properly with older kernels.
 * Whatever the kernel version is, BIRD itself sends IPv6 ECMP routes to the
 * kernel as multiple routes for the same prefix (one route per next hop).
102 struct nl_parse_state
104 struct linpool
*pool
;
110 struct krt_proto
*proto
;
119 * Synchronous Netlink interface
126 byte
*rx_buffer
; /* Receive buffer */
127 struct nlmsghdr
*last_hdr
; /* Recently received packet */
131 #define NL_RX_SIZE 8192
133 #define NL_OP_DELETE 0
134 #define NL_OP_ADD (NLM_F_CREATE|NLM_F_EXCL)
135 #define NL_OP_REPLACE (NLM_F_CREATE|NLM_F_REPLACE)
136 #define NL_OP_APPEND (NLM_F_CREATE|NLM_F_APPEND)
138 static linpool
*nl_linpool
;
140 static struct nl_sock nl_scan
= {.fd
= -1}; /* Netlink socket for synchronous scan */
141 static struct nl_sock nl_req
= {.fd
= -1}; /* Netlink socket for requests */
144 nl_open_sock(struct nl_sock
*nl
)
148 nl
->fd
= socket(PF_NETLINK
, SOCK_RAW
, NETLINK_ROUTE
);
150 die("Unable to open rtnetlink socket: %m");
151 nl
->seq
= (u32
) (current_time() TO_S
); /* Or perhaps random_u32() ? */
152 nl
->rx_buffer
= xmalloc(NL_RX_SIZE
);
161 nl_open_sock(&nl_scan
);
162 nl_open_sock(&nl_req
);
166 nl_send(struct nl_sock
*nl
, struct nlmsghdr
*nh
)
168 struct sockaddr_nl sa
;
170 memset(&sa
, 0, sizeof(sa
));
171 sa
.nl_family
= AF_NETLINK
;
173 nh
->nlmsg_seq
= ++(nl
->seq
);
174 nh
->nlmsg_len
= NLMSG_ALIGN(nh
->nlmsg_len
);
175 if (sendto(nl
->fd
, nh
, nh
->nlmsg_len
, 0, (struct sockaddr
*)&sa
, sizeof(sa
)) < 0)
176 die("rtnetlink sendto: %m");
181 nl_request_dump(int af
, int cmd
)
187 .nh
.nlmsg_type
= cmd
,
188 .nh
.nlmsg_len
= sizeof(req
),
189 .nh
.nlmsg_flags
= NLM_F_REQUEST
| NLM_F_DUMP
,
192 nl_send(&nl_scan
, &req
.nh
);
195 static struct nlmsghdr
*
196 nl_get_reply(struct nl_sock
*nl
)
202 struct iovec iov
= { nl
->rx_buffer
, NL_RX_SIZE
};
203 struct sockaddr_nl sa
;
206 .msg_namelen
= sizeof(sa
),
210 int x
= recvmsg(nl
->fd
, &m
, 0);
212 die("nl_get_reply: %m");
213 if (sa
.nl_pid
) /* It isn't from the kernel */
215 DBG("Non-kernel packet\n");
219 nl
->last_hdr
= (void *) nl
->rx_buffer
;
220 if (m
.msg_flags
& MSG_TRUNC
)
221 bug("nl_get_reply: got truncated reply which should be impossible");
223 if (NLMSG_OK(nl
->last_hdr
, nl
->last_size
))
225 struct nlmsghdr
*h
= nl
->last_hdr
;
226 nl
->last_hdr
= NLMSG_NEXT(h
, nl
->last_size
);
227 if (h
->nlmsg_seq
!= nl
->seq
)
229 log(L_WARN
"nl_get_reply: Ignoring out of sequence netlink packet (%x != %x)",
230 h
->nlmsg_seq
, nl
->seq
);
236 log(L_WARN
"nl_get_reply: Found packet remnant of size %d", nl
->last_size
);
241 static struct tbf rl_netlink_err
= TBF_DEFAULT_LOG_LIMITS
;
244 nl_error(struct nlmsghdr
*h
, int ignore_esrch
)
249 if (h
->nlmsg_len
< NLMSG_LENGTH(sizeof(struct nlmsgerr
)))
251 log(L_WARN
"Netlink: Truncated error message received");
254 e
= (struct nlmsgerr
*) NLMSG_DATA(h
);
256 if (ec
&& !(ignore_esrch
&& (ec
== ESRCH
)))
257 log_rl(&rl_netlink_err
, L_WARN
"Netlink: %s", strerror(ec
));
261 static struct nlmsghdr
*
264 struct nlmsghdr
*h
= nl_get_reply(&nl_scan
);
266 if (h
->nlmsg_type
== NLMSG_DONE
)
268 if (h
->nlmsg_type
== NLMSG_ERROR
)
277 nl_exchange(struct nlmsghdr
*pkt
, int ignore_esrch
)
281 nl_send(&nl_req
, pkt
);
284 h
= nl_get_reply(&nl_req
);
285 if (h
->nlmsg_type
== NLMSG_ERROR
)
287 log(L_WARN
"nl_exchange: Unexpected reply received");
289 return nl_error(h
, ignore_esrch
) ? -1 : 0;
296 static int nl_attr_len
;
299 nl_checkin(struct nlmsghdr
*h
, int lsize
)
301 nl_attr_len
= h
->nlmsg_len
- NLMSG_LENGTH(lsize
);
304 log(L_ERR
"nl_checkin: underrun by %d bytes", -nl_attr_len
);
307 return NLMSG_DATA(h
);
310 struct nl_want_attrs
{
317 #define BIRD_IFLA_MAX (IFLA_WIRELESS+1)
319 static struct nl_want_attrs ifla_attr_want
[BIRD_IFLA_MAX
] = {
320 [IFLA_IFNAME
] = { 1, 0, 0 },
321 [IFLA_MTU
] = { 1, 1, sizeof(u32
) },
322 [IFLA_MASTER
] = { 1, 1, sizeof(u32
) },
323 [IFLA_WIRELESS
] = { 1, 0, 0 },
327 #define BIRD_IFA_MAX (IFA_FLAGS+1)
329 static struct nl_want_attrs ifa_attr_want4
[BIRD_IFA_MAX
] = {
330 [IFA_ADDRESS
] = { 1, 1, sizeof(ip4_addr
) },
331 [IFA_LOCAL
] = { 1, 1, sizeof(ip4_addr
) },
332 [IFA_BROADCAST
] = { 1, 1, sizeof(ip4_addr
) },
333 [IFA_FLAGS
] = { 1, 1, sizeof(u32
) },
336 static struct nl_want_attrs ifa_attr_want6
[BIRD_IFA_MAX
] = {
337 [IFA_ADDRESS
] = { 1, 1, sizeof(ip6_addr
) },
338 [IFA_LOCAL
] = { 1, 1, sizeof(ip6_addr
) },
339 [IFA_FLAGS
] = { 1, 1, sizeof(u32
) },
343 #define BIRD_RTA_MAX (RTA_ENCAP+1)
345 static struct nl_want_attrs nexthop_attr_want4
[BIRD_RTA_MAX
] = {
346 [RTA_GATEWAY
] = { 1, 1, sizeof(ip4_addr
) },
347 [RTA_VIA
] = { 1, 0, 0 },
348 [RTA_ENCAP_TYPE
]= { 1, 1, sizeof(u16
) },
349 [RTA_ENCAP
] = { 1, 0, 0 },
352 static struct nl_want_attrs nexthop_attr_want6
[BIRD_RTA_MAX
] = {
353 [RTA_GATEWAY
] = { 1, 1, sizeof(ip6_addr
) },
354 [RTA_VIA
] = { 1, 0, 0 },
355 [RTA_ENCAP_TYPE
]= { 1, 1, sizeof(u16
) },
356 [RTA_ENCAP
] = { 1, 0, 0 },
359 #ifdef HAVE_MPLS_KERNEL
360 static struct nl_want_attrs encap_mpls_want
[BIRD_RTA_MAX
] = {
361 [RTA_DST
] = { 1, 0, 0 },
365 static struct nl_want_attrs rtm_attr_want4
[BIRD_RTA_MAX
] = {
366 [RTA_DST
] = { 1, 1, sizeof(ip4_addr
) },
367 [RTA_OIF
] = { 1, 1, sizeof(u32
) },
368 [RTA_GATEWAY
] = { 1, 1, sizeof(ip4_addr
) },
369 [RTA_PRIORITY
] = { 1, 1, sizeof(u32
) },
370 [RTA_PREFSRC
] = { 1, 1, sizeof(ip4_addr
) },
371 [RTA_METRICS
] = { 1, 0, 0 },
372 [RTA_MULTIPATH
] = { 1, 0, 0 },
373 [RTA_FLOW
] = { 1, 1, sizeof(u32
) },
374 [RTA_TABLE
] = { 1, 1, sizeof(u32
) },
375 [RTA_VIA
] = { 1, 0, 0 },
376 [RTA_ENCAP_TYPE
]= { 1, 1, sizeof(u16
) },
377 [RTA_ENCAP
] = { 1, 0, 0 },
380 static struct nl_want_attrs rtm_attr_want6
[BIRD_RTA_MAX
] = {
381 [RTA_DST
] = { 1, 1, sizeof(ip6_addr
) },
382 [RTA_SRC
] = { 1, 1, sizeof(ip6_addr
) },
383 [RTA_IIF
] = { 1, 1, sizeof(u32
) },
384 [RTA_OIF
] = { 1, 1, sizeof(u32
) },
385 [RTA_GATEWAY
] = { 1, 1, sizeof(ip6_addr
) },
386 [RTA_PRIORITY
] = { 1, 1, sizeof(u32
) },
387 [RTA_PREFSRC
] = { 1, 1, sizeof(ip6_addr
) },
388 [RTA_METRICS
] = { 1, 0, 0 },
389 [RTA_MULTIPATH
] = { 1, 0, 0 },
390 [RTA_FLOW
] = { 1, 1, sizeof(u32
) },
391 [RTA_TABLE
] = { 1, 1, sizeof(u32
) },
392 [RTA_VIA
] = { 1, 0, 0 },
393 [RTA_ENCAP_TYPE
]= { 1, 1, sizeof(u16
) },
394 [RTA_ENCAP
] = { 1, 0, 0 },
397 #ifdef HAVE_MPLS_KERNEL
398 static struct nl_want_attrs rtm_attr_want_mpls
[BIRD_RTA_MAX
] = {
399 [RTA_DST
] = { 1, 1, sizeof(u32
) },
400 [RTA_IIF
] = { 1, 1, sizeof(u32
) },
401 [RTA_OIF
] = { 1, 1, sizeof(u32
) },
402 [RTA_PRIORITY
] = { 1, 1, sizeof(u32
) },
403 [RTA_METRICS
] = { 1, 0, 0 },
404 [RTA_FLOW
] = { 1, 1, sizeof(u32
) },
405 [RTA_TABLE
] = { 1, 1, sizeof(u32
) },
406 [RTA_VIA
] = { 1, 0, 0 },
407 [RTA_NEWDST
] = { 1, 0, 0 },
413 nl_parse_attrs(struct rtattr
*a
, struct nl_want_attrs
*want
, struct rtattr
**k
, int ksize
)
415 int max
= ksize
/ sizeof(struct rtattr
*);
418 for ( ; RTA_OK(a
, nl_attr_len
); a
= RTA_NEXT(a
, nl_attr_len
))
420 if ((a
->rta_type
>= max
) || !want
[a
->rta_type
].defined
)
423 if (want
[a
->rta_type
].checksize
&& (RTA_PAYLOAD(a
) != want
[a
->rta_type
].size
))
425 log(L_ERR
"nl_parse_attrs: Malformed attribute received");
434 log(L_ERR
"nl_parse_attrs: remnant of size %d", nl_attr_len
);
441 static inline u16
rta_get_u16(struct rtattr
*a
)
442 { return *(u16
*) RTA_DATA(a
); }
444 static inline u32
rta_get_u32(struct rtattr
*a
)
445 { return *(u32
*) RTA_DATA(a
); }
447 static inline ip4_addr
rta_get_ip4(struct rtattr
*a
)
448 { return ip4_ntoh(*(ip4_addr
*) RTA_DATA(a
)); }
450 static inline ip6_addr
rta_get_ip6(struct rtattr
*a
)
451 { return ip6_ntoh(*(ip6_addr
*) RTA_DATA(a
)); }
453 static inline ip_addr
rta_get_ipa(struct rtattr
*a
)
455 if (RTA_PAYLOAD(a
) == sizeof(ip4_addr
))
456 return ipa_from_ip4(rta_get_ip4(a
));
458 return ipa_from_ip6(rta_get_ip6(a
));
461 #ifdef HAVE_MPLS_KERNEL
462 static inline ip_addr
rta_get_via(struct rtattr
*a
)
464 struct rtvia
*v
= RTA_DATA(a
);
465 switch(v
->rtvia_family
) {
466 case AF_INET
: return ipa_from_ip4(ip4_ntoh(*(ip4_addr
*) v
->rtvia_addr
));
467 case AF_INET6
: return ipa_from_ip6(ip6_ntoh(*(ip6_addr
*) v
->rtvia_addr
));
472 static u32 rta_mpls_stack
[MPLS_MAX_LABEL_STACK
];
473 static inline int rta_get_mpls(struct rtattr
*a
, u32
*stack
)
478 if (RTA_PAYLOAD(a
) % 4)
479 log(L_WARN
"KRT: Strange length of received MPLS stack: %u", RTA_PAYLOAD(a
));
481 int labels
= mpls_get(RTA_DATA(a
), RTA_PAYLOAD(a
) & ~0x3, stack
);
485 log(L_WARN
"KRT: Too long MPLS stack received, ignoring");
494 nl_add_attr(struct nlmsghdr
*h
, uint bufsize
, uint code
, const void *data
, uint dlen
)
496 uint pos
= NLMSG_ALIGN(h
->nlmsg_len
);
497 uint len
= RTA_LENGTH(dlen
);
499 if (pos
+ len
> bufsize
)
500 bug("nl_add_attr: packet buffer overflow");
502 struct rtattr
*a
= (struct rtattr
*)((char *)h
+ pos
);
505 h
->nlmsg_len
= pos
+ len
;
508 memcpy(RTA_DATA(a
), data
, dlen
);
513 static inline struct rtattr
*
514 nl_open_attr(struct nlmsghdr
*h
, uint bufsize
, uint code
)
516 return nl_add_attr(h
, bufsize
, code
, NULL
, 0);
520 nl_close_attr(struct nlmsghdr
*h
, struct rtattr
*a
)
522 a
->rta_len
= (void *)h
+ NLMSG_ALIGN(h
->nlmsg_len
) - (void *)a
;
526 nl_add_attr_u16(struct nlmsghdr
*h
, uint bufsize
, int code
, u16 data
)
528 nl_add_attr(h
, bufsize
, code
, &data
, 2);
532 nl_add_attr_u32(struct nlmsghdr
*h
, uint bufsize
, int code
, u32 data
)
534 nl_add_attr(h
, bufsize
, code
, &data
, 4);
538 nl_add_attr_ip4(struct nlmsghdr
*h
, uint bufsize
, int code
, ip4_addr ip4
)
541 nl_add_attr(h
, bufsize
, code
, &ip4
, sizeof(ip4
));
545 nl_add_attr_ip6(struct nlmsghdr
*h
, uint bufsize
, int code
, ip6_addr ip6
)
548 nl_add_attr(h
, bufsize
, code
, &ip6
, sizeof(ip6
));
552 nl_add_attr_ipa(struct nlmsghdr
*h
, uint bufsize
, int code
, ip_addr ipa
)
555 nl_add_attr_ip4(h
, bufsize
, code
, ipa_to_ip4(ipa
));
557 nl_add_attr_ip6(h
, bufsize
, code
, ipa_to_ip6(ipa
));
560 #ifdef HAVE_MPLS_KERNEL
562 nl_add_attr_mpls(struct nlmsghdr
*h
, uint bufsize
, int code
, int len
, u32
*stack
)
565 mpls_put(buf
, len
, stack
);
566 nl_add_attr(h
, bufsize
, code
, buf
, len
*4);
570 nl_add_attr_mpls_encap(struct nlmsghdr
*h
, uint bufsize
, int len
, u32
*stack
)
572 nl_add_attr_u16(h
, bufsize
, RTA_ENCAP_TYPE
, LWTUNNEL_ENCAP_MPLS
);
574 struct rtattr
*nest
= nl_open_attr(h
, bufsize
, RTA_ENCAP
);
575 nl_add_attr_mpls(h
, bufsize
, RTA_DST
, len
, stack
);
576 nl_close_attr(h
, nest
);
580 nl_add_attr_via(struct nlmsghdr
*h
, uint bufsize
, ip_addr ipa
)
582 struct rtvia
*via
= alloca(sizeof(struct rtvia
) + 16);
586 via
->rtvia_family
= AF_INET
;
587 put_ip4(via
->rtvia_addr
, ipa_to_ip4(ipa
));
588 nl_add_attr(h
, bufsize
, RTA_VIA
, via
, sizeof(struct rtvia
) + 4);
592 via
->rtvia_family
= AF_INET6
;
593 put_ip6(via
->rtvia_addr
, ipa_to_ip6(ipa
));
594 nl_add_attr(h
, bufsize
, RTA_VIA
, via
, sizeof(struct rtvia
) + 16);
599 static inline struct rtnexthop
*
600 nl_open_nexthop(struct nlmsghdr
*h
, uint bufsize
)
602 uint pos
= NLMSG_ALIGN(h
->nlmsg_len
);
603 uint len
= RTNH_LENGTH(0);
605 if (pos
+ len
> bufsize
)
606 bug("nl_open_nexthop: packet buffer overflow");
608 h
->nlmsg_len
= pos
+ len
;
610 return (void *)h
+ pos
;
614 nl_close_nexthop(struct nlmsghdr
*h
, struct rtnexthop
*nh
)
616 nh
->rtnh_len
= (void *)h
+ NLMSG_ALIGN(h
->nlmsg_len
) - (void *)nh
;
620 nl_add_nexthop(struct nlmsghdr
*h
, uint bufsize
, struct nexthop
*nh
, int af UNUSED
)
622 #ifdef HAVE_MPLS_KERNEL
625 nl_add_attr_mpls(h
, bufsize
, RTA_NEWDST
, nh
->labels
, nh
->label
);
627 nl_add_attr_mpls_encap(h
, bufsize
, nh
->labels
, nh
->label
);
629 if (ipa_nonzero(nh
->gw
))
631 if (af
== (ipa_is_ip4(nh
->gw
) ? AF_INET
: AF_INET6
))
632 nl_add_attr_ipa(h
, bufsize
, RTA_GATEWAY
, nh
->gw
);
634 nl_add_attr_via(h
, bufsize
, nh
->gw
);
638 if (ipa_nonzero(nh
->gw
))
639 nl_add_attr_ipa(h
, bufsize
, RTA_GATEWAY
, nh
->gw
);
644 nl_add_multipath(struct nlmsghdr
*h
, uint bufsize
, struct nexthop
*nh
, int af
)
646 struct rtattr
*a
= nl_open_attr(h
, bufsize
, RTA_MULTIPATH
);
648 for (; nh
; nh
= nh
->next
)
650 struct rtnexthop
*rtnh
= nl_open_nexthop(h
, bufsize
);
652 rtnh
->rtnh_flags
= 0;
653 rtnh
->rtnh_hops
= nh
->weight
;
654 rtnh
->rtnh_ifindex
= nh
->iface
->index
;
656 nl_add_nexthop(h
, bufsize
, nh
, af
);
658 if (nh
->flags
& RNF_ONLINK
)
659 rtnh
->rtnh_flags
|= RTNH_F_ONLINK
;
661 nl_close_nexthop(h
, rtnh
);
667 static struct nexthop
*
668 nl_parse_multipath(struct nl_parse_state
*s
, struct krt_proto
*p
, struct rtattr
*ra
, int af
)
670 struct rtattr
*a
[BIRD_RTA_MAX
];
671 struct rtnexthop
*nh
= RTA_DATA(ra
);
672 struct nexthop
*rv
, *first
, **last
;
673 unsigned len
= RTA_PAYLOAD(ra
);
680 /* Use RTNH_OK(nh,len) ?? */
681 if ((len
< sizeof(*nh
)) || (len
< nh
->rtnh_len
))
684 *last
= rv
= lp_allocz(s
->pool
, NEXTHOP_MAX_SIZE
);
687 rv
->weight
= nh
->rtnh_hops
;
688 rv
->iface
= if_find_by_index(nh
->rtnh_ifindex
);
692 /* Nonexistent RTNH_PAYLOAD ?? */
693 nl_attr_len
= nh
->rtnh_len
- RTNH_LENGTH(0);
697 if (!nl_parse_attrs(RTNH_DATA(nh
), nexthop_attr_want4
, a
, sizeof(a
)))
702 if (!nl_parse_attrs(RTNH_DATA(nh
), nexthop_attr_want6
, a
, sizeof(a
)))
711 rv
->gw
= rta_get_ipa(a
[RTA_GATEWAY
]);
713 #ifdef HAVE_MPLS_KERNEL
715 rv
->gw
= rta_get_via(a
[RTA_VIA
]);
718 if (ipa_nonzero(rv
->gw
))
720 if (nh
->rtnh_flags
& RTNH_F_ONLINK
)
721 rv
->flags
|= RNF_ONLINK
;
724 nbr
= neigh_find(&p
->p
, rv
->gw
, rv
->iface
,
725 (rv
->flags
& RNF_ONLINK
) ? NEF_ONLINK
: 0);
726 if (!nbr
|| (nbr
->scope
== SCOPE_HOST
))
730 #ifdef HAVE_MPLS_KERNEL
731 if (a
[RTA_ENCAP
] && a
[RTA_ENCAP_TYPE
])
733 if (rta_get_u16(a
[RTA_ENCAP_TYPE
]) != LWTUNNEL_ENCAP_MPLS
) {
734 log(L_WARN
"KRT: Unknown encapsulation method %d in multipath", rta_get_u16(a
[RTA_ENCAP_TYPE
]));
738 struct rtattr
*enca
[BIRD_RTA_MAX
];
739 nl_attr_len
= RTA_PAYLOAD(a
[RTA_ENCAP
]);
740 nl_parse_attrs(RTA_DATA(a
[RTA_ENCAP
]), encap_mpls_want
, enca
, sizeof(enca
));
741 rv
->labels
= rta_get_mpls(enca
[RTA_DST
], rv
->label
);
746 len
-= NLMSG_ALIGN(nh
->rtnh_len
);
750 /* Ensure nexthops are sorted to satisfy nest invariant */
751 if (!nexthop_is_sorted(first
))
752 first
= nexthop_sort(first
);
758 nl_add_metrics(struct nlmsghdr
*h
, uint bufsize
, u32
*metrics
, int max
)
760 struct rtattr
*a
= nl_open_attr(h
, bufsize
, RTA_METRICS
);
763 for (t
= 1; t
< max
; t
++)
764 if (metrics
[0] & (1 << t
))
765 nl_add_attr_u32(h
, bufsize
, t
, metrics
[t
]);
771 nl_parse_metrics(struct rtattr
*hdr
, u32
*metrics
, int max
)
773 struct rtattr
*a
= RTA_DATA(hdr
);
774 int len
= RTA_PAYLOAD(hdr
);
777 for (; RTA_OK(a
, len
); a
= RTA_NEXT(a
, len
))
779 if (a
->rta_type
== RTA_UNSPEC
)
782 if (a
->rta_type
>= max
)
785 if (RTA_PAYLOAD(a
) != 4)
788 metrics
[0] |= 1 << a
->rta_type
;
789 metrics
[a
->rta_type
] = rta_get_u32(a
);
800 * Scanning of interfaces
804 nl_parse_link(struct nlmsghdr
*h
, int scan
)
807 struct rtattr
*a
[BIRD_IFLA_MAX
];
808 int new = h
->nlmsg_type
== RTM_NEWLINK
;
815 if (!(i
= nl_checkin(h
, sizeof(*i
))) || !nl_parse_attrs(IFLA_RTA(i
), ifla_attr_want
, a
, sizeof(a
)))
817 if (!a
[IFLA_IFNAME
] || (RTA_PAYLOAD(a
[IFLA_IFNAME
]) < 2) || !a
[IFLA_MTU
])
820 * IFLA_IFNAME and IFLA_MTU are required, in fact, but there may also come
821 * a message with IFLA_WIRELESS set, where (e.g.) no IFLA_IFNAME exists.
822 * We simply ignore all such messages with IFLA_WIRELESS without notice.
825 if (a
[IFLA_WIRELESS
])
828 log(L_ERR
"KIF: Malformed message received");
832 name
= RTA_DATA(a
[IFLA_IFNAME
]);
833 mtu
= rta_get_u32(a
[IFLA_MTU
]);
836 master
= rta_get_u32(a
[IFLA_MASTER
]);
838 ifi
= if_find_by_index(i
->ifi_index
);
841 DBG("KIF: IF%d(%s) goes down\n", i
->ifi_index
, name
);
849 DBG("KIF: IF%d(%s) goes up (mtu=%d,flg=%x)\n", i
->ifi_index
, name
, mtu
, i
->ifi_flags
);
850 if (ifi
&& strncmp(ifi
->name
, name
, sizeof(ifi
->name
)-1))
853 strncpy(f
.name
, name
, sizeof(f
.name
)-1);
854 f
.index
= i
->ifi_index
;
857 f
.master_index
= master
;
858 f
.master
= if_find_by_index(master
);
862 f
.flags
|= IF_ADMIN_UP
;
863 if (fl
& IFF_LOWER_UP
)
864 f
.flags
|= IF_LINK_UP
;
865 if (fl
& IFF_LOOPBACK
) /* Loopback */
866 f
.flags
|= IF_MULTIACCESS
| IF_LOOPBACK
| IF_IGNORE
;
867 else if (fl
& IFF_POINTOPOINT
) /* PtP */
868 f
.flags
|= IF_MULTICAST
;
869 else if (fl
& IFF_BROADCAST
) /* Broadcast */
870 f
.flags
|= IF_MULTIACCESS
| IF_BROADCAST
| IF_MULTICAST
;
872 f
.flags
|= IF_MULTIACCESS
; /* NBMA */
874 if (fl
& IFF_MULTICAST
)
875 f
.flags
|= IF_MULTICAST
;
880 if_end_partial_update(ifi
);
885 nl_parse_addr4(struct ifaddrmsg
*i
, int scan
, int new)
887 struct rtattr
*a
[BIRD_IFA_MAX
];
892 if (!nl_parse_attrs(IFA_RTA(i
), ifa_attr_want4
, a
, sizeof(a
)))
897 log(L_ERR
"KIF: Malformed message received (missing IFA_LOCAL)");
902 log(L_ERR
"KIF: Malformed message received (missing IFA_ADDRESS)");
906 ifi
= if_find_by_index(i
->ifa_index
);
909 log(L_ERR
"KIF: Received address message for unknown interface %d", i
->ifa_index
);
914 ifa_flags
= rta_get_u32(a
[IFA_FLAGS
]);
916 ifa_flags
= i
->ifa_flags
;
919 bzero(&ifa
, sizeof(ifa
));
921 if (ifa_flags
& IFA_F_SECONDARY
)
922 ifa
.flags
|= IA_SECONDARY
;
924 ifa
.ip
= rta_get_ipa(a
[IFA_LOCAL
]);
926 if (i
->ifa_prefixlen
> IP4_MAX_PREFIX_LENGTH
)
928 log(L_ERR
"KIF: Invalid prefix length for interface %s: %d", ifi
->name
, i
->ifa_prefixlen
);
931 if (i
->ifa_prefixlen
== IP4_MAX_PREFIX_LENGTH
)
933 ifa
.brd
= rta_get_ipa(a
[IFA_ADDRESS
]);
934 net_fill_ip4(&ifa
.prefix
, rta_get_ip4(a
[IFA_ADDRESS
]), i
->ifa_prefixlen
);
936 /* It is either a host address or a peer address */
937 if (ipa_equal(ifa
.ip
, ifa
.brd
))
938 ifa
.flags
|= IA_HOST
;
941 ifa
.flags
|= IA_PEER
;
942 ifa
.opposite
= ifa
.brd
;
947 net_fill_ip4(&ifa
.prefix
, ipa_to_ip4(ifa
.ip
), i
->ifa_prefixlen
);
948 net_normalize(&ifa
.prefix
);
950 if (i
->ifa_prefixlen
== IP4_MAX_PREFIX_LENGTH
- 1)
951 ifa
.opposite
= ipa_opposite_m1(ifa
.ip
);
953 if (i
->ifa_prefixlen
== IP4_MAX_PREFIX_LENGTH
- 2)
954 ifa
.opposite
= ipa_opposite_m2(ifa
.ip
);
956 if ((ifi
->flags
& IF_BROADCAST
) && a
[IFA_BROADCAST
])
958 ip4_addr xbrd
= rta_get_ip4(a
[IFA_BROADCAST
]);
959 ip4_addr ybrd
= ip4_or(ipa_to_ip4(ifa
.ip
), ip4_not(ip4_mkmask(i
->ifa_prefixlen
)));
961 if (ip4_equal(xbrd
, net4_prefix(&ifa
.prefix
)) || ip4_equal(xbrd
, ybrd
))
962 ifa
.brd
= ipa_from_ip4(xbrd
);
963 else if (ifi
->flags
& IF_TMP_DOWN
) /* Complain only during the first scan */
965 log(L_ERR
"KIF: Invalid broadcast address %I4 for %s", xbrd
, ifi
->name
);
966 ifa
.brd
= ipa_from_ip4(ybrd
);
971 scope
= ipa_classify(ifa
.ip
);
974 log(L_ERR
"KIF: Invalid interface address %I for %s", ifa
.ip
, ifi
->name
);
977 ifa
.scope
= scope
& IADDR_SCOPE_MASK
;
979 DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n",
980 ifi
->index
, ifi
->name
,
981 new ? "added" : "removed",
982 ifa
.ip
, ifa
.flags
, &ifa
.prefix
, ifa
.brd
, ifa
.opposite
);
990 if_end_partial_update(ifi
);
994 nl_parse_addr6(struct ifaddrmsg
*i
, int scan
, int new)
996 struct rtattr
*a
[BIRD_IFA_MAX
];
1001 if (!nl_parse_attrs(IFA_RTA(i
), ifa_attr_want6
, a
, sizeof(a
)))
1004 if (!a
[IFA_ADDRESS
])
1006 log(L_ERR
"KIF: Malformed message received (missing IFA_ADDRESS)");
1010 ifi
= if_find_by_index(i
->ifa_index
);
1013 log(L_ERR
"KIF: Received address message for unknown interface %d", i
->ifa_index
);
1018 ifa_flags
= rta_get_u32(a
[IFA_FLAGS
]);
1020 ifa_flags
= i
->ifa_flags
;
1023 bzero(&ifa
, sizeof(ifa
));
1025 if (ifa_flags
& IFA_F_SECONDARY
)
1026 ifa
.flags
|= IA_SECONDARY
;
1028 /* Ignore tentative addresses silently */
1029 if (ifa_flags
& IFA_F_TENTATIVE
)
1032 /* IFA_LOCAL can be unset for IPv6 interfaces */
1033 ifa
.ip
= rta_get_ipa(a
[IFA_LOCAL
] ? : a
[IFA_ADDRESS
]);
1035 if (i
->ifa_prefixlen
> IP6_MAX_PREFIX_LENGTH
)
1037 log(L_ERR
"KIF: Invalid prefix length for interface %s: %d", ifi
->name
, i
->ifa_prefixlen
);
1040 if (i
->ifa_prefixlen
== IP6_MAX_PREFIX_LENGTH
)
1042 ifa
.brd
= rta_get_ipa(a
[IFA_ADDRESS
]);
1043 net_fill_ip6(&ifa
.prefix
, rta_get_ip6(a
[IFA_ADDRESS
]), i
->ifa_prefixlen
);
1045 /* It is either a host address or a peer address */
1046 if (ipa_equal(ifa
.ip
, ifa
.brd
))
1047 ifa
.flags
|= IA_HOST
;
1050 ifa
.flags
|= IA_PEER
;
1051 ifa
.opposite
= ifa
.brd
;
1056 net_fill_ip6(&ifa
.prefix
, ipa_to_ip6(ifa
.ip
), i
->ifa_prefixlen
);
1057 net_normalize(&ifa
.prefix
);
1059 if (i
->ifa_prefixlen
== IP6_MAX_PREFIX_LENGTH
- 1)
1060 ifa
.opposite
= ipa_opposite_m1(ifa
.ip
);
1063 scope
= ipa_classify(ifa
.ip
);
1066 log(L_ERR
"KIF: Invalid interface address %I for %s", ifa
.ip
, ifi
->name
);
1069 ifa
.scope
= scope
& IADDR_SCOPE_MASK
;
1071 DBG("KIF: IF%d(%s): %s IPA %I, flg %x, net %N, brd %I, opp %I\n",
1072 ifi
->index
, ifi
->name
,
1073 new ? "added" : "removed",
1074 ifa
.ip
, ifa
.flags
, &ifa
.prefix
, ifa
.brd
, ifa
.opposite
);
1082 if_end_partial_update(ifi
);
1086 nl_parse_addr(struct nlmsghdr
*h
, int scan
)
1088 struct ifaddrmsg
*i
;
1090 if (!(i
= nl_checkin(h
, sizeof(*i
))))
1093 int new = (h
->nlmsg_type
== RTM_NEWADDR
);
1095 switch (i
->ifa_family
)
1098 return nl_parse_addr4(i
, scan
, new);
1101 return nl_parse_addr6(i
, scan
, new);
1106 kif_do_scan(struct kif_proto
*p UNUSED
)
1112 nl_request_dump(AF_UNSPEC
, RTM_GETLINK
);
1113 while (h
= nl_get_scan())
1114 if (h
->nlmsg_type
== RTM_NEWLINK
|| h
->nlmsg_type
== RTM_DELLINK
)
1115 nl_parse_link(h
, 1);
1117 log(L_DEBUG
"nl_scan_ifaces: Unknown packet received (type=%d)", h
->nlmsg_type
);
1119 /* Re-resolve master interface for slaves */
1121 WALK_LIST(i
, iface_list
)
1122 if (i
->master_index
)
1128 .master_index
= i
->master_index
,
1129 .master
= if_find_by_index(i
->master_index
)
1132 if (f
.master
!= i
->master
)
1134 memcpy(f
.name
, i
->name
, sizeof(f
.name
));
1139 nl_request_dump(AF_INET
, RTM_GETADDR
);
1140 while (h
= nl_get_scan())
1141 if (h
->nlmsg_type
== RTM_NEWADDR
|| h
->nlmsg_type
== RTM_DELADDR
)
1142 nl_parse_addr(h
, 1);
1144 log(L_DEBUG
"nl_scan_ifaces: Unknown packet received (type=%d)", h
->nlmsg_type
);
1146 nl_request_dump(AF_INET6
, RTM_GETADDR
);
1147 while (h
= nl_get_scan())
1148 if (h
->nlmsg_type
== RTM_NEWADDR
|| h
->nlmsg_type
== RTM_DELADDR
)
1149 nl_parse_addr(h
, 1);
1151 log(L_DEBUG
"nl_scan_ifaces: Unknown packet received (type=%d)", h
->nlmsg_type
);
1161 krt_table_id(struct krt_proto
*p
)
1163 return KRT_CF
->sys
.table_id
;
1166 static HASH(struct krt_proto
) nl_table_map
;
1168 #define RTH_KEY(p) p->af, krt_table_id(p)
1169 #define RTH_NEXT(p) p->sys.hash_next
1170 #define RTH_EQ(a1,i1,a2,i2) a1 == a2 && i1 == i2
1171 #define RTH_FN(a,i) a ^ u32_hash(i)
1173 #define RTH_REHASH rth_rehash
1174 #define RTH_PARAMS /8, *2, 2, 2, 6, 20
1176 HASH_DEFINE_REHASH_FN(RTH
, struct krt_proto
)
1187 case RTD_UNREACHABLE
:
1197 nh_bufsize(struct nexthop
*nh
)
1200 for (; nh
!= NULL
; nh
= nh
->next
)
1201 rv
+= RTNH_LENGTH(RTA_LENGTH(sizeof(ip_addr
)));
1206 nl_send_route(struct krt_proto
*p
, rte
*e
, int op
, int dest
, struct nexthop
*nh
)
1211 ea_list
*eattrs
= a
->eattrs
;
1212 int bufsize
= 128 + KRT_METRICS_MAX
*8 + nh_bufsize(&(a
->nh
));
1221 int rsize
= sizeof(*r
) + bufsize
;
1224 DBG("nl_send_route(%N,op=%x)\n", net
->n
.addr
, op
);
1226 bzero(&r
->h
, sizeof(r
->h
));
1227 bzero(&r
->r
, sizeof(r
->r
));
1228 r
->h
.nlmsg_type
= op
? RTM_NEWROUTE
: RTM_DELROUTE
;
1229 r
->h
.nlmsg_len
= NLMSG_LENGTH(sizeof(struct rtmsg
));
1230 r
->h
.nlmsg_flags
= op
| NLM_F_REQUEST
| NLM_F_ACK
;
1232 r
->r
.rtm_family
= p
->af
;
1233 r
->r
.rtm_dst_len
= net_pxlen(net
->n
.addr
);
1234 r
->r
.rtm_protocol
= RTPROT_BIRD
;
1235 r
->r
.rtm_scope
= RT_SCOPE_NOWHERE
;
1236 #ifdef HAVE_MPLS_KERNEL
1237 if (p
->af
== AF_MPLS
)
1240 * Kernel MPLS code is a bit picky. We must:
1241 * 1) Always set RT_SCOPE_UNIVERSE and RTN_UNICAST (even for RTM_DELROUTE)
1242 * 2) Never use RTA_PRIORITY
1245 u32 label
= net_mpls(net
->n
.addr
);
1246 nl_add_attr_mpls(&r
->h
, rsize
, RTA_DST
, 1, &label
);
1247 r
->r
.rtm_scope
= RT_SCOPE_UNIVERSE
;
1248 r
->r
.rtm_type
= RTN_UNICAST
;
1253 nl_add_attr_ipa(&r
->h
, rsize
, RTA_DST
, net_prefix(net
->n
.addr
));
1255 /* Add source address for IPv6 SADR routes */
1256 if (net
->n
.addr
->type
== NET_IP6_SADR
)
1258 net_addr_ip6_sadr
*a
= (void *) &net
->n
.addr
;
1259 nl_add_attr_ip6(&r
->h
, rsize
, RTA_SRC
, a
->src_prefix
);
1260 r
->r
.rtm_src_len
= a
->src_pxlen
;
1265 * Strange behavior for RTM_DELROUTE:
1266 * 1) rtm_family is ignored in IPv6, works for IPv4
1267 * 2) not setting RTA_PRIORITY is different from setting default value (on IPv6)
1268 * 3) not setting RTA_PRIORITY is equivalent to setting 0, which is wildcard
1271 if (krt_table_id(p
) < 256)
1272 r
->r
.rtm_table
= krt_table_id(p
);
1274 nl_add_attr_u32(&r
->h
, rsize
, RTA_TABLE
, krt_table_id(p
));
1276 if (p
->af
== AF_MPLS
)
1278 else if (a
->source
== RTS_DUMMY
)
1279 priority
= e
->u
.krt
.metric
;
1280 else if (KRT_CF
->sys
.metric
)
1281 priority
= KRT_CF
->sys
.metric
;
1282 else if ((op
!= NL_OP_DELETE
) && (ea
= ea_find(eattrs
, EA_KRT_METRIC
)))
1283 priority
= ea
->u
.data
;
1286 nl_add_attr_u32(&r
->h
, rsize
, RTA_PRIORITY
, priority
);
1288 /* For route delete, we do not specify remaining route attributes */
1289 if (op
== NL_OP_DELETE
)
1292 /* Default scope is LINK for device routes, UNIVERSE otherwise */
1293 if (p
->af
== AF_MPLS
)
1294 r
->r
.rtm_scope
= RT_SCOPE_UNIVERSE
;
1295 else if (ea
= ea_find(eattrs
, EA_KRT_SCOPE
))
1296 r
->r
.rtm_scope
= ea
->u
.data
;
1298 r
->r
.rtm_scope
= (dest
== RTD_UNICAST
&& ipa_zero(nh
->gw
)) ? RT_SCOPE_LINK
: RT_SCOPE_UNIVERSE
;
1300 if (ea
= ea_find(eattrs
, EA_KRT_PREFSRC
))
1301 nl_add_attr_ipa(&r
->h
, rsize
, RTA_PREFSRC
, *(ip_addr
*)ea
->u
.ptr
->data
);
1303 if (ea
= ea_find(eattrs
, EA_KRT_REALM
))
1304 nl_add_attr_u32(&r
->h
, rsize
, RTA_FLOW
, ea
->u
.data
);
1307 u32 metrics
[KRT_METRICS_MAX
];
1310 struct ea_walk_state ews
= { .eattrs
= eattrs
};
1311 while (ea
= ea_walk(&ews
, EA_KRT_METRICS
, KRT_METRICS_MAX
))
1313 int id
= ea
->id
- EA_KRT_METRICS
;
1314 metrics
[0] |= 1 << id
;
1315 metrics
[id
] = ea
->u
.data
;
1319 nl_add_metrics(&r
->h
, rsize
, metrics
, KRT_METRICS_MAX
);
1326 r
->r
.rtm_type
= RTN_UNICAST
;
1327 if (nh
->next
&& !krt_ecmp6(p
))
1328 nl_add_multipath(&r
->h
, rsize
, nh
, p
->af
);
1331 nl_add_attr_u32(&r
->h
, rsize
, RTA_OIF
, nh
->iface
->index
);
1332 nl_add_nexthop(&r
->h
, rsize
, nh
, p
->af
);
1334 if (nh
->flags
& RNF_ONLINK
)
1335 r
->r
.rtm_flags
|= RTNH_F_ONLINK
;
1339 r
->r
.rtm_type
= RTN_BLACKHOLE
;
1341 case RTD_UNREACHABLE
:
1342 r
->r
.rtm_type
= RTN_UNREACHABLE
;
1345 r
->r
.rtm_type
= RTN_PROHIBIT
;
1350 bug("krt_capable inconsistent with nl_send_route");
1353 /* Ignore missing for DELETE */
1354 return nl_exchange(&r
->h
, (op
== NL_OP_DELETE
));
1358 nl_add_rte(struct krt_proto
*p
, rte
*e
)
1363 if (krt_ecmp6(p
) && a
->nh
.next
)
1365 struct nexthop
*nh
= &(a
->nh
);
1367 err
= nl_send_route(p
, e
, NL_OP_ADD
, RTD_UNICAST
, nh
);
1371 for (nh
= nh
->next
; nh
; nh
= nh
->next
)
1372 err
+= nl_send_route(p
, e
, NL_OP_APPEND
, RTD_UNICAST
, nh
);
1377 return nl_send_route(p
, e
, NL_OP_ADD
, a
->dest
, &(a
->nh
));
1381 nl_delete_rte(struct krt_proto
*p
, rte
*e
)
1385 /* For IPv6, we just repeatedly request DELETE until we get error */
1387 err
= nl_send_route(p
, e
, NL_OP_DELETE
, RTD_NONE
, NULL
);
1388 while (krt_ecmp6(p
) && !err
);
1394 nl_replace_rte(struct krt_proto
*p
, rte
*e
)
1397 return nl_send_route(p
, e
, NL_OP_REPLACE
, a
->dest
, &(a
->nh
));
1402 krt_replace_rte(struct krt_proto
*p
, net
*n
, rte
*new, rte
*old
)
1407 * We use NL_OP_REPLACE for IPv4, it has an issue with not checking for
1408 * matching rtm_protocol, but that is OK when dedicated priority is used.
1410 * We do not use NL_OP_REPLACE for IPv6, as it has broken semantics for ECMP
1411 * and with some kernel versions ECMP replace crashes kernel. Would need more
1412 * testing and checks for kernel versions.
1414 * For IPv6, we use NL_OP_DELETE and then NL_OP_ADD. We also do not trust the
1415 * old route value, so we do not try to optimize IPv6 ECMP reconfigurations.
1418 if (krt_ipv4(p
) && old
&& new)
1420 err
= nl_replace_rte(p
, new);
1425 nl_delete_rte(p
, old
);
1428 err
= nl_add_rte(p
, new);
1432 n
->n
.flags
|= KRF_SYNC_ERROR
;
1434 n
->n
.flags
&= ~KRF_SYNC_ERROR
;
1438 nl_mergable_route(struct nl_parse_state
*s
, net
*net
, struct krt_proto
*p
, uint priority
, uint krt_type
, uint rtm_family
)
1440 /* Route merging is used for IPv6 scans */
1441 if (!s
->scan
|| (rtm_family
!= AF_INET6
))
1444 /* Saved and new route must have same network, proto/table, and priority */
1445 if ((s
->net
!= net
) || (s
->proto
!= p
) || (s
->krt_metric
!= priority
))
1448 /* Both must be regular unicast routes */
1449 if ((s
->krt_type
!= RTN_UNICAST
) || (krt_type
!= RTN_UNICAST
))
1456 nl_announce_route(struct nl_parse_state
*s
)
1458 rte
*e
= rte_get_temp(s
->attrs
);
1460 e
->u
.krt
.src
= s
->krt_src
;
1461 e
->u
.krt
.proto
= s
->krt_proto
;
1464 e
->u
.krt
.metric
= s
->krt_metric
;
1467 krt_got_route(s
->proto
, e
);
1469 krt_got_route_async(s
->proto
, e
, s
->new);
1478 nl_parse_begin(struct nl_parse_state
*s
, int scan
)
1480 memset(s
, 0, sizeof (struct nl_parse_state
));
1481 s
->pool
= nl_linpool
;
1486 nl_parse_end(struct nl_parse_state
*s
)
1489 nl_announce_route(s
);
1493 #define SKIP(ARG...) do { DBG("KRT: Ignoring route - " ARG); return; } while(0)
1496 nl_parse_route(struct nl_parse_state
*s
, struct nlmsghdr
*h
)
1498 struct krt_proto
*p
;
1500 struct rtattr
*a
[BIRD_RTA_MAX
];
1501 int new = h
->nlmsg_type
== RTM_NEWROUTE
;
1503 net_addr dst
, src
= {};
1507 u32 def_scope
= RT_SCOPE_UNIVERSE
;
1510 if (!(i
= nl_checkin(h
, sizeof(*i
))))
1513 switch (i
->rtm_family
)
1516 if (!nl_parse_attrs(RTM_RTA(i
), rtm_attr_want4
, a
, sizeof(a
)))
1520 net_fill_ip4(&dst
, rta_get_ip4(a
[RTA_DST
]), i
->rtm_dst_len
);
1522 net_fill_ip4(&dst
, IP4_NONE
, 0);
1526 if (!nl_parse_attrs(RTM_RTA(i
), rtm_attr_want6
, a
, sizeof(a
)))
1530 net_fill_ip6(&dst
, rta_get_ip6(a
[RTA_DST
]), i
->rtm_dst_len
);
1532 net_fill_ip6(&dst
, IP6_NONE
, 0);
1535 net_fill_ip6(&src
, rta_get_ip6(a
[RTA_SRC
]), i
->rtm_src_len
);
1537 net_fill_ip6(&src
, IP6_NONE
, 0);
1540 #ifdef HAVE_MPLS_KERNEL
1542 if (!nl_parse_attrs(RTM_RTA(i
), rtm_attr_want_mpls
, a
, sizeof(a
)))
1546 SKIP("MPLS route without RTA_DST");
1548 if (rta_get_mpls(a
[RTA_DST
], rta_mpls_stack
) != 1)
1549 SKIP("MPLS route with multi-label RTA_DST");
1551 net_fill_mpls(&dst
, rta_mpls_stack
[0]);
1560 oif
= rta_get_u32(a
[RTA_OIF
]);
1563 table_id
= rta_get_u32(a
[RTA_TABLE
]);
1565 table_id
= i
->rtm_table
;
1567 /* Do we know this table? */
1568 p
= HASH_FIND(nl_table_map
, RTH
, i
->rtm_family
, table_id
);
1570 SKIP("unknown table %u\n", table_id
);
1572 if (a
[RTA_SRC
] && (p
->p
.net_type
!= NET_IP6_SADR
))
1573 SKIP("src prefix for non-SADR channel\n");
1578 if (i
->rtm_tos
!= 0) /* We don't support TOS */
1579 SKIP("TOS %02x\n", i
->rtm_tos
);
1581 if (s
->scan
&& !new)
1582 SKIP("RTM_DELROUTE in scan\n");
1584 if (a
[RTA_PRIORITY
])
1585 priority
= rta_get_u32(a
[RTA_PRIORITY
]);
1587 int c
= net_classify(&dst
);
1588 if ((c
< 0) || !(c
& IADDR_HOST
) || ((c
& IADDR_SCOPE_MASK
) <= SCOPE_LINK
))
1589 SKIP("strange class/scope\n");
1591 switch (i
->rtm_protocol
)
1594 SKIP("proto unspec\n");
1596 case RTPROT_REDIRECT
:
1597 krt_src
= KRT_SRC_REDIRECT
;
1601 krt_src
= KRT_SRC_KERNEL
;
1607 krt_src
= KRT_SRC_BIRD
;
1612 krt_src
= KRT_SRC_ALIEN
;
1616 if (p
->p
.net_type
== NET_IP6_SADR
)
1618 n
= alloca(sizeof(net_addr_ip6_sadr
));
1619 net_fill_ip6_sadr(n
, net6_prefix(&dst
), net6_pxlen(&dst
),
1620 net6_prefix(&src
), net6_pxlen(&src
));
1623 net
*net
= net_get(p
->p
.main_channel
->table
, n
);
1625 if (s
->net
&& !nl_mergable_route(s
, net
, p
, priority
, i
->rtm_type
, i
->rtm_family
))
1626 nl_announce_route(s
);
1628 rta
*ra
= lp_allocz(s
->pool
, RTA_MAX_SIZE
);
1629 ra
->src
= p
->p
.main_source
;
1630 ra
->source
= RTS_INHERIT
;
1631 ra
->scope
= SCOPE_UNIVERSE
;
1633 switch (i
->rtm_type
)
1636 ra
->dest
= RTD_UNICAST
;
1638 if (a
[RTA_MULTIPATH
])
1640 struct nexthop
*nh
= nl_parse_multipath(s
, p
, a
[RTA_MULTIPATH
], i
->rtm_family
);
1643 log(L_ERR
"KRT: Received strange multipath route %N", net
->n
.addr
);
1647 nexthop_link(ra
, nh
);
1651 ra
->nh
.iface
= if_find_by_index(oif
);
1654 log(L_ERR
"KRT: Received route %N with unknown ifindex %u", net
->n
.addr
, oif
);
1659 ra
->nh
.gw
= rta_get_ipa(a
[RTA_GATEWAY
]);
1661 #ifdef HAVE_MPLS_KERNEL
1663 ra
->nh
.gw
= rta_get_via(a
[RTA_VIA
]);
1666 if (ipa_nonzero(ra
->nh
.gw
))
1668 /* Silently skip strange 6to4 routes */
1669 const net_addr_ip6 sit
= NET_ADDR_IP6(IP6_NONE
, 96);
1670 if ((i
->rtm_family
== AF_INET6
) && ipa_in_netX(ra
->nh
.gw
, (net_addr
*) &sit
))
1673 if (i
->rtm_flags
& RTNH_F_ONLINK
)
1674 ra
->nh
.flags
|= RNF_ONLINK
;
1677 nbr
= neigh_find(&p
->p
, ra
->nh
.gw
, ra
->nh
.iface
,
1678 (ra
->nh
.flags
& RNF_ONLINK
) ? NEF_ONLINK
: 0);
1679 if (!nbr
|| (nbr
->scope
== SCOPE_HOST
))
1681 log(L_ERR
"KRT: Received route %N with strange next-hop %I", net
->n
.addr
,
1689 ra
->dest
= RTD_BLACKHOLE
;
1691 case RTN_UNREACHABLE
:
1692 ra
->dest
= RTD_UNREACHABLE
;
1695 ra
->dest
= RTD_PROHIBIT
;
1697 /* FIXME: What about RTN_THROW? */
1699 SKIP("type %d\n", i
->rtm_type
);
1703 #ifdef HAVE_MPLS_KERNEL
1704 if ((i
->rtm_family
== AF_MPLS
) && a
[RTA_NEWDST
] && !ra
->nh
.next
)
1705 ra
->nh
.labels
= rta_get_mpls(a
[RTA_NEWDST
], ra
->nh
.label
);
1707 if (a
[RTA_ENCAP
] && a
[RTA_ENCAP_TYPE
] && !ra
->nh
.next
)
1709 switch (rta_get_u16(a
[RTA_ENCAP_TYPE
]))
1711 case LWTUNNEL_ENCAP_MPLS
:
1713 struct rtattr
*enca
[BIRD_RTA_MAX
];
1714 nl_attr_len
= RTA_PAYLOAD(a
[RTA_ENCAP
]);
1715 nl_parse_attrs(RTA_DATA(a
[RTA_ENCAP
]), encap_mpls_want
, enca
, sizeof(enca
));
1716 ra
->nh
.labels
= rta_get_mpls(enca
[RTA_DST
], ra
->nh
.label
);
1720 SKIP("unknown encapsulation method %d\n", rta_get_u16(a
[RTA_ENCAP_TYPE
]));
1726 if (i
->rtm_scope
!= def_scope
)
1728 ea_list
*ea
= lp_alloc(s
->pool
, sizeof(ea_list
) + sizeof(eattr
));
1729 ea
->next
= ra
->eattrs
;
1731 ea
->flags
= EALF_SORTED
;
1733 ea
->attrs
[0].id
= EA_KRT_SCOPE
;
1734 ea
->attrs
[0].flags
= 0;
1735 ea
->attrs
[0].type
= EAF_TYPE_INT
;
1736 ea
->attrs
[0].u
.data
= i
->rtm_scope
;
1741 ip_addr ps
= rta_get_ipa(a
[RTA_PREFSRC
]);
1743 ea_list
*ea
= lp_alloc(s
->pool
, sizeof(ea_list
) + sizeof(eattr
));
1744 ea
->next
= ra
->eattrs
;
1746 ea
->flags
= EALF_SORTED
;
1748 ea
->attrs
[0].id
= EA_KRT_PREFSRC
;
1749 ea
->attrs
[0].flags
= 0;
1750 ea
->attrs
[0].type
= EAF_TYPE_IP_ADDRESS
;
1752 struct adata
*ad
= lp_alloc(s
->pool
, sizeof(struct adata
) + sizeof(ps
));
1753 ad
->length
= sizeof(ps
);
1754 memcpy(ad
->data
, &ps
, sizeof(ps
));
1756 ea
->attrs
[0].u
.ptr
= ad
;
1761 ea_list
*ea
= lp_alloc(s
->pool
, sizeof(ea_list
) + sizeof(eattr
));
1762 ea
->next
= ra
->eattrs
;
1764 ea
->flags
= EALF_SORTED
;
1766 ea
->attrs
[0].id
= EA_KRT_REALM
;
1767 ea
->attrs
[0].flags
= 0;
1768 ea
->attrs
[0].type
= EAF_TYPE_INT
;
1769 ea
->attrs
[0].u
.data
= rta_get_u32(a
[RTA_FLOW
]);
1774 u32 metrics
[KRT_METRICS_MAX
];
1775 ea_list
*ea
= lp_alloc(s
->pool
, sizeof(ea_list
) + KRT_METRICS_MAX
* sizeof(eattr
));
1778 if (nl_parse_metrics(a
[RTA_METRICS
], metrics
, ARRAY_SIZE(metrics
)) < 0)
1780 log(L_ERR
"KRT: Received route %N with strange RTA_METRICS attribute", net
->n
.addr
);
1784 for (t
= 1; t
< KRT_METRICS_MAX
; t
++)
1785 if (metrics
[0] & (1 << t
))
1787 ea
->attrs
[n
].id
= EA_CODE(PROTOCOL_KERNEL
, KRT_METRICS_OFFSET
+ t
);
1788 ea
->attrs
[n
].flags
= 0;
1789 ea
->attrs
[n
].type
= EAF_TYPE_INT
; /* FIXME: Some are EAF_TYPE_BITFIELD */
1790 ea
->attrs
[n
].u
.data
= metrics
[t
];
1796 ea
->next
= ra
->eattrs
;
1797 ea
->flags
= EALF_SORTED
;
1804 * Ideally, now we would send the received route to the rest of kernel code.
1805 * But IPv6 ECMP routes before 4.11 are sent as a sequence of routes, so we
1806 * postpone it and merge next hops until the end of the sequence. Note that
1807 * when doing merging of next hops, we expect the new route to be unipath.
1808 * Otherwise, we ignore additional next hops in nexthop_insert().
1813 /* Store the new route */
1818 s
->krt_src
= krt_src
;
1819 s
->krt_type
= i
->rtm_type
;
1820 s
->krt_proto
= i
->rtm_protocol
;
1821 s
->krt_metric
= priority
;
1825 /* Merge next hops with the stored route */
1828 struct nexthop
*nhs
= &oa
->nh
;
1829 nexthop_insert(&nhs
, &ra
->nh
);
1831 /* Perhaps new nexthop is inserted at the first position */
1837 /* Keep old eattrs */
1838 ra
->eattrs
= oa
->eattrs
;
1844 krt_do_scan(struct krt_proto
*p UNUSED
) /* CONFIG_ALL_TABLES_AT_ONCE => p is NULL */
1847 struct nl_parse_state s
;
1849 nl_parse_begin(&s
, 1);
1850 nl_request_dump(AF_UNSPEC
, RTM_GETROUTE
);
1851 while (h
= nl_get_scan())
1852 if (h
->nlmsg_type
== RTM_NEWROUTE
|| h
->nlmsg_type
== RTM_DELROUTE
)
1853 nl_parse_route(&s
, h
);
1855 log(L_DEBUG
"nl_scan_fire: Unknown packet received (type=%d)", h
->nlmsg_type
);
1860 * Asynchronous Netlink interface
1863 static sock
*nl_async_sk
; /* BIRD socket for asynchronous notifications */
1864 static byte
*nl_async_rx_buffer
; /* Receive buffer */
1867 nl_async_msg(struct nlmsghdr
*h
)
1869 struct nl_parse_state s
;
1871 switch (h
->nlmsg_type
)
1875 DBG("KRT: Received async route notification (%d)\n", h
->nlmsg_type
);
1876 nl_parse_begin(&s
, 0);
1877 nl_parse_route(&s
, h
);
1882 DBG("KRT: Received async link notification (%d)\n", h
->nlmsg_type
);
1884 nl_parse_link(h
, 0);
1888 DBG("KRT: Received async address notification (%d)\n", h
->nlmsg_type
);
1890 nl_parse_addr(h
, 0);
1893 DBG("KRT: Received unknown async notification (%d)\n", h
->nlmsg_type
);
1898 nl_async_hook(sock
*sk
, uint size UNUSED
)
1900 struct iovec iov
= { nl_async_rx_buffer
, NL_RX_SIZE
};
1901 struct sockaddr_nl sa
;
1904 .msg_namelen
= sizeof(sa
),
1912 x
= recvmsg(sk
->fd
, &m
, 0);
1915 if (errno
== ENOBUFS
)
1918 * Netlink reports some packets have been thrown away.
1919 * One day we might react to it by asking for route table
1920 * scan in near future.
1922 log(L_WARN
"Kernel dropped some netlink messages, will resync on next scan.");
1923 return 1; /* More data are likely to be ready */
1925 else if (errno
!= EWOULDBLOCK
)
1926 log(L_ERR
"Netlink recvmsg: %m");
1929 if (sa
.nl_pid
) /* It isn't from the kernel */
1931 DBG("Non-kernel packet\n");
1934 h
= (void *) nl_async_rx_buffer
;
1936 if (m
.msg_flags
& MSG_TRUNC
)
1938 log(L_WARN
"Netlink got truncated asynchronous message");
1941 while (NLMSG_OK(h
, len
))
1944 h
= NLMSG_NEXT(h
, len
);
1947 log(L_WARN
"nl_async_hook: Found packet remnant of size %d", len
);
1952 nl_async_err_hook(sock
*sk
, int e UNUSED
)
1954 nl_async_hook(sk
, 0);
1961 struct sockaddr_nl sa
;
1967 DBG("KRT: Opening async netlink socket\n");
1969 fd
= socket(PF_NETLINK
, SOCK_RAW
, NETLINK_ROUTE
);
1972 log(L_ERR
"Unable to open asynchronous rtnetlink socket: %m");
1976 bzero(&sa
, sizeof(sa
));
1977 sa
.nl_family
= AF_NETLINK
;
1978 sa
.nl_groups
= RTMGRP_LINK
|
1979 RTMGRP_IPV4_IFADDR
| RTMGRP_IPV4_ROUTE
|
1980 RTMGRP_IPV6_IFADDR
| RTMGRP_IPV6_ROUTE
;
1982 if (bind(fd
, (struct sockaddr
*) &sa
, sizeof(sa
)) < 0)
1984 log(L_ERR
"Unable to bind asynchronous rtnetlink socket: %m");
1989 nl_async_rx_buffer
= xmalloc(NL_RX_SIZE
);
1991 sk
= nl_async_sk
= sk_new(krt_pool
);
1992 sk
->type
= SK_MAGIC
;
1993 sk
->rx_hook
= nl_async_hook
;
1994 sk
->err_hook
= nl_async_err_hook
;
1996 if (sk_open(sk
) < 0)
1997 bug("Netlink: sk_open failed");
2002 * Interface to the UNIX krt module
2006 krt_sys_io_init(void)
2008 nl_linpool
= lp_new_default(krt_pool
);
2009 HASH_INIT(nl_table_map
, krt_pool
, 6);
2013 krt_sys_start(struct krt_proto
*p
)
2015 struct krt_proto
*old
= HASH_FIND(nl_table_map
, RTH
, p
->af
, krt_table_id(p
));
2019 log(L_ERR
"%s: Kernel table %u already registered by %s",
2020 p
->p
.name
, krt_table_id(p
), old
->p
.name
);
2024 HASH_INSERT2(nl_table_map
, RTH
, krt_pool
, p
);
2033 krt_sys_shutdown(struct krt_proto
*p
)
2035 HASH_REMOVE2(nl_table_map
, RTH
, krt_pool
, p
);
2039 krt_sys_reconfigure(struct krt_proto
*p UNUSED
, struct krt_config
*n
, struct krt_config
*o
)
2041 return (n
->sys
.table_id
== o
->sys
.table_id
) && (n
->sys
.metric
== o
->sys
.metric
);
2045 krt_sys_init_config(struct krt_config
*cf
)
2047 cf
->sys
.table_id
= RT_TABLE_MAIN
;
2048 cf
->sys
.metric
= 32;
2052 krt_sys_copy_config(struct krt_config
*d
, struct krt_config
*s
)
2054 d
->sys
.table_id
= s
->sys
.table_id
;
2055 d
->sys
.metric
= s
->sys
.metric
;
2058 static const char *krt_metrics_names
[KRT_METRICS_MAX
] = {
2059 NULL
, "lock", "mtu", "window", "rtt", "rttvar", "sstresh", "cwnd", "advmss",
2060 "reordering", "hoplimit", "initcwnd", "features", "rto_min", "initrwnd", "quickack"
2063 static const char *krt_features_names
[KRT_FEATURES_MAX
] = {
2064 "ecn", NULL
, NULL
, "allfrag"
2068 krt_sys_get_attr(eattr
*a
, byte
*buf
, int buflen UNUSED
)
2072 case EA_KRT_PREFSRC
:
2073 bsprintf(buf
, "prefsrc");
2077 bsprintf(buf
, "realm");
2081 bsprintf(buf
, "scope");
2085 buf
+= bsprintf(buf
, "lock:");
2086 ea_format_bitfield(a
, buf
, buflen
, krt_metrics_names
, 2, KRT_METRICS_MAX
);
2089 case EA_KRT_FEATURES
:
2090 buf
+= bsprintf(buf
, "features:");
2091 ea_format_bitfield(a
, buf
, buflen
, krt_features_names
, 0, KRT_FEATURES_MAX
);
2095 int id
= (int)EA_ID(a
->id
) - KRT_METRICS_OFFSET
;
2096 if (id
> 0 && id
< KRT_METRICS_MAX
)
2098 bsprintf(buf
, "%s", krt_metrics_names
[id
]);
2109 kif_sys_start(struct kif_proto
*p UNUSED
)
2116 kif_sys_shutdown(struct kif_proto
*p UNUSED
)
2121 kif_update_sysdep_addr(struct iface
*i UNUSED
)