/*
 *	BIRD -- Routing Tables
 *
 *	(c) 1998--2000 Martin Mares <mj@ucw.cz>
 *
 *	Can be freely distributed and used under the terms of the GNU GPL.
 */

/**
 * DOC: Routing tables
 *
 * Routing tables are probably the most important structures BIRD uses. They
 * hold all the information about known networks, the associated routes and
 * their attributes.
 *
 * There are multiple routing tables (a primary one together with any
 * number of secondary ones if requested by the configuration). Each table
 * is basically a FIB containing entries describing the individual
 * destination networks. For each network (represented by structure &net),
 * there is a one-way linked list of route entries (&rte), the first entry
 * on the list being the best one (i.e., the one we currently use
 * for routing); the order of the other ones is undetermined.
 *
 * The &rte contains information about the route. There are net and src, which
 * together form a key identifying the route in a routing table. There is a
 * pointer to a &rta structure (see the route attribute module for a precise
 * explanation) holding the route attributes, which are primary data about the
 * route. There are several technical fields used by routing table code (route
 * id, REF_* flags). There is also the pflags field, holding protocol-specific
 * flags. They are not used by routing table code, but by protocol-specific
 * hooks. In contrast to route attributes, they are not primary data and their
 * validity is also limited to the routing table.
 *
 * There are several mechanisms that allow automatic update of routes in one
 * routing table (dst) as a result of changes in another routing table (src).
 * They handle issues of recursive next hop resolving, flowspec validation and
 * RPKI validation.
 *
 * The first such mechanism is handling of recursive next hops. A route in the
 * dst table has an indirect next hop address, which is resolved through a route
 * in the src table (which may also be the same table) to get an immediate next
 * hop. This is implemented using structure &hostcache attached to the src
 * table, which contains &hostentry structures for each tracked next hop
 * address. These structures are linked from recursive routes in dst tables,
 * possibly multiple routes sharing one hostentry (as many routes may have the
 * same indirect next hop). There is also a trie in the hostcache, which matches
 * all prefixes that may influence resolving of tracked next hops.
 *
 * When a best route changes in the src table, the hostcache is notified using
 * rt_notify_hostcache(), which immediately checks using the trie whether the
 * change is relevant and if it is, then it schedules asynchronous hostcache
 * recomputation. The recomputation is done by rt_update_hostcache() (called
 * from rt_event() of the src table), which walks through all hostentries and
 * resolves them (by rt_update_hostentry()). It also updates the trie. If a
 * change in hostentry resolution was found, then it schedules asynchronous
 * nexthop recomputation of the associated dst table. That is done by
 * rt_next_hop_update() (called from rt_event() of the dst table), which
 * iterates over all routes in the dst table and re-examines their hostentries
 * for changes. Note that in contrast to hostcache update, next hop update can
 * be interrupted by the main loop. These two full-table walks (over hostcache
 * and dst table) are necessary due to absence of direct lookups
 * (route -> affected nexthop, nexthop -> its route).
 *
 * The second mechanism is for flowspec validation, where validity of flowspec
 * routes depends on resolving their network prefixes in IP routing tables. This
 * is similar to the recursive next hop mechanism, but simpler as there are no
 * intermediate hostcache and hostentries (because flows are less likely to
 * share a common net prefix than routes sharing a common next hop). In the src
 * table, there is a list of dst tables (list flowspec_links); this list is
 * updated by flowspec channels (by rt_flowspec_link() and rt_flowspec_unlink()
 * during channel start/stop). Each dst table has its own trie of prefixes that
 * may influence validation of flowspec routes in it (flowspec_trie).
 *
 * When a best route changes in the src table, rt_flowspec_notify() immediately
 * checks all dst tables from the list using their tries to see whether the
 * change is relevant for them. If it is, then an asynchronous re-validation of
 * flowspec routes in the dst table is scheduled. That is also done by function
 * rt_next_hop_update(), like nexthop recomputation above. It iterates over all
 * flowspec routes and re-validates them. It also recalculates the trie.
 *
 * Note that in contrast to the hostcache update, here the trie is recalculated
 * during the rt_next_hop_update(), which may be interleaved with IP route
 * updates. The trie is flushed at the beginning of recalculation, which means
 * that such updates may use a partial trie to see if they are relevant. But it
 * works anyway! Either the affected flowspec was already re-validated and added
 * to the trie, in which case the IP route change would match the trie and
 * trigger a next round of re-validation, or it was not yet re-validated and
 * added to the trie, but will be re-validated later in this round anyway.
 *
 * The third mechanism is used for RPKI re-validation of IP routes and it is the
 * simplest. It is just a list of subscribers in the src table, which are
 * notified when any change happened, but only after a settle time. Also, in the
 * RPKI case the dst is not a table, but a channel, which refeeds routes through
 * a filter.
 */
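/*
 * Editor's illustration (not part of upstream BIRD; the helper below is
 * hypothetical): the per-network route list described above is a simple
 * singly-linked list headed by the best route, so it is walked like this:
 *
 *	static void hypo_dump_net(net *n)
 *	{
 *	  for (rte *e = n->routes; e; e = e->next)
 *	    log(L_TRACE "%N %uG%s", n->n.addr, e->src->global_id,
 *	        (e == n->routes) ? " [best]" : "");
 *	}
 */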
#include "nest/bird.h"
#include "nest/route.h"
#include "nest/protocol.h"
#include "nest/iface.h"
#include "lib/resource.h"
#include "lib/event.h"
#include "lib/timer.h"
#include "lib/string.h"
#include "conf/conf.h"
#include "filter/filter.h"
#include "filter/data.h"
#include "lib/hash.h"
#include "lib/alloca.h"
#include "lib/flowspec.h"

#ifdef CONFIG_BGP
#include "proto/bgp/bgp.h"
#endif
static slab *rte_slab;
static linpool *rte_update_pool;

static void rt_free_hostcache(rtable *tab);
static void rt_notify_hostcache(rtable *tab, net *net);
static void rt_update_hostcache(rtable *tab);
static void rt_next_hop_update(rtable *tab);
static inline void rt_prune_table(rtable *tab);
static inline void rt_schedule_notify(rtable *tab);
static void rt_flowspec_notify(rtable *tab, net *net);
static void rt_kick_prune_timer(rtable *tab);
static void
net_init_with_trie(struct fib *f, void *N)
{
  rtable *tab = SKIP_BACK(rtable, fib, f);
  net *n = N;

  if (tab->trie)
    trie_add_prefix(tab->trie, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen);

  if (tab->trie_new)
    trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen);
}
static inline net *
net_route_ip4_trie(rtable *t, const net_addr_ip4 *n0)
{
  TRIE_WALK_TO_ROOT_IP4(t->trie, n0, n)
  {
    net *r;
    if (r = net_find_valid(t, (net_addr *) &n))
      return r;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}
static inline net *
net_route_vpn4_trie(rtable *t, const net_addr_vpn4 *n0)
{
  TRIE_WALK_TO_ROOT_IP4(t->trie, (const net_addr_ip4 *) n0, px)
  {
    net_addr_vpn4 n = NET_ADDR_VPN4(px.prefix, px.pxlen, n0->rd);

    net *r;
    if (r = net_find_valid(t, (net_addr *) &n))
      return r;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}
static inline net *
net_route_ip6_trie(rtable *t, const net_addr_ip6 *n0)
{
  TRIE_WALK_TO_ROOT_IP6(t->trie, n0, n)
  {
    net *r;
    if (r = net_find_valid(t, (net_addr *) &n))
      return r;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}
static inline net *
net_route_vpn6_trie(rtable *t, const net_addr_vpn6 *n0)
{
  TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px)
  {
    net_addr_vpn6 n = NET_ADDR_VPN6(px.prefix, px.pxlen, n0->rd);

    net *r;
    if (r = net_find_valid(t, (net_addr *) &n))
      return r;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}
static inline net *
net_route_ip6_sadr_trie(rtable *t, const net_addr_ip6_sadr *n0)
{
  TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px)
  {
    net_addr_ip6_sadr n = NET_ADDR_IP6_SADR(px.prefix, px.pxlen, n0->src_prefix, n0->src_pxlen);
    net *best = NULL;
    int best_pxlen = 0;

    /* We need to do dst first matching. Since sadr addresses are hashed on dst
       prefix only, find the hash table chain and go through it to find the
       match with the longest matching src prefix. */
    for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next)
    {
      net_addr_ip6_sadr *a = (void *) fn->addr;

      if (net_equal_dst_ip6_sadr(&n, a) &&
	  net_in_net_src_ip6_sadr(&n, a) &&
	  (a->src_pxlen >= best_pxlen))
      {
	best = fib_node_to_user(&t->fib, fn);
	best_pxlen = a->src_pxlen;
      }
    }

    if (best)
      return best;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}
static inline net *
net_route_ip4_fib(rtable *t, const net_addr_ip4 *n0)
{
  net_addr_ip4 n;
  net_copy_ip4(&n, n0);

  net *r;
  while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
  {
    n.pxlen--;
    ip4_clrbit(&n.prefix, n.pxlen);
  }

  return r;
}
static inline net *
net_route_vpn4_fib(rtable *t, const net_addr_vpn4 *n0)
{
  net_addr_vpn4 n;
  net_copy_vpn4(&n, n0);

  net *r;
  while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
  {
    n.pxlen--;
    ip4_clrbit(&n.prefix, n.pxlen);
  }

  return r;
}
static inline net *
net_route_ip6_fib(rtable *t, const net_addr_ip6 *n0)
{
  net_addr_ip6 n;
  net_copy_ip6(&n, n0);

  net *r;
  while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
  {
    n.pxlen--;
    ip6_clrbit(&n.prefix, n.pxlen);
  }

  return r;
}
static inline net *
net_route_vpn6_fib(rtable *t, const net_addr_vpn6 *n0)
{
  net_addr_vpn6 n;
  net_copy_vpn6(&n, n0);

  net *r;
  while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
  {
    n.pxlen--;
    ip6_clrbit(&n.prefix, n.pxlen);
  }

  return r;
}
static net *
net_route_ip6_sadr_fib(rtable *t, const net_addr_ip6_sadr *n0)
{
  net_addr_ip6_sadr n;
  net_copy_ip6_sadr(&n, n0);

  while (1)
  {
    net *best = NULL;
    int best_pxlen = 0;

    /* We need to do dst first matching. Since sadr addresses are hashed on dst
       prefix only, find the hash table chain and go through it to find the
       match with the longest matching src prefix. */
    for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next)
    {
      net_addr_ip6_sadr *a = (void *) fn->addr;

      if (net_equal_dst_ip6_sadr(&n, a) &&
	  net_in_net_src_ip6_sadr(&n, a) &&
	  (a->src_pxlen >= best_pxlen))
      {
	best = fib_node_to_user(&t->fib, fn);
	best_pxlen = a->src_pxlen;
      }
    }

    if (best)
      return best;

    if (!n.dst_pxlen)
      break;

    n.dst_pxlen--;
    ip6_clrbit(&n.dst_prefix, n.dst_pxlen);
  }

  return NULL;
}
net *
net_route(rtable *tab, const net_addr *n)
{
  ASSERT(tab->addr_type == n->type);

  switch (n->type)
  {
  case NET_IP4:
    if (tab->trie)
      return net_route_ip4_trie(tab, (net_addr_ip4 *) n);
    else
      return net_route_ip4_fib (tab, (net_addr_ip4 *) n);

  case NET_VPN4:
    if (tab->trie)
      return net_route_vpn4_trie(tab, (net_addr_vpn4 *) n);
    else
      return net_route_vpn4_fib (tab, (net_addr_vpn4 *) n);

  case NET_IP6:
    if (tab->trie)
      return net_route_ip6_trie(tab, (net_addr_ip6 *) n);
    else
      return net_route_ip6_fib (tab, (net_addr_ip6 *) n);

  case NET_VPN6:
    if (tab->trie)
      return net_route_vpn6_trie(tab, (net_addr_vpn6 *) n);
    else
      return net_route_vpn6_fib (tab, (net_addr_vpn6 *) n);

  case NET_IP6_SADR:
    if (tab->trie)
      return net_route_ip6_sadr_trie(tab, (net_addr_ip6_sadr *) n);
    else
      return net_route_ip6_sadr_fib (tab, (net_addr_ip6_sadr *) n);

  default:
    return NULL;
  }
}
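/*
 * Editor's note: a small usage sketch (the helper is hypothetical, not
 * upstream code). net_route() performs longest-prefix matching, walking from
 * the given prefix towards shorter covering prefixes:
 *
 *	static rte *hypo_best_route_for(rtable *tab, const net_addr *addr)
 *	{
 *	  net *n = net_route(tab, addr);	// most specific net with a valid route
 *	  return n ? n->routes : NULL;		// first rte in the list is the best one
 *	}
 */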
static int
net_roa_check_ip4_trie(rtable *tab, const net_addr_ip4 *px, u32 asn)
{
  int anything = 0;

  TRIE_WALK_TO_ROOT_IP4(tab->trie, px, px0)
  {
    net_addr_roa4 roa0 = NET_ADDR_ROA4(px0.prefix, px0.pxlen, 0, 0);

    struct fib_node *fn;
    for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next)
    {
      net_addr_roa4 *roa = (void *) fn->addr;
      net *r = fib_node_to_user(&tab->fib, fn);

      if (net_equal_prefix_roa4(roa, &roa0) && rte_is_valid(r->routes))
      {
	anything = 1;
	if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
	  return ROA_VALID;
      }
    }
  }
  TRIE_WALK_TO_ROOT_END;

  return anything ? ROA_INVALID : ROA_UNKNOWN;
}
static int
net_roa_check_ip4_fib(rtable *tab, const net_addr_ip4 *px, u32 asn)
{
  struct net_addr_roa4 n = NET_ADDR_ROA4(px->prefix, px->pxlen, 0, 0);
  struct fib_node *fn;
  int anything = 0;

  while (1)
  {
    for (fn = fib_get_chain(&tab->fib, (net_addr *) &n); fn; fn = fn->next)
    {
      net_addr_roa4 *roa = (void *) fn->addr;
      net *r = fib_node_to_user(&tab->fib, fn);

      if (net_equal_prefix_roa4(roa, &n) && rte_is_valid(r->routes))
      {
	anything = 1;
	if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
	  return ROA_VALID;
      }
    }

    if (n.pxlen == 0)
      break;

    n.pxlen--;
    ip4_clrbit(&n.prefix, n.pxlen);
  }

  return anything ? ROA_INVALID : ROA_UNKNOWN;
}
static int
net_roa_check_ip6_trie(rtable *tab, const net_addr_ip6 *px, u32 asn)
{
  int anything = 0;

  TRIE_WALK_TO_ROOT_IP6(tab->trie, px, px0)
  {
    net_addr_roa6 roa0 = NET_ADDR_ROA6(px0.prefix, px0.pxlen, 0, 0);

    struct fib_node *fn;
    for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next)
    {
      net_addr_roa6 *roa = (void *) fn->addr;
      net *r = fib_node_to_user(&tab->fib, fn);

      if (net_equal_prefix_roa6(roa, &roa0) && rte_is_valid(r->routes))
      {
	anything = 1;
	if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
	  return ROA_VALID;
      }
    }
  }
  TRIE_WALK_TO_ROOT_END;

  return anything ? ROA_INVALID : ROA_UNKNOWN;
}
static int
net_roa_check_ip6_fib(rtable *tab, const net_addr_ip6 *px, u32 asn)
{
  struct net_addr_roa6 n = NET_ADDR_ROA6(px->prefix, px->pxlen, 0, 0);
  struct fib_node *fn;
  int anything = 0;

  while (1)
  {
    for (fn = fib_get_chain(&tab->fib, (net_addr *) &n); fn; fn = fn->next)
    {
      net_addr_roa6 *roa = (void *) fn->addr;
      net *r = fib_node_to_user(&tab->fib, fn);

      if (net_equal_prefix_roa6(roa, &n) && rte_is_valid(r->routes))
      {
	anything = 1;
	if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
	  return ROA_VALID;
      }
    }

    if (n.pxlen == 0)
      break;

    n.pxlen--;
    ip6_clrbit(&n.prefix, n.pxlen);
  }

  return anything ? ROA_INVALID : ROA_UNKNOWN;
}
/**
 * roa_check - check validity of route origination in a ROA table
 * @tab: ROA table
 * @n: network prefix to check
 * @asn: AS number of network prefix
 *
 * Implements RFC 6483 route validation for the given network prefix. The
 * procedure is to find all candidate ROAs - ROAs whose prefixes cover the given
 * network prefix. If there is no candidate ROA, return ROA_UNKNOWN. If there is
 * a candidate ROA with matching ASN and maxlen field greater than or equal to
 * the given prefix length, return ROA_VALID. Otherwise, return ROA_INVALID. If
 * the caller cannot determine the origin AS, 0 could be used (in that case
 * ROA_VALID cannot happen). Table @tab must have type NET_ROA4 or NET_ROA6,
 * network @n must have type NET_IP4 or NET_IP6, respectively.
 */
int
net_roa_check(rtable *tab, const net_addr *n, u32 asn)
{
  if ((tab->addr_type == NET_ROA4) && (n->type == NET_IP4))
  {
    if (tab->trie)
      return net_roa_check_ip4_trie(tab, (const net_addr_ip4 *) n, asn);
    else
      return net_roa_check_ip4_fib (tab, (const net_addr_ip4 *) n, asn);
  }
  else if ((tab->addr_type == NET_ROA6) && (n->type == NET_IP6))
  {
    if (tab->trie)
      return net_roa_check_ip6_trie(tab, (const net_addr_ip6 *) n, asn);
    else
      return net_roa_check_ip6_fib (tab, (const net_addr_ip6 *) n, asn);
  }
  else
    return ROA_UNKNOWN;	/* Should not happen */
}
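/*
 * Editor's note: an illustrative caller, not upstream code (the helper name
 * is hypothetical). Given a received prefix and its origin ASN, RFC 6483
 * validation against a ROA table reduces to a single call:
 *
 *	static void hypo_classify_origin(rtable *roa_tab, const net_addr *px, u32 asn)
 *	{
 *	  switch (net_roa_check(roa_tab, px, asn))
 *	  {
 *	  case ROA_VALID:
 *	    log(L_TRACE "%N AS%u: covered by a ROA with matching ASN and maxlen", px, asn);
 *	    break;
 *	  case ROA_INVALID:
 *	    log(L_TRACE "%N AS%u: covered, but no ROA matches", px, asn);
 *	    break;
 *	  case ROA_UNKNOWN:
 *	    log(L_TRACE "%N AS%u: no candidate ROA", px, asn);
 *	    break;
 *	  }
 *	}
 */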
/**
 * rte_find - find a route
 * @net: network node
 * @src: route source
 *
 * The rte_find() function returns a route for destination @net
 * which is from route source @src.
 */
rte *
rte_find(net *net, struct rte_src *src)
{
  rte *e = net->routes;

  while (e && e->src != src)
    e = e->next;
  return e;
}
/**
 * rte_get_temp - get a temporary &rte
 * @a: attributes to assign to the new route (a &rta; in case it's
 * un-cached, rte_update() will create a cached copy automatically)
 * @src: route source
 *
 * Create a temporary &rte and bind it with the attributes @a.
 */
rte *
rte_get_temp(rta *a, struct rte_src *src)
{
  rte *e = sl_alloc(rte_slab);

  e->attrs = a;
  e->id = 0;
  e->flags = 0;
  e->pflags = 0;
  rt_lock_source(e->src = src);
  return e;
}
rte *
rte_do_cow(rte *r)
{
  rte *e = sl_alloc(rte_slab);

  memcpy(e, r, sizeof(rte));

  rt_lock_source(e->src);
  e->attrs = rta_clone(r->attrs);
  e->flags = 0;
  return e;
}
/**
 * rte_cow_rta - get a private writable copy of &rte with writable &rta
 * @r: a route entry to be copied
 * @lp: a linpool from which to allocate &rta
 *
 * rte_cow_rta() takes a &rte and prepares it and associated &rta for
 * modification. There are three possibilities: First, both &rte and &rta are
 * private copies, in that case they are returned unchanged. Second, &rte is a
 * private copy, but &rta is cached, in that case &rta is duplicated using
 * rta_do_cow(). Third, &rte is shared and &rta is cached, in that case
 * both structures are duplicated by rte_do_cow() and rta_do_cow().
 *
 * Note that in the second case, cached &rta loses one reference, while the
 * private copy created by rta_do_cow() is a shallow copy sharing indirect data
 * (eattrs, nexthops, ...) with it. To work properly, the original shared &rta
 * should have another reference during the life of the created private copy.
 *
 * Result: a pointer to the new writable &rte with writable &rta.
 */
rte *
rte_cow_rta(rte *r, linpool *lp)
{
  if (!rta_is_cached(r->attrs))
    return r;

  r = rte_cow(r);
  rta *a = rta_do_cow(r->attrs, lp);
  rta_free(r->attrs);
  r->attrs = a;
  return r;
}
static int				/* Actually better or at least as good as */
rte_better(rte *new, rte *old)
{
  int (*better)(rte *, rte *);

  if (!rte_is_valid(old))
    return 1;
  if (!rte_is_valid(new))
    return 0;

  if (new->attrs->pref > old->attrs->pref)
    return 1;
  if (new->attrs->pref < old->attrs->pref)
    return 0;
  if (new->src->proto->proto != old->src->proto->proto)
  {
    /*
     *  If the user has configured protocol preferences, so that two different protocols
     *  have the same preference, try to break the tie by comparing addresses. Not too
     *  useful, but keeps the ordering of routes unambiguous.
     */
    return new->src->proto->proto > old->src->proto->proto;
  }
  if (better = new->src->proto->rte_better)
    return better(new, old);
  return 0;
}
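/*
 * Editor's note: ties between valid routes of the same preference and protocol
 * are delegated to the protocol's rte_better hook above. A hedged sketch of
 * such a hook (EA_HYPO_METRIC is a hypothetical extended attribute code):
 *
 *	static int hypo_rte_better(rte *new, rte *old)
 *	{
 *	  u32 new_metric = ea_get_int(new->attrs->eattrs, EA_HYPO_METRIC, ~0U);
 *	  u32 old_metric = ea_get_int(old->attrs->eattrs, EA_HYPO_METRIC, ~0U);
 *	  return new_metric < old_metric;	// lower metric wins
 *	}
 */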
static int
rte_mergable(rte *pri, rte *sec)
{
  int (*mergable)(rte *, rte *);

  if (!rte_is_valid(pri) || !rte_is_valid(sec))
    return 0;

  if (pri->attrs->pref != sec->attrs->pref)
    return 0;

  if (pri->src->proto->proto != sec->src->proto->proto)
    return 0;

  if (mergable = pri->src->proto->rte_mergable)
    return mergable(pri, sec);

  return 0;
}
static void
rte_trace(struct channel *c, rte *e, int dir, char *msg)
{
  log(L_TRACE "%s.%s %c %s %N %uL %uG %s",
      c->proto->name, c->name ?: "?", dir, msg, e->net->n.addr, e->src->private_id, e->src->global_id,
      rta_dest_name(e->attrs->dest));
}
static inline void
rte_trace_in(uint flag, struct channel *c, rte *e, char *msg)
{
  if ((c->debug & flag) || (c->proto->debug & flag))
    rte_trace(c, e, '>', msg);
}

static inline void
rte_trace_out(uint flag, struct channel *c, rte *e, char *msg)
{
  if ((c->debug & flag) || (c->proto->debug & flag))
    rte_trace(c, e, '<', msg);
}
static rte *
export_filter_(struct channel *c, rte *rt0, rte **rt_free, linpool *pool, int silent)
{
  struct proto *p = c->proto;
  const struct filter *filter = c->out_filter;
  struct proto_stats *stats = &c->stats;
  rte *rt;
  int v;

  rt = rt0;
  *rt_free = NULL;

  v = p->preexport ? p->preexport(c, rt) : 0;
  if (v < 0)
  {
    if (silent)
      goto reject;

    stats->exp_updates_rejected++;
    if (v == RIC_REJECT)
      rte_trace_out(D_FILTERS, c, rt, "rejected by protocol");
    goto reject;
  }
  if (v > 0)
  {
    if (!silent)
      rte_trace_out(D_FILTERS, c, rt, "forced accept by protocol");
    goto accept;
  }

  v = filter && ((filter == FILTER_REJECT) ||
		 (f_run(filter, &rt, pool,
			(silent ? FF_SILENT : 0)) > F_ACCEPT));
  if (v)
  {
    if (silent)
      goto reject;

    stats->exp_updates_filtered++;
    rte_trace_out(D_FILTERS, c, rt, "filtered out");
    goto reject;
  }

 accept:
  if (rt != rt0)
    *rt_free = rt;
  return rt;

 reject:
  /* Discard temporary rte */
  if (rt != rt0)
    rte_free(rt);
  return NULL;
}
static inline rte *
export_filter(struct channel *c, rte *rt0, rte **rt_free, int silent)
{
  return export_filter_(c, rt0, rt_free, rte_update_pool, silent);
}
static void
do_rt_notify(struct channel *c, net *net, rte *new, rte *old, int refeed)
{
  struct proto *p = c->proto;
  struct proto_stats *stats = &c->stats;

  if (refeed && new)
    c->refeed_count++;

  /* Apply export limit */
  struct channel_limit *l = &c->out_limit;
  if (l->action && !old && new)
  {
    if (stats->exp_routes >= l->limit)
      channel_notify_limit(c, l, PLD_OUT, stats->exp_routes);

    if (l->state == PLS_BLOCKED)
    {
      stats->exp_updates_rejected++;
      rte_trace_out(D_FILTERS, c, new, "rejected [limit]");
      return;
    }
  }

  /* Apply export table */
  if (c->out_table && !rte_update_out(c, net->n.addr, new, old, refeed))
    return;

  if (new)
    stats->exp_updates_accepted++;
  else
    stats->exp_withdraws_accepted++;

  if (old)
  {
    bmap_clear(&c->export_map, old->id);
    stats->exp_routes--;
  }

  if (new)
  {
    bmap_set(&c->export_map, new->id);
    stats->exp_routes++;
  }

  if (p->debug & D_ROUTES)
  {
    if (new && old)
      rte_trace_out(D_ROUTES, c, new, "replaced");
    else if (new)
      rte_trace_out(D_ROUTES, c, new, "added");
    else if (old)
      rte_trace_out(D_ROUTES, c, old, "removed");
  }

  p->rt_notify(p, c, net, new, old);
}
static void
rt_notify_basic(struct channel *c, net *net, rte *new, rte *old, int refeed)
{
  // struct proto *p = c->proto;
  rte *new_free = NULL;

  if (new)
    c->stats.exp_updates_received++;
  else
    c->stats.exp_withdraws_received++;

  if (new)
    new = export_filter(c, new, &new_free, 0);

  if (old && !bmap_test(&c->export_map, old->id))
    old = NULL;

  if (!new && !old)
    return;

  do_rt_notify(c, net, new, old, refeed);

  /* Discard temporary rte */
  if (new_free)
    rte_free(new_free);
}
static void
rt_notify_accepted(struct channel *c, net *net, rte *new_changed, rte *old_changed, int refeed)
{
  // struct proto *p = c->proto;
  rte *new_best = NULL;
  rte *old_best = NULL;
  rte *new_free = NULL;
  int new_first = 0;

  /*
   * We assume that there are no changes in net route order except (added)
   * new_changed and (removed) old_changed. Therefore, the function is not
   * compatible with deterministic_med (where nontrivial reordering can happen
   * as a result of a route change) and with recomputation of recursive routes
   * due to next hop update (where many routes can be changed in one step).
   *
   * Note that we need this assumption just for optimizations, we could just
   * run full new_best recomputation otherwise.
   *
   * There are three cases:
   * feed or old_best is old_changed -> we need to recompute new_best
   * old_best is before new_changed -> new_best is old_best, ignore
   * old_best is after new_changed -> try new_changed, otherwise old_best
   */

  if (net->routes)
    c->stats.exp_updates_received++;
  else
    c->stats.exp_withdraws_received++;

  /* Find old_best - either old_changed, or route for net->routes */
  if (old_changed && bmap_test(&c->export_map, old_changed->id))
    old_best = old_changed;
  else
  {
    for (rte *r = net->routes; rte_is_valid(r); r = r->next)
    {
      if (bmap_test(&c->export_map, r->id))
      {
	old_best = r;
	break;
      }

      /* Note if new_changed found before old_best */
      if (r == new_changed)
	new_first = 1;
    }
  }

  /* Find new_best */
  if ((new_changed == old_changed) || (old_best == old_changed))
  {
    /* Feed or old_best changed -> find first accepted by filters */
    for (rte *r = net->routes; rte_is_valid(r); r = r->next)
      if (new_best = export_filter(c, r, &new_free, 0))
	break;
  }
  else
  {
    /* Other cases -> either new_changed, or old_best (and nothing changed) */
    if (new_first && (new_changed = export_filter(c, new_changed, &new_free, 0)))
      new_best = new_changed;
    else
      return;
  }

  if (!new_best && !old_best)
    return;

  do_rt_notify(c, net, new_best, old_best, refeed);

  /* Discard temporary rte */
  if (new_free)
    rte_free(new_free);
}
static struct nexthop *
nexthop_merge_rta(struct nexthop *nhs, rta *a, linpool *pool, int max)
{
  return nexthop_merge(nhs, &(a->nh), 1, 0, max, pool);
}
rte *
rt_export_merged(struct channel *c, net *net, rte **rt_free, linpool *pool, int silent)
{
  // struct proto *p = c->proto;
  struct nexthop *nhs = NULL;
  rte *best0, *best, *rt0, *rt, *tmp;

  best0 = net->routes;
  *rt_free = NULL;

  if (!rte_is_valid(best0))
    return NULL;

  best = export_filter_(c, best0, rt_free, pool, silent);

  if (!best || !rte_is_reachable(best))
    return best;

  for (rt0 = best0->next; rt0; rt0 = rt0->next)
  {
    if (!rte_mergable(best0, rt0))
      continue;

    rt = export_filter_(c, rt0, &tmp, pool, 1);

    if (!rt)
      continue;

    if (rte_is_reachable(rt))
      nhs = nexthop_merge_rta(nhs, rt->attrs, pool, c->merge_limit);

    if (tmp)
      rte_free(tmp);
  }

  if (nhs)
  {
    nhs = nexthop_merge_rta(nhs, best->attrs, pool, c->merge_limit);

    if (nhs->next)
    {
      best = rte_cow_rta(best, pool);
      nexthop_link(best->attrs, nhs);
    }
  }

  if (best != best0)
    *rt_free = best;

  return best;
}
static void
rt_notify_merged(struct channel *c, net *net, rte *new_changed, rte *old_changed,
		 rte *new_best, rte *old_best, int refeed)
{
  // struct proto *p = c->proto;
  rte *new_free = NULL;

  /* We assume that all rte arguments are either NULL or rte_is_valid() */

  /* This check should be done by the caller */
  if (!new_best && !old_best)
    return;

  /* Check whether the change is relevant to the merged route */
  if ((new_best == old_best) &&
      (new_changed != old_changed) &&
      !rte_mergable(new_best, new_changed) &&
      !rte_mergable(old_best, old_changed))
    return;

  if (new_best)
    c->stats.exp_updates_received++;
  else
    c->stats.exp_withdraws_received++;

  /* Prepare new merged route */
  if (new_best)
    new_best = rt_export_merged(c, net, &new_free, rte_update_pool, 0);

  /* Check old merged route */
  if (old_best && !bmap_test(&c->export_map, old_best->id))
    old_best = NULL;

  if (!new_best && !old_best)
    return;

  do_rt_notify(c, net, new_best, old_best, refeed);

  /* Discard temporary rte */
  if (new_free)
    rte_free(new_free);
}
/**
 * rte_announce - announce a routing table change
 * @tab: table the route has been added to
 * @type: type of route announcement (RA_UNDEF or RA_ANY)
 * @net: network in question
 * @new: the new or changed route
 * @old: the previous route replaced by the new one
 * @new_best: the new best route for the same network
 * @old_best: the previous best route for the same network
 *
 * This function gets a routing table update and announces it to all protocols
 * that are connected to the same table by their channels.
 *
 * There are two ways of how routing table changes are announced. First, there
 * is a change of just one route in @net (which may cause a change of the best
 * route of the network). In this case @new and @old describe the changed route
 * and @new_best and @old_best describe the best routes. Other routes are not
 * affected, but in sorted tables the order of other routes might change.
 *
 * Second, there is a bulk change of multiple routes in @net, with shared best
 * route selection. In such case separate route changes are described using
 * @type of %RA_ANY, with @new and @old specifying the changed route, while
 * @new_best and @old_best are NULL. After that, another notification is done
 * where @new_best and @old_best are filled (may be the same), but @new and @old
 * are NULL.
 *
 * The function announces the change to all associated channels. For each
 * channel, an appropriate preprocessing is done according to channel &ra_mode.
 * For example, %RA_OPTIMAL channels receive just changes of best routes.
 *
 * In general, we first call the preexport() hook of a protocol, which performs
 * basic checks on the route (each protocol has a right to veto or force accept
 * of the route before any filter is asked). Then we consult an export filter
 * of the channel and verify the old route in an export map of the channel.
 * Finally, the rt_notify() hook of the protocol gets called.
 *
 * Note that there are also calls of rt_notify() hooks due to feed, but that is
 * done outside of the scope of rte_announce().
 */
static void
rte_announce(rtable *tab, uint type, net *net, rte *new, rte *old,
	     rte *new_best, rte *old_best)
{
  if (!rte_is_valid(new))
    new = NULL;

  if (!rte_is_valid(old))
    old = NULL;

  if (!rte_is_valid(new_best))
    new_best = NULL;

  if (!rte_is_valid(old_best))
    old_best = NULL;

  if (!new && !old && !new_best && !old_best)
    return;

  if (new_best != old_best)
  {
    if (new_best)
      new_best->sender->stats.pref_routes++;
    if (old_best)
      old_best->sender->stats.pref_routes--;

    if (tab->hostcache)
      rt_notify_hostcache(tab, net);

    if (!EMPTY_LIST(tab->flowspec_links))
      rt_flowspec_notify(tab, net);
  }

  rt_schedule_notify(tab);

  struct channel *c; node *n;
  WALK_LIST2(c, n, tab->channels, table_node)
  {
    if (c->export_state == ES_DOWN)
      continue;

    if (type && (type != c->ra_mode))
      continue;

    switch (c->ra_mode)
    {
    case RA_OPTIMAL:
      if (new_best != old_best)
	rt_notify_basic(c, net, new_best, old_best, 0);
      break;

    case RA_ANY:
      if (new != old)
	rt_notify_basic(c, net, new, old, 0);
      break;

    case RA_ACCEPTED:
      /*
       * The (new != old) condition is problematic here, as it would break
       * the second usage pattern (announcement after bulk change, used in
       * rt_next_hop_update_net(), which sends both new and old as NULL).
       *
       * But recursive next hops do not work with sorted tables anyways,
       * such configuration is forbidden in BGP and not supported in
       * rt_notify_accepted().
       *
       * The condition is needed to eliminate spurious announcements where
       * both old and new routes are not valid (so they are NULL).
       */
      if (new != old)
	rt_notify_accepted(c, net, new, old, 0);
      break;

    case RA_MERGED:
      rt_notify_merged(c, net, new, old, new_best, old_best, 0);
      break;
    }
  }
}
static inline int
rte_validate(rte *e)
{
  int c;
  net *n = e->net;

  if (!net_validate(n->n.addr))
  {
    log(L_WARN "Ignoring bogus prefix %N received via %s",
	n->n.addr, e->sender->proto->name);
    return 0;
  }

  /* FIXME: better handling different nettypes */
  c = !net_is_flow(n->n.addr) ?
    net_classify(n->n.addr): (IADDR_HOST | SCOPE_UNIVERSE);
  if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
  {
    log(L_WARN "Ignoring bogus route %N received via %s",
	n->n.addr, e->sender->proto->name);
    return 0;
  }

  if (net_type_match(n->n.addr, NB_DEST) == !e->attrs->dest)
  {
    /* Exception for flowspec that failed validation */
    if (net_is_flow(n->n.addr) && (e->attrs->dest == RTD_UNREACHABLE))
      return 1;

    log(L_WARN "Ignoring route %N with invalid dest %d received via %s",
	n->n.addr, e->attrs->dest, e->sender->proto->name);
    return 0;
  }

  if ((e->attrs->dest == RTD_UNICAST) && !nexthop_is_sorted(&(e->attrs->nh)))
  {
    log(L_WARN "Ignoring unsorted multipath route %N received via %s",
	n->n.addr, e->sender->proto->name);
    return 0;
  }

  return 1;
}
/**
 * rte_free - delete a &rte
 * @e: &rte to be deleted
 *
 * rte_free() deletes the given &rte from the routing table it's linked to.
 */
void
rte_free(rte *e)
{
  rt_unlock_source(e->src);
  if (rta_is_cached(e->attrs))
    rta_free(e->attrs);
  sl_free(e);
}

static inline void
rte_free_quick(rte *e)
{
  rt_unlock_source(e->src);
  rta_free(e->attrs);
  sl_free(e);
}
*x
, rte
*y
)
1212 /* rte.flags / rte.pflags are not checked, as they are internal to rtable */
1214 x
->attrs
== y
->attrs
&&
1216 rte_is_filtered(x
) == rte_is_filtered(y
);
1219 static inline int rte_is_ok(rte
*e
) { return e
&& !rte_is_filtered(e
); }
static void
rte_recalculate(struct channel *c, net *net, rte *new, struct rte_src *src)
{
  struct proto *p = c->proto;
  struct rtable *table = c->table;
  struct proto_stats *stats = &c->stats;
  static struct tbf rl_pipe = TBF_DEFAULT_LOG_LIMITS;
  rte *before_old = NULL;
  rte *old_best = net->routes;
  rte *old = NULL;
  rte **k;

  k = &net->routes;			/* Find and remove original route from the same protocol */
  while (old = *k)
  {
    if (old->src == src)
    {
      /* If there is the same route in the routing table but from
       * a different sender, then there are two paths from the
       * source protocol to this routing table through transparent
       * pipes, which is not allowed.
       *
       * We log that and ignore the route. If it is withdraw, we
       * ignore it completely (there might be 'spurious withdraws',
       * see FIXME in do_rte_announce())
       */
      if (old->sender->proto != p)
      {
	if (new)
	{
	  log_rl(&rl_pipe, L_ERR "Pipe collision detected when sending %N to table %s",
		 net->n.addr, table->name);
	  rte_free_quick(new);
	}
	return;
      }

      if (new && rte_same(old, new))
      {
	/* No changes, ignore the new route and refresh the old one */

	old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);

	if (!rte_is_filtered(new))
	{
	  stats->imp_updates_ignored++;
	  rte_trace_in(D_ROUTES, c, new, "ignored");
	}

	rte_free_quick(new);
	return;
      }
      *k = old->next;
      table->rt_count--;
      break;
    }
    k = &old->next;
    before_old = old;
  }

  /* Save the last accessed position */
  rte **pos = k;

  if (!old)
    before_old = NULL;

  if (!old && !new)
  {
    stats->imp_withdraws_ignored++;
    return;
  }

  int new_ok = rte_is_ok(new);
  int old_ok = rte_is_ok(old);

  struct channel_limit *l = &c->rx_limit;
  if (l->action && !old && new && !c->in_table)
  {
    u32 all_routes = stats->imp_routes + stats->filt_routes;

    if (all_routes >= l->limit)
      channel_notify_limit(c, l, PLD_RX, all_routes);

    if (l->state == PLS_BLOCKED)
    {
      /* In receive limit the situation is simple, old is NULL so
	 we just free new and exit like nothing happened */

      stats->imp_updates_ignored++;
      rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
      rte_free_quick(new);
      return;
    }
  }

  l = &c->in_limit;
  if (l->action && !old_ok && new_ok)
  {
    if (stats->imp_routes >= l->limit)
      channel_notify_limit(c, l, PLD_IN, stats->imp_routes);

    if (l->state == PLS_BLOCKED)
    {
      /* In import limit the situation is more complicated. We
	 shouldn't just drop the route, we should handle it like
	 it was filtered. We also have to continue the route
	 processing if old or new is non-NULL, but we should exit
	 if both are NULL as this case is probably assumed to be
	 already handled. */

      stats->imp_updates_ignored++;
      rte_trace_in(D_FILTERS, c, new, "ignored [limit]");

      if (c->in_keep_filtered)
	new->flags |= REF_FILTERED;
      else
	{ rte_free_quick(new); new = NULL; }

      /* Note that old && !new could be possible when
	 c->in_keep_filtered changed in the recent past. */

      if (!old && !new)
	return;

      new_ok = 0;
      goto skip_stats1;
    }
  }

  if (new_ok)
    stats->imp_updates_accepted++;
  else if (old_ok)
    stats->imp_withdraws_accepted++;
  else
    stats->imp_withdraws_ignored++;

  if (old_ok || new_ok)
    table->last_rt_change = current_time();

 skip_stats1:

  if (new)
    rte_is_filtered(new) ? stats->filt_routes++ : stats->imp_routes++;
  if (old)
    rte_is_filtered(old) ? stats->filt_routes-- : stats->imp_routes--;

  if (table->config->sorted)
  {
    /* If routes are sorted, just insert new route to appropriate position */
    if (new)
    {
      if (before_old && !rte_better(new, before_old))
	k = &before_old->next;
      else
	k = &net->routes;

      for (; *k; k=&(*k)->next)
	if (rte_better(new, *k))
	  break;

      new->next = *k;
      *k = new;

      table->rt_count++;
    }
  }
  else
  {
    /* If routes are not sorted, find the best route and move it on
       the first position. There are several optimized cases. */

    if (src->proto->rte_recalculate && src->proto->rte_recalculate(table, net, new, old, old_best))
      goto do_recalculate;

    if (new && rte_better(new, old_best))
    {
      /* The first case - the new route is clearly optimal,
	 we link it at the first position */

      new->next = net->routes;
      net->routes = new;

      table->rt_count++;
    }
    else if (old == old_best)
    {
      /* The second case - the old best route disappeared, we add the
	 new route (if we have any) to the list (we don't care about
	 position) and then we elect the new optimal route and relink
	 that route at the first position and announce it. New optimal
	 route might be NULL if there is no more routes */

    do_recalculate:
      /* Add the new route to the list */
      if (new)
      {
	new->next = *pos;
	*pos = new;

	table->rt_count++;
      }

      /* Find a new optimal route (if there is any) */
      if (net->routes)
      {
	rte **bp = &net->routes;
	for (k=&(*bp)->next; *k; k=&(*k)->next)
	  if (rte_better(*k, *bp))
	    bp = k;

	/* And relink it */
	rte *best = *bp;
	*bp = best->next;
	best->next = net->routes;
	net->routes = best;
      }
    }
    else if (new)
    {
      /* The third case - the new route is not better than the old
	 best route (therefore old_best != NULL) and the old best
	 route was not removed (therefore old_best == net->routes).
	 We just link the new route to the old/last position. */

      new->next = *pos;
      *pos = new;

      table->rt_count++;
    }
    /* The fourth (empty) case - suboptimal route was removed, nothing to do */
  }

  if (new)
  {
    new->lastmod = current_time();

    if (!old)
    {
      new->id = hmap_first_zero(&table->id_map);
      hmap_set(&table->id_map, new->id);
    }
    else
      new->id = old->id;
  }

  /* Log the route change */
  if ((c->debug & D_ROUTES) || (p->debug & D_ROUTES))
  {
    if (new_ok)
      rte_trace(c, new, '>', new == net->routes ? "added [best]" : "added");
    else if (old_ok)
    {
      if (old != old_best)
	rte_trace(c, old, '>', "removed");
      else if (rte_is_ok(net->routes))
	rte_trace(c, old, '>', "removed [replaced]");
      else
	rte_trace(c, old, '>', "removed [sole]");
    }
  }

  /* Propagate the route change */
  rte_announce(table, RA_UNDEF, net, new, old, net->routes, old_best);

  if (!net->routes &&
      (table->gc_counter++ >= table->config->gc_threshold))
    rt_kick_prune_timer(table);

  if (old_ok && p->rte_remove)
    p->rte_remove(net, old);
  if (new_ok && p->rte_insert)
    p->rte_insert(net, new);

  if (old)
  {
    if (!new)
      hmap_clear(&table->id_map, old->id);

    rte_free_quick(old);
  }
}
static int rte_update_nest_cnt;		/* Nesting counter to allow recursive updates */

static inline void
rte_update_lock(void)
{
  rte_update_nest_cnt++;
}

static inline void
rte_update_unlock(void)
{
  if (!--rte_update_nest_cnt)
    lp_flush(rte_update_pool);
}
/**
 * rte_update - enter a new update to a routing table
 * @table: table to be updated
 * @c: channel doing the update
 * @net: network node
 * @p: protocol submitting the update
 * @src: protocol originating the update
 * @new: a &rte representing the new route or %NULL for route removal.
 *
 * This function is called by the routing protocols whenever they discover
 * a new route or wish to update/remove an existing route. The right announcement
 * sequence is to build route attributes first (either un-cached with @aflags set
 * to zero or a cached one using rta_lookup(); in this case please note that
 * you need to increase the use count of the attributes yourself by calling
 * rta_clone()), call rte_get_temp() to obtain a temporary &rte, fill in all
 * the appropriate data and finally submit the new &rte by calling rte_update().
 *
 * @src specifies the protocol that originally created the route and the meaning
 * of protocol-dependent data of @new. If @new is not %NULL, @src has to be the
 * same value as @new->attrs->proto. @p specifies the protocol that called
 * rte_update(). In most cases it is the same protocol as @src. rte_update()
 * stores @p in @new->sender.
 *
 * When rte_update() gets any route, it automatically validates it (checks
 * whether the network and next hop address are valid IP addresses and also
 * whether a normal routing protocol doesn't try to smuggle a host or link
 * scope route to the table), converts all protocol dependent attributes stored
 * in the &rte to temporary extended attributes, consults import filters of the
 * protocol to see if the route should be accepted and/or its attributes modified,
 * and stores the temporary attributes back to the &rte.
 *
 * Now, having a "public" version of the route, we
 * automatically find any old route defined by the protocol @src
 * for network @n, replace it by the new one (or remove it if @new is %NULL),
 * recalculate the optimal route for this destination and finally broadcast
 * the change (if any) to all routing protocols by calling rte_announce().
 *
 * All memory used for attribute lists and other temporary allocations is taken
 * from a special linear pool @rte_update_pool and freed when rte_update()
 * returns.
 */
void
rte_update2(struct channel *c, const net_addr *n, rte *new, struct rte_src *src)
{
  // struct proto *p = c->proto;
  struct proto_stats *stats = &c->stats;
  const struct filter *filter = c->in_filter;
  net *nn;

  ASSERT(c->channel_state == CS_UP);

  rte_update_lock();
  if (new)
  {
    /* Create a temporary table node */
    nn = alloca(sizeof(net) + n->length);
    memset(nn, 0, sizeof(net) + n->length);
    net_copy(nn->n.addr, n);

    new->net = nn;
    new->sender = c;

    stats->imp_updates_received++;
    if (!rte_validate(new))
    {
      rte_trace_in(D_FILTERS, c, new, "invalid");
      stats->imp_updates_invalid++;
      goto drop;
    }

    if (filter == FILTER_REJECT)
    {
      stats->imp_updates_filtered++;
      rte_trace_in(D_FILTERS, c, new, "filtered out");

      if (! c->in_keep_filtered)
	goto drop;

      /* new is a private copy, i could modify it */
      new->flags |= REF_FILTERED;
    }
    else if (filter)
    {
      int fr = f_run(filter, &new, rte_update_pool, 0);
      if (fr > F_ACCEPT)
      {
	stats->imp_updates_filtered++;
	rte_trace_in(D_FILTERS, c, new, "filtered out");

	if (! c->in_keep_filtered)
	  goto drop;

	new->flags |= REF_FILTERED;
      }
    }
    if (!rta_is_cached(new->attrs)) /* Need to copy attributes */
      new->attrs = rta_lookup(new->attrs);
    new->flags |= REF_COW;

    /* Use the actual struct network, not the dummy one */
    nn = net_get(c->table, n);
    new->net = nn;
  }
  else
  {
    stats->imp_withdraws_received++;

    if (!(nn = net_find(c->table, n)) || !src)
    {
      stats->imp_withdraws_ignored++;
      rte_update_unlock();
      return;
    }
  }

 recalc:
  /* And recalculate the best route */
  rte_recalculate(c, nn, new, src);

  rte_update_unlock();
  return;

 drop:
  rte_free(new);
  new = NULL;
  if (nn = net_find(c->table, n))
    goto recalc;

  rte_update_unlock();
}
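/*
 * Editor's note: a hedged sketch of the announcement sequence documented
 * above, as seen from a protocol (the function and attribute values are
 * hypothetical; rte_update() itself is the main-channel wrapper around
 * rte_update2() declared in nest/route.h):
 *
 *	static void hypo_announce(struct channel *ch, const net_addr *n,
 *				  ip_addr gw, struct iface *ifa)
 *	{
 *	  rta a0 = {
 *	    .source = RTS_STATIC,		// placeholder route source
 *	    .scope = SCOPE_UNIVERSE,
 *	    .dest = RTD_UNICAST,
 *	    .pref = ch->preference,
 *	    .nh.gw = gw,
 *	    .nh.iface = ifa,
 *	  };
 *
 *	  // rte_update2() will cache the un-cached rta via rta_lookup()
 *	  rte *e = rte_get_temp(&a0, ch->proto->main_source);
 *	  rte_update2(ch, n, e, e->src);
 *	}
 */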
/* Independent call to rte_announce(), used from next hop
   recalculation, outside of rte_update(). new must be non-NULL */
static inline void
rte_announce_i(rtable *tab, uint type, net *net, rte *new, rte *old,
	       rte *new_best, rte *old_best)
{
  rte_update_lock();
  rte_announce(tab, type, net, new, old, new_best, old_best);
  rte_update_unlock();
}
static inline void
rte_discard(rte *old)	/* Non-filtered route deletion, used during garbage collection */
{
  rte_update_lock();
  rte_recalculate(old->sender, old->net, NULL, old->src);
  rte_update_unlock();
}
/* Modify existing route by protocol hook, used for long-lived graceful restart */
static inline void
rte_modify(rte *old)
{
  rte_update_lock();

  rte *new = old->sender->proto->rte_modify(old, rte_update_pool);
  if (new != old)
  {
    if (new)
    {
      if (!rta_is_cached(new->attrs))
	new->attrs = rta_lookup(new->attrs);
      new->flags = (old->flags & ~REF_MODIFY) | REF_COW;
    }

    rte_recalculate(old->sender, old->net, new, old->src);
  }

  rte_update_unlock();
}
/* Check rtable for best route to given net whether it would be exported to p */
int
rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter)
{
  struct proto *p = c->proto;
  net *n = net_find(t, a);
  rte *rt = n ? n->routes : NULL;

  if (!rte_is_valid(rt))
    return 0;

  rte_update_lock();

  /* Rest is stripped down export_filter() */
  int v = p->preexport ? p->preexport(c, rt) : 0;
  if (v == RIC_PROCESS)
    v = (f_run(filter, &rt, rte_update_pool, FF_SILENT) <= F_ACCEPT);

  /* Discard temporary rte */
  if (rt != n->routes)
    rte_free(rt);

  rte_update_unlock();

  return v > 0;
}
/**
 * rt_refresh_begin - start a refresh cycle
 * @t: related routing table
 * @c: related channel
 *
 * This function starts a refresh cycle for given routing table and announce
 * hook. The refresh cycle is a sequence where the protocol sends all its valid
 * routes to the routing table (by rte_update()). After that, all protocol
 * routes (more precisely routes with @c as @sender) not sent during the
 * refresh cycle but still in the table from the past are pruned. This is
 * implemented by marking all related routes as stale by the REF_STALE flag in
 * rt_refresh_begin(), then marking all related stale routes with the REF_DISCARD
 * flag in rt_refresh_end() and then removing such routes in the prune loop.
 */
void
rt_refresh_begin(rtable *t, struct channel *c)
{
  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for (e = n->routes; e; e = e->next)
      if (e->sender == c)
	e->flags |= REF_STALE;
  }
  FIB_WALK_END;
}
/**
 * rt_refresh_end - end a refresh cycle
 * @t: related routing table
 * @c: related channel
 *
 * This function ends a refresh cycle for given routing table and announce
 * hook. See rt_refresh_begin() for description of refresh cycles.
 */
void
rt_refresh_end(rtable *t, struct channel *c)
{
  int prune = 0;

  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for (e = n->routes; e; e = e->next)
      if ((e->sender == c) && (e->flags & REF_STALE))
      {
	e->flags |= REF_DISCARD;
	prune = 1;
      }
  }
  FIB_WALK_END;

  if (prune)
    rt_schedule_prune(t);
}
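/*
 * Editor's note: a hedged sketch of the refresh-cycle contract described
 * above, from a protocol's point of view (hypo_resend_routes() is a
 * hypothetical helper):
 *
 *	static void hypo_route_refresh(struct channel *ch)
 *	{
 *	  rt_refresh_begin(ch->table, ch);	// mark our routes REF_STALE
 *	  hypo_resend_routes(ch);		// rte_update() refreshes surviving routes
 *	  rt_refresh_end(ch->table, ch);	// leftovers get REF_DISCARD and are pruned
 *	}
 */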
void
rt_modify_stale(rtable *t, struct channel *c)
{
  int prune = 0;

  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for (e = n->routes; e; e = e->next)
      if ((e->sender == c) && (e->flags & REF_STALE) && !(e->flags & REF_FILTERED))
      {
	e->flags |= REF_MODIFY;
	prune = 1;
      }
  }
  FIB_WALK_END;

  if (prune)
    rt_schedule_prune(t);
}
/**
 * rte_dump - dump a route
 * @e: &rte to be dumped
 *
 * This function dumps contents of a &rte to debug output.
 */
void
rte_dump(rte *e)
{
  net *n = e->net;
  debug("%-1N ", n->n.addr);
  debug("PF=%02x ", e->pflags);
  rta_dump(e->attrs);
  debug("\n");
}
/**
 * rt_dump - dump a routing table
 * @t: routing table to be dumped
 *
 * This function dumps contents of a given routing table to debug output.
 */
void
rt_dump(rtable *t)
{
  debug("Dump of routing table <%s>\n", t->name);
#ifdef DEBUGGING
  fib_check(&t->fib);
#endif
  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for(e=n->routes; e; e=e->next)
      rte_dump(e);
  }
  FIB_WALK_END;
  debug("\n");
}
/**
 * rt_dump_all - dump all routing tables
 *
 * This function dumps contents of all routing tables to debug output.
 */
void
rt_dump_all(void)
{
  rtable *t;
  node *n;

  WALK_LIST2(t, n, routing_tables, n)
    rt_dump(t);
}
static inline void
rt_schedule_hcu(rtable *tab)
{
  if (tab->hcu_scheduled)
    return;

  tab->hcu_scheduled = 1;
  ev_schedule(tab->rt_event);
}
static inline void
rt_schedule_nhu(rtable *tab)
{
  if (tab->nhu_state == NHU_CLEAN)
    ev_schedule(tab->rt_event);

  /* state change:
   *   NHU_CLEAN   -> NHU_SCHEDULED
   *   NHU_RUNNING -> NHU_DIRTY
   */
  tab->nhu_state |= NHU_SCHEDULED;
}
void
rt_schedule_prune(rtable *tab)
{
  if (tab->prune_state == 0)
    ev_schedule(tab->rt_event);

  /* state change 0->1, 2->3 */
  tab->prune_state |= 1;
}
static void
rt_event(void *ptr)
{
  rtable *tab = ptr;

  rt_lock_table(tab);

  if (tab->hcu_scheduled)
    rt_update_hostcache(tab);

  if (tab->nhu_state)
    rt_next_hop_update(tab);

  if (tab->prune_state)
    rt_prune_table(tab);

  rt_unlock_table(tab);
}
static void
rt_prune_timer(timer *t)
{
  rtable *tab = t->data;

  if (tab->gc_counter >= tab->config->gc_threshold)
    rt_schedule_prune(tab);
}
static void
rt_kick_prune_timer(rtable *tab)
{
  /* Return if prune is already scheduled */
  if (tm_active(tab->prune_timer) || (tab->prune_state & 1))
    return;

  /* Randomize GC period to +/- 50%, e.g. a 10 S period fires in [5 S, 15 S) */
  btime gc_period = tab->config->gc_period;
  gc_period = (gc_period / 2) + (random_u32() % (uint) gc_period);
  tm_start(tab->prune_timer, gc_period);
}
static inline btime
rt_settled_time(rtable *tab)
{
  ASSUME(tab->base_settle_time != 0);

  return MIN(tab->last_rt_change + tab->config->min_settle_time,
	     tab->base_settle_time + tab->config->max_settle_time);
}
static void
rt_settle_timer(timer *t)
{
  rtable *tab = t->data;

  if (!tab->base_settle_time)
    return;

  btime settled_time = rt_settled_time(tab);
  if (current_time() < settled_time)
  {
    tm_set(tab->settle_timer, settled_time);
    return;
  }

  /* Settled */
  tab->base_settle_time = 0;

  struct rt_subscription *s;
  WALK_LIST(s, tab->subscribers)
    s->hook(s);
}
static void
rt_kick_settle_timer(rtable *tab)
{
  tab->base_settle_time = current_time();

  if (!tab->settle_timer)
    tab->settle_timer = tm_new_init(tab->rp, rt_settle_timer, tab, 0, 0);

  if (!tm_active(tab->settle_timer))
    tm_set(tab->settle_timer, rt_settled_time(tab));
}
static inline void
rt_schedule_notify(rtable *tab)
{
  if (EMPTY_LIST(tab->subscribers))
    return;

  if (tab->base_settle_time)
    return;

  rt_kick_settle_timer(tab);
}
void
rt_subscribe(rtable *tab, struct rt_subscription *s)
{
  s->tab = tab;
  rt_lock_table(tab);
  add_tail(&tab->subscribers, &s->n);
}

void
rt_unsubscribe(struct rt_subscription *s)
{
  rem_node(&s->n);
  rt_unlock_table(s->tab);
}
static struct rt_flowspec_link *
rt_flowspec_find_link(rtable *src, rtable *dst)
{
  struct rt_flowspec_link *ln;
  WALK_LIST(ln, src->flowspec_links)
    if ((ln->src == src) && (ln->dst == dst))
      return ln;

  return NULL;
}
void
rt_flowspec_link(rtable *src, rtable *dst)
{
  ASSERT(rt_is_ip(src));
  ASSERT(rt_is_flow(dst));

  struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst);

  if (!ln)
  {
    rt_lock_table(src);
    rt_lock_table(dst);

    ln = mb_allocz(src->rp, sizeof(struct rt_flowspec_link));
    ln->src = src;
    ln->dst = dst;
    add_tail(&src->flowspec_links, &ln->n);
  }

  ln->uc++;
}
void
rt_flowspec_unlink(rtable *src, rtable *dst)
{
  struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst);

  ASSERT(ln && (ln->uc > 0));

  ln->uc--;

  if (!ln->uc)
  {
    rem_node(&ln->n);
    mb_free(ln);

    rt_unlock_table(src);
    rt_unlock_table(dst);
  }
}
static void
rt_flowspec_notify(rtable *src, net *net)
{
  /* Only IP tables are src links */
  ASSERT(rt_is_ip(src));

  struct rt_flowspec_link *ln;
  WALK_LIST(ln, src->flowspec_links)
  {
    rtable *dst = ln->dst;
    ASSERT(rt_is_flow(dst));

    /* No need to inspect it further if recalculation is already active */
    if ((dst->nhu_state == NHU_SCHEDULED) || (dst->nhu_state == NHU_DIRTY))
      continue;

    if (trie_match_net(dst->flowspec_trie, net->n.addr))
      rt_schedule_nhu(dst);
  }
}
static void
rt_flowspec_reset_trie(rtable *tab)
{
  linpool *lp = tab->flowspec_trie->lp;
  int ipv4 = tab->flowspec_trie->ipv4;

  lp_flush(lp);
  tab->flowspec_trie = f_new_trie(lp, 0);
  tab->flowspec_trie->ipv4 = ipv4;
}
static void
rt_free(resource *_r)
{
  rtable *r = (rtable *) _r;

  DBG("Deleting routing table %s\n", r->name);
  ASSERT_DIE(r->use_count == 0);

  if (r->internal)
    return;

  r->config->table = NULL;
  rem_node(&r->n);

  if (r->hostcache)
    rt_free_hostcache(r);

  /* Freed automagically by the resource pool
  fib_free(&r->fib);
  hmap_free(&r->id_map);
  rfree(r->rt_event);
  rfree(r->settle_timer);
  mb_free(r);
  */
}
static void
rt_res_dump(resource *_r)
{
  rtable *r = (rtable *) _r;
  debug("name \"%s\", addr_type=%s, rt_count=%u, use_count=%d\n",
	r->name, net_label[r->addr_type], r->rt_count, r->use_count);
}

static struct resclass rt_class = {
  .name = "Routing table",
  .size = sizeof(struct rtable),
  .free = rt_free,
  .dump = rt_res_dump,
};
rtable *
rt_setup(pool *pp, struct rtable_config *cf)
{
  pool *p = rp_newf(pp, "Routing table %s", cf->name);

  rtable *t = ralloc(p, &rt_class);
  t->rp = p;

  t->name = cf->name;
  t->config = cf;
  t->addr_type = cf->addr_type;

  fib_init(&t->fib, p, t->addr_type, sizeof(net), OFFSETOF(net, n), 0, NULL);

  if (cf->trie_used)
  {
    t->trie = f_new_trie(lp_new_default(p), 0);
    t->trie->ipv4 = net_val_match(t->addr_type, NB_IP4 | NB_VPN4 | NB_ROA4);

    t->fib.init = net_init_with_trie;
  }

  init_list(&t->channels);
  init_list(&t->flowspec_links);
  init_list(&t->subscribers);

  if (!(t->internal = cf->internal))
  {
    hmap_init(&t->id_map, p, 1024);
    hmap_set(&t->id_map, 0);

    t->rt_event = ev_new_init(p, rt_event, t);
    t->prune_timer = tm_new_init(p, rt_prune_timer, t, 0, 0);
    t->last_rt_change = t->gc_time = current_time();

    if (rt_is_flow(t))
    {
      t->flowspec_trie = f_new_trie(lp_new_default(p), 0);
      t->flowspec_trie->ipv4 = (t->addr_type == NET_FLOW4);
    }
  }

  return t;
}
/**
 * rt_init - initialize routing tables
 *
 * This function is called during BIRD startup. It initializes the
 * routing table module.
 */
void
rt_init(void)
{
  rta_init();
  rt_table_pool = rp_new(&root_pool, "Routing tables");
  rte_update_pool = lp_new_default(rt_table_pool);
  rte_slab = sl_new(rt_table_pool, sizeof(rte));
  init_list(&routing_tables);
}
/**
 * rt_prune_table - prune a routing table
 *
 * The prune loop scans routing tables and removes routes belonging to flushing
 * protocols, discarded routes and also stale network entries. It is called from
 * rt_event(). The event is rescheduled if the current iteration does not finish
 * the table. The pruning is directed by the prune state (@prune_state),
 * specifying whether the prune cycle is scheduled or running, and there
 * is also a persistent pruning iterator (@prune_fit).
 *
 * The prune loop is used also for channel flushing. For this purpose, the
 * channels to flush are marked before the iteration and notified after the
 * iteration.
 */
static void
rt_prune_table(rtable *tab)
{
  struct fib_iterator *fit = &tab->prune_fit;
  int limit = 2000;

  struct channel *c;
  node *n, *x;

  DBG("Pruning route table %s\n", tab->name);
#ifdef DEBUGGING
  fib_check(&tab->fib);
#endif

  if (tab->prune_state == 0)
    return;

  if (tab->prune_state == 1)
  {
    /* Mark channels to flush */
    WALK_LIST2(c, n, tab->channels, table_node)
      if (c->channel_state == CS_FLUSHING)
	c->flush_active = 1;

    FIB_ITERATE_INIT(fit, &tab->fib);
    tab->prune_state = 2;

    tab->gc_counter = 0;
    tab->gc_time = current_time();

    if (tab->prune_trie)
    {
      /* Init prefix trie pruning */
      tab->trie_new = f_new_trie(lp_new_default(tab->rp), 0);
      tab->trie_new->ipv4 = tab->trie->ipv4;
    }
  }

again:
  FIB_ITERATE_START(&tab->fib, fit, net, n)
  {
    rte *e;

  rescan:
    if (limit <= 0)
    {
      FIB_ITERATE_PUT(fit);
      ev_schedule(tab->rt_event);
      return;
    }

    for (e=n->routes; e; e=e->next)
    {
      if (e->sender->flush_active || (e->flags & REF_DISCARD))
      {
	rte_discard(e);
	limit--;

	goto rescan;
      }

      if (e->flags & REF_MODIFY)
      {
	rte_modify(e);
	limit--;

	goto rescan;
      }
    }

    if (!n->routes)		/* Orphaned FIB entry */
    {
      FIB_ITERATE_PUT(fit);
      fib_delete(&tab->fib, n);
      goto again;
    }

    if (tab->trie_new)
    {
      trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen);
      limit--;
    }
  }
  FIB_ITERATE_END;

#ifdef DEBUGGING
  fib_check(&tab->fib);
#endif

  /* state change 2->0, 3->1 */
  tab->prune_state &= 1;

  if (tab->trie_new)
  {
    /* Finish prefix trie pruning */

    if (!tab->trie_lock_count)
    {
      rfree(tab->trie->lp);
    }
    else
    {
      ASSERT(!tab->trie_old);
      tab->trie_old = tab->trie;
      tab->trie_old_lock_count = tab->trie_lock_count;
      tab->trie_lock_count = 0;
    }

    tab->trie = tab->trie_new;
    tab->trie_new = NULL;
    tab->prune_trie = 0;
  }
  else
  {
    /* Schedule prefix trie pruning */
    if (tab->trie && !tab->trie_old && (tab->trie->prefix_count > (2 * tab->fib.entries)))
    {
      /* state change 0->1, 2->3 */
      tab->prune_state |= 1;
      tab->prune_trie = 1;
    }
  }

  if (tab->prune_state > 0)
    ev_schedule(tab->rt_event);

  /* FIXME: This should be handled in a better way */
  rt_prune_sources();

  /* Close flushed channels */
  WALK_LIST2_DELSAFE(c, n, x, tab->channels, table_node)
    if (c->flush_active)
    {
      c->flush_active = 0;
      channel_set_state(c, CS_DOWN);
    }

  return;
}
/**
 * rt_lock_trie - lock a prefix trie of a routing table
 * @tab: routing table with prefix trie to be locked
 *
 * The prune loop may rebuild the prefix trie and invalidate f_trie_walk_state
 * structures. Therefore, asynchronous walks should lock the prefix trie using
 * this function. That allows the prune loop to rebuild the trie, but postpones
 * its freeing until all walks are done (unlocked by rt_unlock_trie()).
 *
 * Return a current trie that will be locked, the value should be passed back to
 * rt_unlock_trie() for unlocking.
 */
struct f_trie *
rt_lock_trie(rtable *tab)
{
  ASSERT(tab->trie);

  tab->trie_lock_count++;
  return tab->trie;
}

/**
 * rt_unlock_trie - unlock a prefix trie of a routing table
 * @tab: routing table with prefix trie to be unlocked
 * @trie: value returned by matching rt_lock_trie()
 *
 * Done for trie locked by rt_lock_trie() after walk over the trie is done.
 * It may free the trie and schedule next trie pruning.
 */
void
rt_unlock_trie(rtable *tab, struct f_trie *trie)
{
  ASSERT(trie);

  if (trie == tab->trie)
  {
    /* Unlock the current prefix trie */
    ASSERT(tab->trie_lock_count);
    tab->trie_lock_count--;
  }
  else if (trie == tab->trie_old)
  {
    /* Unlock the old prefix trie */
    ASSERT(tab->trie_old_lock_count);
    tab->trie_old_lock_count--;

    /* Free old prefix trie that is no longer needed */
    if (!tab->trie_old_lock_count)
    {
      rfree(tab->trie_old->lp);
      tab->trie_old = NULL;

      /* Kick prefix trie pruning that was postponed */
      if (tab->trie && (tab->trie->prefix_count > (2 * tab->fib.entries)))
      {
	tab->prune_trie = 1;
	rt_schedule_prune(tab);
      }
    }
  }
  else
    log(L_BUG "Invalid arg to rt_unlock_trie()");
}
void
rt_preconfig(struct config *c)
{
  init_list(&c->tables);

  rt_new_table(cf_get_symbol("master4"), NET_IP4);
  rt_new_table(cf_get_symbol("master6"), NET_IP6);
}
void
rt_postconfig(struct config *c)
{
  uint num_tables = list_length(&c->tables);
  btime def_gc_period = 400 MS * num_tables;
  def_gc_period = MAX(def_gc_period, 10 S);
  def_gc_period = MIN(def_gc_period, 600 S);

  struct rtable_config *rc;
  WALK_LIST(rc, c->tables)
    if (rc->gc_period == (uint) -1)
      rc->gc_period = (uint) def_gc_period;
}
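/*
 * Editor's note, a worked example of the default above: with 3 tables the raw
 * value is 400 ms * 3 = 1.2 s, clamped up to the 10 s minimum; with 100 tables
 * it is 40 s; only configurations with more than 1500 tables hit the 600 s cap.
 */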
/*
 * Some functions for handling internal next hop updates
 * triggered by rt_schedule_nhu().
 */

void
rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls)
{
  a->hostentry = he;
  a->dest = he->dest;
  a->igp_metric = he->igp_metric;

  if (a->dest != RTD_UNICAST)
  {
    /* No nexthop */
  no_nexthop:
    a->nh = (struct nexthop) {};
    if (mls)
    { /* Store the label stack for later changes */
      a->nh.labels_orig = a->nh.labels = mls->len;
      memcpy(a->nh.label, mls->stack, mls->len * sizeof(u32));
    }
    return;
  }

  if (((!mls) || (!mls->len)) && he->nexthop_linkable)
  { /* Just link the nexthop chain, no label append happens. */
    memcpy(&(a->nh), &(he->src->nh), nexthop_size(&(he->src->nh)));
    return;
  }

  struct nexthop *nhp = NULL, *nhr = NULL;
  int skip_nexthop = 0;

  for (struct nexthop *nh = &(he->src->nh); nh; nh = nh->next)
  {
    if (skip_nexthop)
      skip_nexthop--;
    else
    {
      nhr = nhp;
      nhp = (nhp ? (nhp->next = lp_alloc(rte_update_pool, NEXTHOP_MAX_SIZE)) : &(a->nh));
    }

    memset(nhp, 0, NEXTHOP_MAX_SIZE);
    nhp->iface = nh->iface;
    nhp->weight = nh->weight;

    if (mls)
    {
      nhp->labels = nh->labels + mls->len;
      nhp->labels_orig = mls->len;
      if (nhp->labels <= MPLS_MAX_LABEL_STACK)
      {
	memcpy(nhp->label, nh->label, nh->labels * sizeof(u32));	/* First the hostentry labels */
	memcpy(&(nhp->label[nh->labels]), mls->stack, mls->len * sizeof(u32)); /* Then the bottom labels */
      }
      else
      {
	log(L_WARN "Sum of label stack sizes %d + %d = %d exceeds allowed maximum (%d)",
	    nh->labels, mls->len, nhp->labels, MPLS_MAX_LABEL_STACK);
	skip_nexthop++;
	continue;
      }
    }
    else if (nh->labels)
    {
      nhp->labels = nh->labels;
      nhp->labels_orig = 0;
      memcpy(nhp->label, nh->label, nh->labels * sizeof(u32));
    }

    if (ipa_nonzero(nh->gw))
    {
      nhp->gw = nh->gw;			/* Router nexthop */
      nhp->flags |= (nh->flags & RNF_ONLINK);
    }
    else if (!(nh->iface->flags & IF_MULTIACCESS) || (nh->iface->flags & IF_LOOPBACK))
      nhp->gw = IPA_NONE;		/* PtP link - no need for nexthop */
    else if (ipa_nonzero(he->link))
      nhp->gw = he->link;		/* Device nexthop with link-local address known */
    else
      nhp->gw = he->addr;		/* Device nexthop with link-local address unknown */
  }

  if (skip_nexthop)
  {
    if (nhr)
      nhr->next = NULL;
    else
    {
      a->dest = RTD_UNREACHABLE;
      log(L_WARN "No valid nexthop remaining, setting route unreachable");
      goto no_nexthop;
    }
  }
}
static inline int
rta_next_hop_outdated(rta *a)
{
  struct hostentry *he = a->hostentry;

  if (!he)
    return 0;

  if (!he->src)
    return a->dest != RTD_UNREACHABLE;

  return (a->dest != he->dest) || (a->igp_metric != he->igp_metric) ||
    (!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh));
}
static rte *
rt_next_hop_update_rte(rtable *tab UNUSED, rte *old)
{
  if (!rta_next_hop_outdated(old->attrs))
    return NULL;

  rta *a = alloca(RTA_MAX_SIZE);
  memcpy(a, old->attrs, rta_size(old->attrs));

  mpls_label_stack mls = { .len = a->nh.labels_orig };
  memcpy(mls.stack, &a->nh.label[a->nh.labels - mls.len], mls.len * sizeof(u32));

  rta_apply_hostentry(a, old->attrs->hostentry, &mls);
  a->cached = 0;

  rte *e = sl_alloc(rte_slab);
  memcpy(e, old, sizeof(rte));
  e->attrs = rta_lookup(a);
  rt_lock_source(e->src);

  return e;
}
#ifdef CONFIG_BGP

static inline int
net_flow_has_dst_prefix(const net_addr *n)
{
  ASSUME(net_is_flow(n));

  if (n->pxlen)
    return 1;

  if (n->type == NET_FLOW4)
  {
    const net_addr_flow4 *n4 = (void *) n;
    return (n4->length > sizeof(net_addr_flow4)) && (n4->data[0] == FLOW_TYPE_DST_PREFIX);
  }
  else
  {
    const net_addr_flow6 *n6 = (void *) n;
    return (n6->length > sizeof(net_addr_flow6)) && (n6->data[0] == FLOW_TYPE_DST_PREFIX);
  }
}
static int
rta_as_path_is_empty(rta *a)
{
  eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
  return !e || (as_path_getlen(e->u.ptr) == 0);
}
static u32
rta_get_first_asn(rta *a)
{
  eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
  u32 asn;

  return (e && as_path_get_first_regular(e->u.ptr, &asn)) ? asn : 0;
}
int
rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, rta *a, int interior)
{
  ASSERT(rt_is_ip(tab_ip));
  ASSERT(rt_is_flow(tab_flow));
  ASSERT(tab_ip->trie);

  /* RFC 8955 6. a) Flowspec has defined dst prefix */
  if (!net_flow_has_dst_prefix(n))
    return 0;

  /* RFC 9117 4.1. Accept AS_PATH is empty */
  if (interior && rta_as_path_is_empty(a))
    return 1;

  /* RFC 8955 6. b) Flowspec and its best-match route have the same originator */

  /* Find flowspec dst prefix */
  net_addr dst;
  if (n->type == NET_FLOW4)
    net_fill_ip4(&dst, net4_prefix(n), net4_pxlen(n));
  else
    net_fill_ip6(&dst, net6_prefix(n), net6_pxlen(n));

  /* Find best-match BGP unicast route for flowspec dst prefix */
  net *nb = net_route(tab_ip, &dst);
  rte *rb = nb ? nb->routes : NULL;

  /* Register prefix to trie for tracking further changes */
  int max_pxlen = (n->type == NET_FLOW4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH;
  trie_add_prefix(tab_flow->flowspec_trie, &dst, (nb ? nb->n.addr->pxlen : 0), max_pxlen);

  /* No best-match BGP route -> no flowspec */
  if (!rb || (rb->attrs->source != RTS_BGP))
    return 0;

  /* Find ORIGINATOR_ID values */
  u32 orig_a = ea_get_int(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);
  u32 orig_b = ea_get_int(rb->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);

  /* Originator is either ORIGINATOR_ID (if present), or BGP neighbor address (if not) */
  if ((orig_a != orig_b) || (!orig_a && !orig_b && !ipa_equal(a->from, rb->attrs->from)))
    return 0;

  /* Find ASN of the best-match route, for use in next checks */
  u32 asn_b = rta_get_first_asn(rb->attrs);
  if (!asn_b)
    return 0;

  /* RFC 9117 4.2. For EBGP, flowspec and its best-match route are from the same AS */
  if (!interior && (rta_get_first_asn(a) != asn_b))
    return 0;

  /* RFC 8955 6. c) More-specific routes are from the same AS as the best-match route */
  TRIE_WALK(tab_ip->trie, subnet, &dst)
  {
    net *nc = net_find_valid(tab_ip, &subnet);
    if (!nc)
      continue;

    rte *rc = nc->routes;
    if (rc->attrs->source != RTS_BGP)
      return 0;

    if (rta_get_first_asn(rc->attrs) != asn_b)
      return 0;
  }
  TRIE_WALK_END;

  return 1;
}

#endif /* CONFIG_BGP */
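/*
 * Editor's illustrative walk-through of rt_flowspec_check() (hypothetical
 * addresses and AS numbers, not from upstream): a flowspec rule with dst
 * prefix 10.1.0.0/16 is checked against the unicast table by finding its
 * best-match route, say 10.0.0.0/8 learned from AS 65001. The rule is then
 * valid only if both routes have the same originator, the rule's own
 * AS_PATH also starts with 65001 (for the EBGP case), and no more-specific
 * route under 10.1.0.0/16 comes from a different AS. The dst prefix is also
 * registered in flowspec_trie, so later changes to covering unicast routes
 * re-trigger the validation.
 */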
static rte *
rt_flowspec_update_rte(rtable *tab, rte *r)
{
#ifdef CONFIG_BGP
  if ((r->attrs->source != RTS_BGP) || (r->sender->proto != r->src->proto))
    return NULL;

  struct bgp_channel *bc = (struct bgp_channel *) r->sender;
  if (!bc->base_table)
    return NULL;

  const net_addr *n = r->net->n.addr;
  struct bgp_proto *p = (void *) r->src->proto;
  int valid = rt_flowspec_check(bc->base_table, tab, n, r->attrs, p->is_interior);
  int dest = valid ? RTD_NONE : RTD_UNREACHABLE;

  if (dest == r->attrs->dest)
    return NULL;

  rta *a = alloca(RTA_MAX_SIZE);
  memcpy(a, r->attrs, rta_size(r->attrs));
  a->dest = dest;
  a->cached = 0;

  rte *new = sl_alloc(rte_slab);
  memcpy(new, r, sizeof(rte));
  new->attrs = rta_lookup(a);
  rt_lock_source(new->src);

  return new;
#else
  return NULL;
#endif
}
static int
rt_next_hop_update_net(rtable *tab, net *n)
{
  rte **k, *e, *new, *old_best, **new_best;
  int count = 0;
  int free_old_best = 0;

  old_best = n->routes;
  if (!old_best)
    return 0;

  for (k = &n->routes; e = *k; k = &e->next)
  {
    if (!net_is_flow(n->n.addr))
      new = rt_next_hop_update_rte(tab, e);
    else
      new = rt_flowspec_update_rte(tab, e);

    if (new)
    {
      *k = new;

      rte_trace_in(D_ROUTES, new->sender, new, "updated");
      rte_announce_i(tab, RA_ANY, n, new, e, NULL, NULL);

      /* Call a pre-comparison hook */
      /* Not really an efficient way to compute this */
      if (e->src->proto->rte_recalculate)
        e->src->proto->rte_recalculate(tab, n, new, e, NULL);

      if (e != old_best)
        rte_free_quick(e);
      else /* Freeing of the old best rte is postponed */
        free_old_best = 1;

      e = new;
      count++;
    }
  }

  if (!count)
    return 0;

  /* Find the new best route */
  new_best = NULL;
  for (k = &n->routes; e = *k; k = &e->next)
  {
    if (!new_best || rte_better(e, *new_best))
      new_best = k;
  }

  /* Relink the new best route to the first position */
  new = *new_best;
  if (new != n->routes)
  {
    *new_best = new->next;
    new->next = n->routes;
    n->routes = new;
  }

  /* Announce the new best route */
  if (new != old_best)
    rte_trace_in(D_ROUTES, new->sender, new, "updated [best]");

  /* Propagate changes */
  rte_announce_i(tab, RA_UNDEF, n, NULL, NULL, n->routes, old_best);

  if (free_old_best)
    rte_free_quick(old_best);

  return count;
}
static void
rt_next_hop_update(rtable *tab)
{
  struct fib_iterator *fit = &tab->nhu_fit;
  int max_feed = 32;

  if (tab->nhu_state == NHU_CLEAN)
    return;

  if (tab->nhu_state == NHU_SCHEDULED)
  {
    FIB_ITERATE_INIT(fit, &tab->fib);
    tab->nhu_state = NHU_RUNNING;

    if (tab->flowspec_trie)
      rt_flowspec_reset_trie(tab);
  }

  FIB_ITERATE_START(&tab->fib, fit, net, n)
  {
    if (max_feed <= 0)
    {
      FIB_ITERATE_PUT(fit);
      ev_schedule(tab->rt_event);
      return;
    }
    max_feed -= rt_next_hop_update_net(tab, n);
  }
  FIB_ITERATE_END;

  /* State change:
   *   NHU_DIRTY -> NHU_SCHEDULED
   *   NHU_RUNNING -> NHU_CLEAN
   */
  tab->nhu_state &= 1;

  if (tab->nhu_state != NHU_CLEAN)
    ev_schedule(tab->rt_event);
}
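/*
 * Editor's note on the `tab->nhu_state &= 1` step above. The state-change
 * comment forces the encoding NHU_CLEAN = 0, NHU_SCHEDULED = 1,
 * NHU_RUNNING = 2, NHU_DIRTY = 3 (an assumption here; the constants live in
 * nest/route.h): masking with 1 maps NHU_RUNNING (2) to NHU_CLEAN (0) and
 * NHU_DIRTY (3) to NHU_SCHEDULED (1), while CLEAN and SCHEDULED are fixed
 * points. A non-CLEAN result means another update was requested while this
 * walk ran, so the table event is rescheduled to start a fresh pass.
 */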
struct rtable_config *
rt_new_table(struct symbol *s, uint addr_type)
{
  /* Hack that allows us to 'redefine' the master table */
  if ((s->class == SYM_TABLE) &&
      (s->table == new_config->def_tables[addr_type]) &&
      ((addr_type == NET_IP4) || (addr_type == NET_IP6)))
    return s->table;

  struct rtable_config *c = cfg_allocz(sizeof(struct rtable_config));

  cf_define_symbol(s, SYM_TABLE, table, c);
  c->name = s->name;
  c->addr_type = addr_type;
  c->gc_threshold = 1000;
  c->gc_period = (uint) -1;	/* set in rt_postconfig() */
  c->min_settle_time = 1 S;
  c->max_settle_time = 20 S;

  add_tail(&new_config->tables, &c->n);

  /* First table of each type is kept as default */
  if (! new_config->def_tables[addr_type])
    new_config->def_tables[addr_type] = c;

  return c;
}
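/*
 * Editor's example of configuration that reaches rt_new_table() (a minimal
 * sketch; the table names are hypothetical):
 *
 *   ipv4 table aux4;	# rt_new_table(cf_get_symbol("aux4"), NET_IP4)
 *   ipv6 table aux6;	# rt_new_table(cf_get_symbol("aux6"), NET_IP6)
 *
 * Redefining "master4" or "master6" instead takes the shortcut above and
 * returns the table preconfigured by rt_preconfig().
 */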
/**
 * rt_lock_table - lock a routing table
 * @r: routing table to be locked
 *
 * Lock a routing table, because it's in use by a protocol,
 * preventing it from being freed when it gets undefined in a new
 * configuration.
 */
void
rt_lock_table(rtable *r)
{
  r->use_count++;
}
/**
 * rt_unlock_table - unlock a routing table
 * @r: routing table to be unlocked
 *
 * Unlock a routing table formerly locked by rt_lock_table(),
 * that is, decrease its use count and delete it if it's scheduled
 * for deletion by configuration changes.
 */
void
rt_unlock_table(rtable *r)
{
  if (!--r->use_count && r->deleted)
  {
    struct config *conf = r->deleted;

    /* Delete the routing table by freeing its pool */
    rt_shutdown(r);
    config_del_obstacle(conf);
  }
}
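/*
 * Editor's usage sketch (illustrative, not upstream code): a user of a table
 * pairs the two calls around its lifetime, so a table dropped from the
 * configuration survives until its last user disconnects:
 *
 *   rt_lock_table(tab);	// e.g. while attaching a channel
 *   ...			// use tab->fib, feed routes, etc.
 *   rt_unlock_table(tab);	// may free the table if it was deleted
 */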
static int
rt_reconfigure(rtable *tab, struct rtable_config *new, struct rtable_config *old)
{
  if ((new->addr_type != old->addr_type) ||
      (new->sorted != old->sorted) ||
      (new->trie_used != old->trie_used))
    return 0;

  DBG("\t%s: same\n", new->name);
  new->table = tab;
  tab->name = new->name;
  tab->config = new;

  return 1;
}
static struct rtable_config *
rt_find_table_config(struct config *cf, char *name)
{
  struct symbol *sym = cf_find_symbol(cf, name);
  return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL;
}
/**
 * rt_commit - commit new routing table configuration
 * @new: new configuration
 * @old: original configuration or %NULL if it's boot time config
 *
 * Scan differences between @old and @new configuration and modify
 * the routing tables according to these changes. If @new defines a
 * previously unknown table, create it; if it omits a table existing
 * in @old, schedule it for deletion (it gets deleted when all protocols
 * disconnect from it by calling rt_unlock_table()); if it exists
 * in both configurations, leave it unchanged.
 */
void
rt_commit(struct config *new, struct config *old)
{
  struct rtable_config *o, *r;

  DBG("rt_commit:\n");
  if (old)
  {
    WALK_LIST(o, old->tables)
    {
      rtable *tab = o->table;
      if (tab->deleted)
        continue;

      r = rt_find_table_config(new, o->name);
      if (r && !new->shutdown && rt_reconfigure(tab, r, o))
        continue;

      DBG("\t%s: deleted\n", o->name);
      tab->deleted = old;
      config_add_obstacle(old);
      rt_lock_table(tab);
      rt_unlock_table(tab);
    }
  }

  WALK_LIST(r, new->tables)
    if (!r->table)
    {
      r->table = rt_setup(rt_table_pool, r);
      DBG("\t%s: created\n", r->name);
      add_tail(&routing_tables, &r->table->n);
    }
  DBG("\tdone\n");
}
static inline void
do_feed_channel(struct channel *c, net *n, rte *e)
{
  rte_update_lock();
  if (c->ra_mode == RA_ACCEPTED)
    rt_notify_accepted(c, n, NULL, NULL, c->refeeding);
  else if (c->ra_mode == RA_MERGED)
    rt_notify_merged(c, n, NULL, NULL, e, e, c->refeeding);
  else
    rt_notify_basic(c, n, e, e, c->refeeding);
  rte_update_unlock();
}
/**
 * rt_feed_channel - advertise all routes to a channel
 * @c: channel to be fed
 *
 * This function performs one pass of advertisement of routes to a channel that
 * is in the ES_FEEDING state. It is called by the protocol code as long as it
 * has something to do. (We avoid transferring all the routes in a single pass
 * in order not to monopolize CPU time.)
 */
int
rt_feed_channel(struct channel *c)
{
  struct fib_iterator *fit = &c->feed_fit;
  int max_feed = 256;

  ASSERT(c->export_state == ES_FEEDING);

  if (!c->feed_active)
  {
    FIB_ITERATE_INIT(fit, &c->table->fib);
    c->feed_active = 1;
  }

  FIB_ITERATE_START(&c->table->fib, fit, net, n)
  {
    rte *e = n->routes;
    if (max_feed <= 0)
    {
      FIB_ITERATE_PUT(fit);
      return 0;
    }

    if ((c->ra_mode == RA_OPTIMAL) ||
        (c->ra_mode == RA_ACCEPTED) ||
        (c->ra_mode == RA_MERGED))
      if (rte_is_valid(e))
      {
        /* In the meantime, the protocol may have gone down */
        if (c->export_state != ES_FEEDING)
          goto done;

        do_feed_channel(c, n, e);
        max_feed--;
      }

    if (c->ra_mode == RA_ANY)
      for (e = n->routes; e; e = e->next)
      {
        /* In the meantime, the protocol may have gone down */
        if (c->export_state != ES_FEEDING)
          goto done;

        if (!rte_is_valid(e))
          continue;

        do_feed_channel(c, n, e);
        max_feed--;
      }
  }
  FIB_ITERATE_END;

done:
  c->feed_active = 0;
  return 1;
}
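/*
 * Editor's sketch of the intended calling pattern (illustrative, not
 * upstream code): the protocol's feed loop calls rt_feed_channel() from its
 * event hook; a return of 0 means the pass was cut short by max_feed and the
 * saved iterator resumes on the next call, while 1 means the table walk has
 * finished:
 *
 *   if (rt_feed_channel(c))
 *     channel_feed_end(c);	// hypothetical completion step
 *   // else: reschedule the event and call again later
 */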
/**
 * rt_feed_channel_abort - abort protocol feeding
 * @c: channel
 *
 * This function is called by the protocol code when the protocol stops or
 * ceases to exist during the feeding.
 */
void
rt_feed_channel_abort(struct channel *c)
{
  if (c->feed_active)
  {
    /* Unlink the iterator */
    fit_get(&c->table->fib, &c->feed_fit);
    c->feed_active = 0;
  }
}
int
rte_update_in(struct channel *c, const net_addr *n, rte *new, struct rte_src *src)
{
  struct rtable *tab = c->in_table;
  rte *old, **pos;
  net *net;

  if (new)
  {
    net = net_get(tab, n);

    if (!rta_is_cached(new->attrs))
      new->attrs = rta_lookup(new->attrs);
  }
  else
  {
    net = net_find(tab, n);

    if (!net)
      goto drop_withdraw;
  }

  /* Find the old rte */
  for (pos = &net->routes; old = *pos; pos = &old->next)
    if (old->src == src)
    {
      if (new && rte_same(old, new))
      {
        /* Refresh the old rte, continue with update to main rtable */
        if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
        {
          old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);

          if (c->proto->rte_update_in_notify)
            c->proto->rte_update_in_notify(c, n, old, src);

          return 1;
        }

        goto drop_update;
      }

      /* Move iterator if needed */
      if (old == c->reload_next_rte)
        c->reload_next_rte = old->next;

      /* Remove the old rte */
      *pos = old->next;
      rte_free_quick(old);
      tab->rt_count--;

      break;
    }

  if (!new)
  {
    if (!old)
      goto drop_withdraw;

    if (!net->routes)
      fib_delete(&tab->fib, net);

    if (c->proto->rte_update_in_notify)
      c->proto->rte_update_in_notify(c, n, NULL, src);

    return 1;
  }

  struct channel_limit *l = &c->rx_limit;
  if (l->action && !old)
  {
    if (tab->rt_count >= l->limit)
      channel_notify_limit(c, l, PLD_RX, tab->rt_count);

    if (l->state == PLS_BLOCKED)
    {
      /* Required by rte_trace_in() */
      new->net = net;

      rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
      goto drop_update;
    }
  }

  /* Insert the new rte */
  rte *e = rte_do_cow(new);
  e->flags |= REF_COW;
  e->net = net;
  e->src = src;
  e->lastmod = current_time();
  e->next = *pos;
  *pos = e;
  tab->rt_count++;

  if (c->proto->rte_update_in_notify)
    c->proto->rte_update_in_notify(c, n, e, src);

  return 1;

drop_update:
  c->stats.imp_updates_received++;
  c->stats.imp_updates_ignored++;
  rte_free(new);

  if (!net->routes)
    fib_delete(&tab->fib, net);

  return 0;

drop_withdraw:
  c->stats.imp_withdraws_received++;
  c->stats.imp_withdraws_ignored++;
  return 0;
}
int
rt_reload_channel(struct channel *c)
{
  struct rtable *tab = c->in_table;
  struct fib_iterator *fit = &c->reload_fit;
  int max_feed = 64;

  ASSERT(c->channel_state == CS_UP);

  if (!c->reload_active)
  {
    FIB_ITERATE_INIT(fit, &tab->fib);
    c->reload_active = 1;
  }

  do {
    for (rte *e = c->reload_next_rte; e; e = e->next)
    {
      if (max_feed-- <= 0)
      {
        c->reload_next_rte = e;
        debug("%s channel reload burst split (max_feed=%d)", c->proto->name, max_feed);
        return 0;
      }

      rte_update2(c, e->net->n.addr, rte_do_cow(e), e->src);
    }

    c->reload_next_rte = NULL;

    FIB_ITERATE_START(&tab->fib, fit, net, n)
    {
      if (c->reload_next_rte = n->routes)
      {
        FIB_ITERATE_PUT_NEXT(fit, &tab->fib);
        break;
      }
    }
    FIB_ITERATE_END;
  }
  while (c->reload_next_rte);

  c->reload_active = 0;
  return 1;
}
void
rt_reload_channel_abort(struct channel *c)
{
  if (c->reload_active)
  {
    /* Unlink the iterator */
    fit_get(&c->in_table->fib, &c->reload_fit);
    c->reload_next_rte = NULL;
    c->reload_active = 0;
  }
}
void
rt_prune_sync(rtable *t, int all)
{
  struct fib_iterator fit;

  FIB_ITERATE_INIT(&fit, &t->fib);

again:
  FIB_ITERATE_START(&t->fib, &fit, net, n)
  {
    rte *e, **ee = &n->routes;

    while (e = *ee)
    {
      if (all || (e->flags & (REF_STALE | REF_DISCARD)))
      {
        *ee = e->next;
        rte_free_quick(e);
        t->rt_count--;
      }
      else
        ee = &e->next;
    }

    if (all || !n->routes)
    {
      FIB_ITERATE_PUT(&fit);
      fib_delete(&t->fib, n);
      goto again;
    }
  }
  FIB_ITERATE_END;
}
int
rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, int refeed)
{
  struct rtable *tab = c->out_table;
  struct rte_src *src;
  rte *old, **pos;
  net *net;

  if (new)
  {
    net = net_get(tab, n);
    src = new->src;

    if (!rta_is_cached(new->attrs))
      new->attrs = rta_lookup(new->attrs);
  }
  else
  {
    net = net_find(tab, n);
    src = old0->src;

    if (!net)
      goto drop_withdraw;
  }

  /* Find the old rte */
  for (pos = &net->routes; old = *pos; pos = &old->next)
    if ((c->ra_mode != RA_ANY) || (old->src == src))
    {
      if (new && rte_same(old, new))
      {
        /* REF_STALE / REF_DISCARD not used in export table */
        if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
        {
          old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
          return 1;
        }

        goto drop_update;
      }

      /* Remove the old rte */
      *pos = old->next;
      rte_free_quick(old);
      tab->rt_count--;

      break;
    }

  if (!new)
  {
    if (!old)
      goto drop_withdraw;

    if (!net->routes)
      fib_delete(&tab->fib, net);

    return 1;
  }

  /* Insert the new rte */
  rte *e = rte_do_cow(new);
  e->flags |= REF_COW;
  e->net = net;
  e->src = src;
  e->lastmod = current_time();
  e->next = *pos;
  *pos = e;
  tab->rt_count++;
  return 1;

drop_update:
  return refeed;

drop_withdraw:
  return 0;
}
/*
 *	Hostcache
 */

static inline u32
hc_hash(ip_addr a, rtable *dep)
{
  return ipa_hash(a) ^ ptr_hash(dep);
}
static inline void
hc_insert(struct hostcache *hc, struct hostentry *he)
{
  uint k = he->hash_key >> hc->hash_shift;
  he->next = hc->hash_table[k];
  hc->hash_table[k] = he;
}
static inline void
hc_remove(struct hostcache *hc, struct hostentry *he)
{
  struct hostentry **hep;
  uint k = he->hash_key >> hc->hash_shift;

  for (hep = &hc->hash_table[k]; *hep != he; hep = &(*hep)->next);
  *hep = he->next;
}
#define HC_DEF_ORDER 10
#define HC_HI_MARK *4
#define HC_HI_STEP 2
#define HC_HI_ORDER 16			/* Must be at most 16 */
#define HC_LO_MARK /5
#define HC_LO_STEP 2
#define HC_LO_ORDER 10
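/*
 * Editor's note: HC_HI_MARK and HC_LO_MARK are operator-plus-operand tokens
 * spliced into expressions, so in hc_alloc_table() below
 *
 *   (hsize HC_HI_MARK)  expands to  (hsize *4)	-- grow when items > 4 * size
 *   (hsize HC_LO_MARK)  expands to  (hsize /5)	-- shrink when items < size / 5
 *
 * which keeps the resize thresholds next to the other tuning constants.
 */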
static void
hc_alloc_table(struct hostcache *hc, pool *p, unsigned order)
{
  uint hsize = 1 << order;
  hc->hash_order = order;
  hc->hash_shift = 32 - order;
  hc->hash_max = (order >= HC_HI_ORDER) ? ~0U : (hsize HC_HI_MARK);
  hc->hash_min = (order <= HC_LO_ORDER) ?  0U : (hsize HC_LO_MARK);

  hc->hash_table = mb_allocz(p, hsize * sizeof(struct hostentry *));
}
static void
hc_resize(struct hostcache *hc, pool *p, unsigned new_order)
{
  struct hostentry **old_table = hc->hash_table;
  struct hostentry *he, *hen;
  uint old_size = 1 << hc->hash_order;
  uint i;

  hc_alloc_table(hc, p, new_order);
  for (i = 0; i < old_size; i++)
    for (he = old_table[i]; he != NULL; he = hen)
    {
      hen = he->next;
      hc_insert(hc, he);
    }
  mb_free(old_table);
}
static struct hostentry *
hc_new_hostentry(struct hostcache *hc, pool *p, ip_addr a, ip_addr ll, rtable *dep, unsigned k)
{
  struct hostentry *he = sl_alloc(hc->slab);

  *he = (struct hostentry) {
    .addr = a,
    .link = ll,
    .tab = dep,
    .hash_key = k,
  };

  add_tail(&hc->hostentries, &he->ln);
  hc_insert(hc, he);

  hc->hash_items++;
  if (hc->hash_items > hc->hash_max)
    hc_resize(hc, p, hc->hash_order + HC_HI_STEP);

  return he;
}
static void
hc_delete_hostentry(struct hostcache *hc, pool *p, struct hostentry *he)
{
  rta_free(he->src);

  rem_node(&he->ln);
  hc_remove(hc, he);
  sl_free(hc->slab, he);

  hc->hash_items--;
  if (hc->hash_items < hc->hash_min)
    hc_resize(hc, p, hc->hash_order - HC_LO_STEP);
}
static void
rt_init_hostcache(rtable *tab)
{
  struct hostcache *hc = mb_allocz(tab->rp, sizeof(struct hostcache));
  init_list(&hc->hostentries);

  hc->hash_items = 0;
  hc_alloc_table(hc, tab->rp, HC_DEF_ORDER);
  hc->slab = sl_new(tab->rp, sizeof(struct hostentry));

  hc->lp = lp_new(tab->rp);
  hc->trie = f_new_trie(hc->lp, 0);

  tab->hostcache = hc;
}
static void
rt_free_hostcache(rtable *tab)
{
  struct hostcache *hc = tab->hostcache;

  node *n;
  WALK_LIST(n, hc->hostentries)
  {
    struct hostentry *he = SKIP_BACK(struct hostentry, ln, n);
    rta_free(he->src);

    if (he->uc)
      log(L_ERR "Hostcache is not empty in table %s", tab->name);
  }

  /* Freed automagically by the resource pool
  rfree(hc->slab);
  rfree(hc->lp);
  mb_free(hc->hash_table);
  mb_free(hc);
  */
}
static void
rt_notify_hostcache(rtable *tab, net *net)
{
  if (tab->hcu_scheduled)
    return;

  if (trie_match_net(tab->hostcache->trie, net->n.addr))
    rt_schedule_hcu(tab);
}
static int
if_local_addr(ip_addr a, struct iface *i)
{
  struct ifa *b;

  WALK_LIST(b, i->addrs)
    if (ipa_equal(a, b->ip))
      return 1;

  return 0;
}
u32
rt_get_igp_metric(rte *rt)
{
  eattr *ea = ea_find(rt->attrs->eattrs, EA_GEN_IGP_METRIC);

  if (ea)
    return ea->u.data;

  if (rt->attrs->source == RTS_DEVICE)
    return 0;

  if (rt->src->proto->rte_igp_metric)
    return rt->src->proto->rte_igp_metric(rt);

  return IGP_METRIC_UNKNOWN;
}
static int
rt_update_hostentry(rtable *tab, struct hostentry *he)
{
  rta *old_src = he->src;
  int direct = 0;
  int pxlen = 0;

  /* Reset the hostentry */
  he->src = NULL;
  he->dest = RTD_UNREACHABLE;
  he->nexthop_linkable = 0;
  he->igp_metric = 0;

  net_addr he_addr;
  net_fill_ip_host(&he_addr, he->addr);
  net *n = net_route(tab, &he_addr);
  if (n)
  {
    rte *e = n->routes;
    rta *a = e->attrs;
    word pref = a->pref;

    for (rte *ee = n->routes; ee; ee = ee->next)
      if ((ee->attrs->pref >= pref) && ee->attrs->hostentry)
      {
        /* Recursive route should not depend on another recursive route */
        log(L_WARN "Next hop address %I resolvable through recursive route for %N",
            he->addr, n->n.addr);
        goto done;
      }

    pxlen = n->n.addr->pxlen;

    if (a->dest == RTD_UNICAST)
    {
      for (struct nexthop *nh = &(a->nh); nh; nh = nh->next)
        if (ipa_zero(nh->gw))
        {
          if (if_local_addr(he->addr, nh->iface))
          {
            /* The host address is a local address, this is not valid */
            log(L_WARN "Next hop address %I is a local address of iface %s",
                he->addr, nh->iface->name);
            goto done;
          }

          direct++;
        }
    }

    he->src = rta_clone(a);
    he->dest = a->dest;
    he->nexthop_linkable = !direct;
    he->igp_metric = rt_get_igp_metric(e);
  }

done:
  /* Add a prefix range to the trie */
  trie_add_prefix(tab->hostcache->trie, &he_addr, pxlen, he_addr.pxlen);

  return old_src != he->src;
}
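/*
 * Editor's illustrative example of the trie registration above (hypothetical
 * addresses): if next hop 192.0.2.1 resolved through best match 192.0.0.0/16,
 * the call records the host prefix 192.0.2.1/32 with the pxlen range
 * 16 .. 32. A later best-route change for any prefix that covers the next
 * hop and is at least as specific as /16 (say, a new 192.0.2.0/24) then
 * matches the trie in rt_notify_hostcache() and triggers recomputation,
 * while unrelated or less specific changes are filtered out cheaply.
 */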
static void
rt_update_hostcache(rtable *tab)
{
  struct hostcache *hc = tab->hostcache;
  struct hostentry *he;
  node *n, *x;

  /* Reset the trie */
  lp_flush(hc->lp);
  hc->trie = f_new_trie(hc->lp, 0);

  WALK_LIST_DELSAFE(n, x, hc->hostentries)
  {
    he = SKIP_BACK(struct hostentry, ln, n);
    if (!he->uc)
    {
      hc_delete_hostentry(hc, tab->rp, he);
      continue;
    }

    if (rt_update_hostentry(tab, he))
      rt_schedule_nhu(he->tab);
  }

  tab->hcu_scheduled = 0;
}
struct hostentry *
rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep)
{
  ip_addr link = ipa_zero(ll) ? a : ll;
  struct hostentry *he;

  if (!tab->hostcache)
    rt_init_hostcache(tab);

  u32 k = hc_hash(a, dep);
  struct hostcache *hc = tab->hostcache;
  for (he = hc->hash_table[k >> hc->hash_shift]; he != NULL; he = he->next)
    if (ipa_equal(he->addr, a) && ipa_equal(he->link, link) && (he->tab == dep))
      return he;

  he = hc_new_hostentry(hc, tab->rp, a, link, dep, k);
  rt_update_hostentry(tab, he);
  return he;
}
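/*
 * Editor's usage sketch (illustrative, not upstream code): a protocol with a
 * recursive next hop looks up (or creates) the hostentry once and links it
 * from the route attributes; rta_apply_hostentry() above then fills in the
 * immediate next hop from the cached resolution:
 *
 *   struct hostentry *he = rt_get_hostentry(igp_table, gw, ll, c->table);
 *   rta_apply_hostentry(a, he, mls);	// a, mls as in rt_next_hop_update_rte()
 */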
/*
 *  Documentation for functions declared inline in route.h
 */
#if 0

/**
 * net_find - find a network entry
 * @tab: a routing table
 * @addr: address of the network
 *
 * net_find() looks up the given network in routing table @tab and
 * returns a pointer to its &net entry or %NULL if no such network
 * exists.
 */
static inline net *net_find(rtable *tab, net_addr *addr)
{ DUMMY; }

/**
 * net_get - obtain a network entry
 * @tab: a routing table
 * @addr: address of the network
 *
 * net_get() looks up the given network in routing table @tab and
 * returns a pointer to its &net entry. If no such entry exists, it's
 * created.
 */
static inline net *net_get(rtable *tab, net_addr *addr)
{ DUMMY; }

/**
 * rte_cow - copy a route for writing
 * @r: a route entry to be copied
 *
 * rte_cow() takes a &rte and prepares it for modification. The exact action
 * taken depends on the flags of the &rte -- if it's a temporary entry, it's
 * just returned unchanged, else a new temporary entry with the same contents
 * is created.
 *
 * The primary use of this function is inside the filter machinery -- when
 * a filter wants to modify &rte contents (to change the preference or to
 * attach another set of attributes), it must ensure that the &rte is not
 * shared with anyone else (and especially that it isn't stored in any routing
 * table).
 *
 * Result: a pointer to the new writable &rte.
 */
static inline rte * rte_cow(rte *r)
{ DUMMY; }

#endif