]>
git.ipfire.org Git - thirdparty/bird.git/blob - nest/rt-table.c
ee3f11882fd613c43eb4c87be9e9f5217d9699fc
2 * BIRD -- Routing Tables
4 * (c) 1998--2000 Martin Mares <mj@ucw.cz>
6 * Can be freely distributed and used under the terms of the GNU GPL.
12 * Routing tables are probably the most important structures BIRD uses. They
13 * hold all the information about known networks, the associated routes and
16 * There are multiple routing tables (a primary one together with any
17 * number of secondary ones if requested by the configuration). Each table
18 * is basically a FIB containing entries describing the individual
19 * destination networks. For each network (represented by structure &net),
20 * there is a one-way linked list of route entries (&rte), the first entry
21 * on the list being the best one (i.e., the one we currently use
22 * for routing), the order of the other ones is undetermined.
24 * The &rte contains information about the route. There are net and src, which
25 * together forms a key identifying the route in a routing table. There is a
26 * pointer to a &rta structure (see the route attribute module for a precise
27 * explanation) holding the route attributes, which are primary data about the
28 * route. There are several technical fields used by routing table code (route
29 * id, REF_* flags), There is also the pflags field, holding protocol-specific
30 * flags. They are not used by routing table code, but by protocol-specific
31 * hooks. In contrast to route attributes, they are not primary data and their
32 * validity is also limited to the routing table.
34 * There are several mechanisms that allow automatic update of routes in one
35 * routing table (dst) as a result of changes in another routing table (src).
36 * They handle issues of recursive next hop resolving, flowspec validation and
39 * The first such mechanism is handling of recursive next hops. A route in the
40 * dst table has an indirect next hop address, which is resolved through a route
41 * in the src table (which may also be the same table) to get an immediate next
42 * hop. This is implemented using structure &hostcache attached to the src
43 * table, which contains &hostentry structures for each tracked next hop
44 * address. These structures are linked from recursive routes in dst tables,
45 * possibly multiple routes sharing one hostentry (as many routes may have the
46 * same indirect next hop). There is also a trie in the hostcache, which matches
47 * all prefixes that may influence resolving of tracked next hops.
49 * When a best route changes in the src table, the hostcache is notified using
50 * rt_notify_hostcache(), which immediately checks using the trie whether the
51 * change is relevant and if it is, then it schedules asynchronous hostcache
52 * recomputation. The recomputation is done by rt_update_hostcache() (called
53 * from rt_event() of src table), it walks through all hostentries and resolves
54 * them (by rt_update_hostentry()). It also updates the trie. If a change in
55 * hostentry resolution was found, then it schedules asynchronous nexthop
56 * recomputation of associated dst table. That is done by rt_next_hop_update()
57 * (called from rt_event() of dst table), it iterates over all routes in the dst
58 * table and re-examines their hostentries for changes. Note that in contrast to
59 * hostcache update, next hop update can be interrupted by main loop. These two
60 * full-table walks (over hostcache and dst table) are necessary due to absence
61 * of direct lookups (route -> affected nexthop, nexthop -> its route).
63 * The second mechanism is for flowspec validation, where validity of flowspec
64 * routes depends of resolving their network prefixes in IP routing tables. This
65 * is similar to the recursive next hop mechanism, but simpler as there are no
66 * intermediate hostcache and hostentries (because flows are less likely to
67 * share common net prefix than routes sharing a common next hop). In src table,
68 * there is a list of dst tables (list flowspec_links), this list is updated by
69 * flowpsec channels (by rt_flowspec_link() and rt_flowspec_unlink() during
70 * channel start/stop). Each dst table has its own trie of prefixes that may
71 * influence validation of flowspec routes in it (flowspec_trie).
73 * When a best route changes in the src table, rt_flowspec_notify() immediately
74 * checks all dst tables from the list using their tries to see whether the
75 * change is relevant for them. If it is, then an asynchronous re-validation of
76 * flowspec routes in the dst table is scheduled. That is also done by function
77 * rt_next_hop_update(), like nexthop recomputation above. It iterates over all
78 * flowspec routes and re-validates them. It also recalculates the trie.
80 * Note that in contrast to the hostcache update, here the trie is recalculated
81 * during the rt_next_hop_update(), which may be interleaved with IP route
82 * updates. The trie is flushed at the beginning of recalculation, which means
83 * that such updates may use partial trie to see if they are relevant. But it
84 * works anyway! Either affected flowspec was already re-validated and added to
85 * the trie, then IP route change would match the trie and trigger a next round
86 * of re-validation, or it was not yet re-validated and added to the trie, but
87 * will be re-validated later in this round anyway.
89 * The third mechanism is used for RPKI re-validation of IP routes and it is the
90 * simplest. It is just a list of subscribers in src table, who are notified
91 * when any change happened, but only after a settle time. Also, in RPKI case
92 * the dst is not a table, but a channel, who refeeds routes through a filter.
97 #include "nest/bird.h"
98 #include "nest/route.h"
99 #include "nest/protocol.h"
100 #include "nest/iface.h"
101 #include "nest/mpls.h"
102 #include "lib/resource.h"
103 #include "lib/event.h"
104 #include "lib/timer.h"
105 #include "lib/string.h"
106 #include "conf/conf.h"
107 #include "filter/filter.h"
108 #include "filter/data.h"
109 #include "lib/hash.h"
110 #include "lib/string.h"
111 #include "lib/alloca.h"
112 #include "lib/flowspec.h"
115 #include "proto/bgp/bgp.h"
120 static slab
*rte_slab
;
121 linpool
*rte_update_pool
;
125 static void rt_free_hostcache(rtable
*tab
);
126 static void rt_notify_hostcache(rtable
*tab
, net
*net
);
127 static void rt_update_hostcache(rtable
*tab
);
128 static void rt_next_hop_update(rtable
*tab
);
129 static inline void rt_prune_table(rtable
*tab
);
130 static inline void rt_schedule_notify(rtable
*tab
);
131 static void rt_flowspec_notify(rtable
*tab
, net
*net
);
132 static void rt_kick_prune_timer(rtable
*tab
);
136 net_init_with_trie(struct fib
*f
, void *N
)
138 rtable
*tab
= SKIP_BACK(rtable
, fib
, f
);
142 trie_add_prefix(tab
->trie
, n
->n
.addr
, n
->n
.addr
->pxlen
, n
->n
.addr
->pxlen
);
145 trie_add_prefix(tab
->trie_new
, n
->n
.addr
, n
->n
.addr
->pxlen
, n
->n
.addr
->pxlen
);
149 net_route_ip6_sadr_trie(rtable
*t
, const net_addr_ip6_sadr
*n0
)
151 TRIE_WALK_TO_ROOT_IP6(t
->trie
, (const net_addr_ip6
*) n0
, px
)
153 net_addr_ip6_sadr n
= NET_ADDR_IP6_SADR(px
.prefix
, px
.pxlen
, n0
->src_prefix
, n0
->src_pxlen
);
157 /* We need to do dst first matching. Since sadr addresses are hashed on dst
158 prefix only, find the hash table chain and go through it to find the
159 match with the longest matching src prefix. */
160 for (struct fib_node
*fn
= fib_get_chain(&t
->fib
, (net_addr
*) &n
); fn
; fn
= fn
->next
)
162 net_addr_ip6_sadr
*a
= (void *) fn
->addr
;
164 if (net_equal_dst_ip6_sadr(&n
, a
) &&
165 net_in_net_src_ip6_sadr(&n
, a
) &&
166 (a
->src_pxlen
>= best_pxlen
))
168 best
= fib_node_to_user(&t
->fib
, fn
);
169 best_pxlen
= a
->src_pxlen
;
176 TRIE_WALK_TO_ROOT_END
;
183 net_route_ip6_sadr_fib(rtable
*t
, const net_addr_ip6_sadr
*n0
)
186 net_copy_ip6_sadr(&n
, n0
);
193 /* We need to do dst first matching. Since sadr addresses are hashed on dst
194 prefix only, find the hash table chain and go through it to find the
195 match with the longest matching src prefix. */
196 for (struct fib_node
*fn
= fib_get_chain(&t
->fib
, (net_addr
*) &n
); fn
; fn
= fn
->next
)
198 net_addr_ip6_sadr
*a
= (void *) fn
->addr
;
200 if (net_equal_dst_ip6_sadr(&n
, a
) &&
201 net_in_net_src_ip6_sadr(&n
, a
) &&
202 (a
->src_pxlen
>= best_pxlen
))
204 best
= fib_node_to_user(&t
->fib
, fn
);
205 best_pxlen
= a
->src_pxlen
;
216 ip6_clrbit(&n
.dst_prefix
, n
.dst_pxlen
);
223 net_route(rtable
*tab
, const net_addr
*n
)
225 ASSERT(tab
->addr_type
== n
->type
);
226 net_addr_union
*nu
= SKIP_BACK(net_addr_union
, n
, n
);
228 #define TW(ipv, what) \
229 TRIE_WALK_TO_ROOT_IP##ipv(tab->trie, &(nu->ip##ipv), var) \
230 { what(ipv, var); } \
231 TRIE_WALK_TO_ROOT_END; return NULL;
233 #define FW(ipv, what) do { \
234 net_addr_union nuc; net_copy(&nuc.n, n); \
236 what(ipv, nuc.ip##ipv); if (!nuc.n.pxlen) return NULL; \
237 nuc.n.pxlen--; ip##ipv##_clrbit(&nuc.ip##ipv.prefix, nuc.ip##ipv.pxlen); \
239 } while(0); return NULL;
241 #define FVR_IP(ipv, var) \
242 net *r; if (r = net_find_valid(tab, (net_addr *) &var)) return r;
244 #define FVR_VPN(ipv, var) \
245 net_addr_vpn##ipv _var0 = NET_ADDR_VPN##ipv(var.prefix, var.pxlen, nu->vpn##ipv.rd); FVR_IP(ipv, _var0);
249 case NET_IP4
: TW(4, FVR_IP
);
250 case NET_VPN4
: TW(4, FVR_VPN
);
251 case NET_IP6
: TW(6, FVR_IP
);
252 case NET_VPN6
: TW(6, FVR_VPN
);
255 return net_route_ip6_sadr_trie(tab
, (net_addr_ip6_sadr
*) n
);
261 case NET_IP4
: FW(4, FVR_IP
);
262 case NET_VPN4
: FW(4, FVR_VPN
);
263 case NET_IP6
: FW(6, FVR_IP
);
264 case NET_VPN6
: FW(6, FVR_VPN
);
267 return net_route_ip6_sadr_fib (tab
, (net_addr_ip6_sadr
*) n
);
280 * roa_check - check validity of route origination in a ROA table
282 * @n: network prefix to check
283 * @asn: AS number of network prefix
285 * Implements RFC 6483 route validation for the given network prefix. The
286 * procedure is to find all candidate ROAs - ROAs whose prefixes cover the given
287 * network prefix. If there is no candidate ROA, return ROA_UNKNOWN. If there is
288 * a candidate ROA with matching ASN and maxlen field greater than or equal to
289 * the given prefix length, return ROA_VALID. Otherwise, return ROA_INVALID. If
290 * caller cannot determine origin AS, 0 could be used (in that case ROA_VALID
291 * cannot happen). Table @tab must have type NET_ROA4 or NET_ROA6, network @n
292 * must have type NET_IP4 or NET_IP6, respectively.
295 net_roa_check(rtable
*tab
, const net_addr
*n
, u32 asn
)
297 net_addr_union
*nu
= SKIP_BACK(net_addr_union
, n
, n
);
301 #define TW(ipv) do { \
302 TRIE_WALK_TO_ROOT_IP##ipv(tab->trie, &(nu->ip##ipv), var) { \
303 net_addr_roa##ipv roa0 = NET_ADDR_ROA##ipv(var.prefix, var.pxlen, 0, 0); \
304 ROA_PARTIAL_CHECK(ipv); \
305 } TRIE_WALK_TO_ROOT_END; \
306 return anything ? ROA_INVALID : ROA_UNKNOWN; \
309 #define FW(ipv) do { \
310 net_addr_roa##ipv roa0 = NET_ADDR_ROA##ipv(nu->ip##ipv.prefix, nu->ip##ipv.pxlen, 0, 0);\
312 ROA_PARTIAL_CHECK(ipv); \
313 if (roa0.pxlen == 0) break; \
314 roa0.pxlen--; ip##ipv##_clrbit(&roa0.prefix, roa0.pxlen); \
318 #define ROA_PARTIAL_CHECK(ipv) do { \
319 for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next) \
321 net_addr_roa##ipv *roa = (void *) fn->addr; \
322 net *r = fib_node_to_user(&tab->fib, fn); \
323 if (net_equal_prefix_roa##ipv(roa, &roa0) && rte_is_valid(r->routes)) \
326 if (asn && (roa->asn == asn) && (roa->max_pxlen >= nu->ip##ipv.pxlen)) \
332 if ((tab
->addr_type
== NET_ROA4
) && (n
->type
== NET_IP4
))
334 if (tab
->trie
) TW(4);
337 else if ((tab
->addr_type
== NET_ROA6
) && (n
->type
== NET_IP6
))
339 if (tab
->trie
) TW(6);
343 return anything
? ROA_INVALID
: ROA_UNKNOWN
;
344 #undef ROA_PARTIAL_CHECK
350 * aspa_check - check validity of AS Path in an ASPA table
352 * @path: AS Path to check
354 * Implements draft-ietf-sidrops-aspa-verification-16.
356 enum aspa_result
aspa_check(rtable
*tab
, const adata
*path
, bool force_upstream
)
359 lp_save(tmp_linpool
, &lps
);
361 /* No support for confed paths */
362 if (as_path_contains_confed(path
))
365 /* Check path length */
366 uint len
= as_path_getlen(path
);
370 /* Normalize the AS Path: drop stuffings */
371 u32
*asns
= alloca(sizeof(u32
) * len
);
374 while (as_path_walk(path
, &ppos
, &asns
[nsz
]))
375 if ((nsz
== 0) || (asns
[nsz
] != asns
[nsz
-1]))
378 /* Find the provider blocks for every AS on the path
379 * and check allowed directions */
380 uint max_up
= 0, min_up
= 0, max_down
= 0, min_down
= 0;
382 for (uint ap
=0; ap
<nsz
; ap
++)
384 net_addr_union nau
= { .aspa
= NET_ADDR_ASPA(asns
[ap
]), };
385 net
*n
= net_find(tab
, &nau
.n
);
387 bool found
= false, down
= false, up
= false;
389 for (rte
*e
= (n
? n
->routes
: NULL
); e
; e
= e
->next
)
391 if (!rte_is_valid(e
))
394 eattr
*ea
= ea_find(e
->attrs
->eattrs
, EA_ASPA_PROVIDERS
);
398 /* Actually found some ASPA */
401 for (uint i
=0; i
* sizeof(u32
) < ea
->u
.ptr
->length
; i
++)
403 if ((ap
> 0) && ((u32
*) ea
->u
.ptr
->data
)[i
] == asns
[ap
-1])
405 if ((ap
+ 1 < nsz
) && ((u32
*) ea
->u
.ptr
->data
)[i
] == asns
[ap
+1])
409 /* Both peers found */
415 /* Fast path for the upstream check */
419 /* Move min-upstream */
422 /* Exists but doesn't allow this upstream */
426 /* Fast path for no ASPA here */
429 /* Extend max-downstream (min-downstream is stopped by unknown) */
432 /* Move min-upstream (can't include unknown) */
436 /* ASPA exists and downstream may be extended */
439 /* Extending max-downstream always */
442 /* Extending min-downstream unless unknown seen */
446 /* Downstream only */
448 min_up
= max_up
= ap
;
451 /* No extension for downstream, force upstream only from now */
456 /* Not even upstream, move the ending here */
458 min_up
= max_up
= ap
;
462 /* Is the path surely valid? */
463 if (min_up
<= min_down
)
466 /* Is the path maybe valid? */
467 if (max_up
<= max_down
)
470 /* Now there is surely a valley there. */
475 * rte_find - find a route
479 * The rte_find() function returns a route for destination @net
480 * which is from route source @src.
483 rte_find(net
*net
, struct rte_src
*src
)
485 rte
*e
= net
->routes
;
487 while (e
&& e
->src
!= src
)
493 * rte_get_temp - get a temporary &rte
494 * @a: attributes to assign to the new route (a &rta; in case it's
495 * un-cached, rte_update() will create a cached copy automatically)
498 * Create a temporary &rte and bind it with the attributes @a.
501 rte_get_temp(rta
*a
, struct rte_src
*src
)
503 rte
*e
= sl_alloc(rte_slab
);
509 rt_lock_source(e
->src
= src
);
516 rte
*e
= sl_alloc(rte_slab
);
518 memcpy(e
, r
, sizeof(rte
));
520 rt_lock_source(e
->src
);
521 e
->attrs
= rta_clone(r
->attrs
);
527 * rte_cow_rta - get a private writable copy of &rte with writable &rta
528 * @r: a route entry to be copied
529 * @lp: a linpool from which to allocate &rta
531 * rte_cow_rta() takes a &rte and prepares it and associated &rta for
532 * modification. There are three possibilities: First, both &rte and &rta are
533 * private copies, in that case they are returned unchanged. Second, &rte is
534 * private copy, but &rta is cached, in that case &rta is duplicated using
535 * rta_do_cow(). Third, both &rte is shared and &rta is cached, in that case
536 * both structures are duplicated by rte_do_cow() and rta_do_cow().
538 * Note that in the second case, cached &rta loses one reference, while private
539 * copy created by rta_do_cow() is a shallow copy sharing indirect data (eattrs,
540 * nexthops, ...) with it. To work properly, original shared &rta should have
541 * another reference during the life of created private copy.
543 * Result: a pointer to the new writable &rte with writable &rta.
546 rte_cow_rta(rte
*r
, linpool
*lp
)
548 if (!rta_is_cached(r
->attrs
))
552 rta
*a
= rta_do_cow(r
->attrs
, lp
);
558 static int /* Actually better or at least as good as */
559 rte_better(rte
*new, rte
*old
)
561 int (*better
)(rte
*, rte
*);
563 if (!rte_is_valid(old
))
565 if (!rte_is_valid(new))
568 if (new->attrs
->pref
> old
->attrs
->pref
)
570 if (new->attrs
->pref
< old
->attrs
->pref
)
572 if (new->src
->proto
->proto
!= old
->src
->proto
->proto
)
575 * If the user has configured protocol preferences, so that two different protocols
576 * have the same preference, try to break the tie by comparing addresses. Not too
577 * useful, but keeps the ordering of routes unambiguous.
579 return new->src
->proto
->proto
> old
->src
->proto
->proto
;
581 if (better
= new->src
->proto
->rte_better
)
582 return better(new, old
);
587 rte_mergable(rte
*pri
, rte
*sec
)
589 int (*mergable
)(rte
*, rte
*);
591 if (!rte_is_valid(pri
) || !rte_is_valid(sec
))
594 if (pri
->attrs
->pref
!= sec
->attrs
->pref
)
597 if (pri
->src
->proto
->proto
!= sec
->src
->proto
->proto
)
600 if (mergable
= pri
->src
->proto
->rte_mergable
)
601 return mergable(pri
, sec
);
607 rte_trace(struct channel
*c
, rte
*e
, int dir
, char *msg
)
609 log(L_TRACE
"%s.%s %c %s %N %luL %uG %s",
610 c
->proto
->name
, c
->name
?: "?", dir
, msg
, e
->net
->n
.addr
, e
->src
->private_id
, e
->src
->global_id
,
611 rta_dest_name(e
->attrs
->dest
));
615 rte_trace_in(uint flag
, struct channel
*c
, rte
*e
, char *msg
)
617 if ((c
->debug
& flag
) || (c
->proto
->debug
& flag
))
618 rte_trace(c
, e
, '>', msg
);
622 rte_trace_out(uint flag
, struct channel
*c
, rte
*e
, char *msg
)
624 if ((c
->debug
& flag
) || (c
->proto
->debug
& flag
))
625 rte_trace(c
, e
, '<', msg
);
629 export_filter_(struct channel
*c
, rte
*rt0
, rte
**rt_free
, linpool
*pool
, int silent
)
631 struct proto
*p
= c
->proto
;
632 const struct filter
*filter
= c
->out_filter
;
633 struct proto_stats
*stats
= &c
->stats
;
640 v
= p
->preexport
? p
->preexport(c
, rt
) : 0;
646 stats
->exp_updates_rejected
++;
648 rte_trace_out(D_FILTERS
, c
, rt
, "rejected by protocol");
654 rte_trace_out(D_FILTERS
, c
, rt
, "forced accept by protocol");
658 v
= filter
&& ((filter
== FILTER_REJECT
) ||
659 (f_run(filter
, &rt
, pool
,
660 (silent
? FF_SILENT
: 0)) > F_ACCEPT
));
666 stats
->exp_updates_filtered
++;
667 rte_trace_out(D_FILTERS
, c
, rt
, "filtered out");
677 /* Discard temporary rte */
684 export_filter(struct channel
*c
, rte
*rt0
, rte
**rt_free
, int silent
)
686 return export_filter_(c
, rt0
, rt_free
, rte_update_pool
, silent
);
690 do_rt_notify(struct channel
*c
, net
*net
, rte
*new, rte
*old
, int refeed
)
692 struct proto
*p
= c
->proto
;
693 struct proto_stats
*stats
= &c
->stats
;
698 /* Apply export limit */
699 struct channel_limit
*l
= &c
->out_limit
;
700 if (l
->action
&& !old
&& new)
702 if (stats
->exp_routes
>= l
->limit
)
703 channel_notify_limit(c
, l
, PLD_OUT
, stats
->exp_routes
);
705 if (l
->state
== PLS_BLOCKED
)
707 stats
->exp_updates_rejected
++;
708 rte_trace_out(D_FILTERS
, c
, new, "rejected [limit]");
713 /* Apply export table */
714 if (c
->out_table
&& !rte_update_out(c
, net
->n
.addr
, new, old
, refeed
))
718 stats
->exp_updates_accepted
++;
720 stats
->exp_withdraws_accepted
++;
724 bmap_clear(&c
->export_map
, old
->id
);
730 bmap_set(&c
->export_map
, new->id
);
734 if (p
->debug
& D_ROUTES
)
737 rte_trace_out(D_ROUTES
, c
, new, "replaced");
739 rte_trace_out(D_ROUTES
, c
, new, "added");
741 rte_trace_out(D_ROUTES
, c
, old
, "removed");
744 p
->rt_notify(p
, c
, net
, new, old
);
748 rt_notify_basic(struct channel
*c
, net
*net
, rte
*new, rte
*old
, int refeed
)
750 // struct proto *p = c->proto;
751 rte
*new_free
= NULL
;
754 c
->stats
.exp_updates_received
++;
756 c
->stats
.exp_withdraws_received
++;
759 new = export_filter(c
, new, &new_free
, 0);
761 if (old
&& !bmap_test(&c
->export_map
, old
->id
))
767 do_rt_notify(c
, net
, new, old
, refeed
);
769 /* Discard temporary rte */
775 rt_notify_accepted(struct channel
*c
, net
*net
, rte
*new_changed
, rte
*old_changed
, int refeed
)
777 // struct proto *p = c->proto;
778 rte
*new_best
= NULL
;
779 rte
*old_best
= NULL
;
780 rte
*new_free
= NULL
;
784 * We assume that there are no changes in net route order except (added)
785 * new_changed and (removed) old_changed. Therefore, the function is not
786 * compatible with deterministic_med (where nontrivial reordering can happen
787 * as a result of a route change) and with recomputation of recursive routes
788 * due to next hop update (where many routes can be changed in one step).
790 * Note that we need this assumption just for optimizations, we could just
791 * run full new_best recomputation otherwise.
793 * There are three cases:
794 * feed or old_best is old_changed -> we need to recompute new_best
795 * old_best is before new_changed -> new_best is old_best, ignore
796 * old_best is after new_changed -> try new_changed, otherwise old_best
800 c
->stats
.exp_updates_received
++;
802 c
->stats
.exp_withdraws_received
++;
804 /* Find old_best - either old_changed, or route for net->routes */
805 if (old_changed
&& bmap_test(&c
->export_map
, old_changed
->id
))
806 old_best
= old_changed
;
809 for (rte
*r
= net
->routes
; rte_is_valid(r
); r
= r
->next
)
811 if (bmap_test(&c
->export_map
, r
->id
))
817 /* Note if new_changed found before old_best */
818 if (r
== new_changed
)
824 if ((new_changed
== old_changed
) || (old_best
== old_changed
))
826 /* Feed or old_best changed -> find first accepted by filters */
827 for (rte
*r
= net
->routes
; rte_is_valid(r
); r
= r
->next
)
828 if (new_best
= export_filter(c
, r
, &new_free
, 0))
833 /* Other cases -> either new_changed, or old_best (and nothing changed) */
834 if (new_first
&& (new_changed
= export_filter(c
, new_changed
, &new_free
, 0)))
835 new_best
= new_changed
;
840 if (!new_best
&& !old_best
)
843 do_rt_notify(c
, net
, new_best
, old_best
, refeed
);
845 /* Discard temporary rte */
851 static struct nexthop
*
852 nexthop_merge_rta(struct nexthop
*nhs
, rta
*a
, linpool
*pool
, int max
)
854 return nexthop_merge(nhs
, &(a
->nh
), 1, 0, max
, pool
);
858 rt_export_merged(struct channel
*c
, net
*net
, rte
**rt_free
, linpool
*pool
, int silent
)
860 // struct proto *p = c->proto;
861 struct nexthop
*nhs
= NULL
;
862 rte
*best0
, *best
, *rt0
, *rt
, *tmp
;
867 if (!rte_is_valid(best0
))
870 best
= export_filter_(c
, best0
, rt_free
, pool
, silent
);
872 if (!best
|| !rte_is_reachable(best
))
875 for (rt0
= best0
->next
; rt0
; rt0
= rt0
->next
)
877 if (!rte_mergable(best0
, rt0
))
880 rt
= export_filter_(c
, rt0
, &tmp
, pool
, 1);
885 if (rte_is_reachable(rt
))
886 nhs
= nexthop_merge_rta(nhs
, rt
->attrs
, pool
, c
->merge_limit
);
894 nhs
= nexthop_merge_rta(nhs
, best
->attrs
, pool
, c
->merge_limit
);
898 best
= rte_cow_rta(best
, pool
);
899 nexthop_link(best
->attrs
, nhs
);
910 rt_notify_merged(struct channel
*c
, net
*net
, rte
*new_changed
, rte
*old_changed
,
911 rte
*new_best
, rte
*old_best
, int refeed
)
913 // struct proto *p = c->proto;
914 rte
*new_free
= NULL
;
916 /* We assume that all rte arguments are either NULL or rte_is_valid() */
918 /* This check should be done by the caller */
919 if (!new_best
&& !old_best
)
922 /* Check whether the change is relevant to the merged route */
923 if ((new_best
== old_best
) &&
924 (new_changed
!= old_changed
) &&
925 !rte_mergable(new_best
, new_changed
) &&
926 !rte_mergable(old_best
, old_changed
))
930 c
->stats
.exp_updates_received
++;
932 c
->stats
.exp_withdraws_received
++;
934 /* Prepare new merged route */
936 new_best
= rt_export_merged(c
, net
, &new_free
, rte_update_pool
, 0);
938 /* Check old merged route */
939 if (old_best
&& !bmap_test(&c
->export_map
, old_best
->id
))
942 if (!new_best
&& !old_best
)
945 do_rt_notify(c
, net
, new_best
, old_best
, refeed
);
947 /* Discard temporary rte */
954 * rte_announce - announce a routing table change
955 * @tab: table the route has been added to
956 * @type: type of route announcement (RA_UNDEF or RA_ANY)
957 * @net: network in question
958 * @new: the new or changed route
959 * @old: the previous route replaced by the new one
960 * @new_best: the new best route for the same network
961 * @old_best: the previous best route for the same network
963 * This function gets a routing table update and announces it to all protocols
964 * that are connected to the same table by their channels.
966 * There are two ways of how routing table changes are announced. First, there
967 * is a change of just one route in @net (which may caused a change of the best
968 * route of the network). In this case @new and @old describes the changed route
969 * and @new_best and @old_best describes best routes. Other routes are not
970 * affected, but in sorted table the order of other routes might change.
972 * Second, There is a bulk change of multiple routes in @net, with shared best
973 * route selection. In such case separate route changes are described using
974 * @type of %RA_ANY, with @new and @old specifying the changed route, while
975 * @new_best and @old_best are NULL. After that, another notification is done
976 * where @new_best and @old_best are filled (may be the same), but @new and @old
979 * The function announces the change to all associated channels. For each
980 * channel, an appropriate preprocessing is done according to channel &ra_mode.
981 * For example, %RA_OPTIMAL channels receive just changes of best routes.
983 * In general, we first call preexport() hook of a protocol, which performs
984 * basic checks on the route (each protocol has a right to veto or force accept
985 * of the route before any filter is asked). Then we consult an export filter
986 * of the channel and verify the old route in an export map of the channel.
987 * Finally, the rt_notify() hook of the protocol gets called.
989 * Note that there are also calls of rt_notify() hooks due to feed, but that is
990 * done outside of scope of rte_announce().
993 rte_announce(rtable
*tab
, uint type
, net
*net
, rte
*new, rte
*old
,
994 rte
*new_best
, rte
*old_best
)
996 if (!rte_is_valid(new))
999 if (!rte_is_valid(old
))
1002 if (!rte_is_valid(new_best
))
1005 if (!rte_is_valid(old_best
))
1008 if (!new && !old
&& !new_best
&& !old_best
)
1011 if (new_best
!= old_best
)
1014 new_best
->sender
->stats
.pref_routes
++;
1016 old_best
->sender
->stats
.pref_routes
--;
1019 rt_notify_hostcache(tab
, net
);
1021 if (!EMPTY_LIST(tab
->flowspec_links
))
1022 rt_flowspec_notify(tab
, net
);
1025 rt_schedule_notify(tab
);
1027 struct channel
*c
; node
*n
;
1028 WALK_LIST2(c
, n
, tab
->channels
, table_node
)
1030 if (c
->export_state
== ES_DOWN
)
1033 if (type
&& (type
!= c
->ra_mode
))
1039 if (new_best
!= old_best
)
1040 rt_notify_basic(c
, net
, new_best
, old_best
, 0);
1045 rt_notify_basic(c
, net
, new, old
, 0);
1050 * The (new != old) condition is problematic here, as it would break
1051 * the second usage pattern (announcement after bulk change, used in
1052 * rt_next_hop_update_net(), which sends both new and old as NULL).
1054 * But recursive next hops do not work with sorted tables anyways,
1055 * such configuration is forbidden in BGP and not supported in
1056 * rt_notify_accepted().
1058 * The condition is needed to eliminate spurious announcements where
1059 * both old and new routes are not valid (so they are NULL).
1062 rt_notify_accepted(c
, net
, new, old
, 0);
1066 rt_notify_merged(c
, net
, new, old
, new_best
, old_best
, 0);
1073 rte_validate(rte
*e
)
1078 if (!net_validate(n
->n
.addr
))
1080 log(L_WARN
"Ignoring bogus prefix %N received via %s",
1081 n
->n
.addr
, e
->sender
->proto
->name
);
1085 /* FIXME: better handling different nettypes */
1086 c
= !net_is_flow(n
->n
.addr
) ?
1087 net_classify(n
->n
.addr
): (IADDR_HOST
| SCOPE_UNIVERSE
);
1088 if ((c
< 0) || !(c
& IADDR_HOST
) || ((c
& IADDR_SCOPE_MASK
) <= SCOPE_LINK
))
1090 log(L_WARN
"Ignoring bogus route %N received via %s",
1091 n
->n
.addr
, e
->sender
->proto
->name
);
1095 if (net_type_match(n
->n
.addr
, NB_DEST
) == !e
->attrs
->dest
)
1097 /* Exception for flowspec that failed validation */
1098 if (net_is_flow(n
->n
.addr
) && (e
->attrs
->dest
== RTD_UNREACHABLE
))
1101 log(L_WARN
"Ignoring route %N with invalid dest %d received via %s",
1102 n
->n
.addr
, e
->attrs
->dest
, e
->sender
->proto
->name
);
1106 if ((e
->attrs
->dest
== RTD_UNICAST
) && !nexthop_is_sorted(&(e
->attrs
->nh
)))
1108 log(L_WARN
"Ignoring unsorted multipath route %N received via %s",
1109 n
->n
.addr
, e
->sender
->proto
->name
);
1117 * rte_free - delete a &rte
1118 * @e: &rte to be deleted
1120 * rte_free() deletes the given &rte from the routing table it's linked to.
1125 rt_unlock_source(e
->src
);
1126 if (rta_is_cached(e
->attrs
))
1132 rte_free_quick(rte
*e
)
1134 rt_unlock_source(e
->src
);
1140 rte_same(rte
*x
, rte
*y
)
1142 /* rte.flags / rte.pflags are not checked, as they are internal to rtable */
1144 x
->attrs
== y
->attrs
&&
1146 rte_is_filtered(x
) == rte_is_filtered(y
);
1149 static inline int rte_is_ok(rte
*e
) { return e
&& !rte_is_filtered(e
); }
1152 rte_recalculate(struct channel
*c
, net
*net
, rte
*new, struct rte_src
*src
)
1154 struct proto
*p
= c
->proto
;
1155 struct rtable
*table
= c
->table
;
1156 struct proto_stats
*stats
= &c
->stats
;
1157 static struct tbf rl_pipe
= TBF_DEFAULT_LOG_LIMITS
;
1158 rte
*before_old
= NULL
;
1159 rte
*old_best
= net
->routes
;
1163 k
= &net
->routes
; /* Find and remove original route from the same protocol */
1166 if (old
->src
== src
)
1168 /* If there is the same route in the routing table but from
1169 * a different sender, then there are two paths from the
1170 * source protocol to this routing table through transparent
1171 * pipes, which is not allowed.
1173 * We log that and ignore the route. If it is withdraw, we
1174 * ignore it completely (there might be 'spurious withdraws',
1175 * see FIXME in do_rte_announce())
1177 if (old
->sender
->proto
!= p
)
1181 log_rl(&rl_pipe
, L_ERR
"Pipe collision detected when sending %N to table %s",
1182 net
->n
.addr
, table
->name
);
1183 rte_free_quick(new);
1188 if (new && rte_same(old
, new))
1190 /* No changes, ignore the new route and refresh the old one */
1192 old
->flags
&= ~(REF_STALE
| REF_DISCARD
| REF_MODIFY
);
1194 if (!rte_is_filtered(new))
1196 stats
->imp_updates_ignored
++;
1197 rte_trace_in(D_ROUTES
, c
, new, "ignored");
1200 rte_free_quick(new);
1211 /* Save the last accessed position */
1219 stats
->imp_withdraws_ignored
++;
1223 int new_ok
= rte_is_ok(new);
1224 int old_ok
= rte_is_ok(old
);
1226 struct channel_limit
*l
= &c
->rx_limit
;
1227 if (l
->action
&& !old
&& new && !c
->in_table
)
1229 u32 all_routes
= stats
->imp_routes
+ stats
->filt_routes
;
1231 if (all_routes
>= l
->limit
)
1232 channel_notify_limit(c
, l
, PLD_RX
, all_routes
);
1234 if (l
->state
== PLS_BLOCKED
)
1236 /* In receive limit the situation is simple, old is NULL so
1237 we just free new and exit like nothing happened */
1239 stats
->imp_updates_ignored
++;
1240 rte_trace_in(D_FILTERS
, c
, new, "ignored [limit]");
1241 rte_free_quick(new);
1247 if (l
->action
&& !old_ok
&& new_ok
)
1249 if (stats
->imp_routes
>= l
->limit
)
1250 channel_notify_limit(c
, l
, PLD_IN
, stats
->imp_routes
);
1252 if (l
->state
== PLS_BLOCKED
)
1254 /* In import limit the situation is more complicated. We
1255 shouldn't just drop the route, we should handle it like
1256 it was filtered. We also have to continue the route
1257 processing if old or new is non-NULL, but we should exit
1258 if both are NULL as this case is probably assumed to be
1261 stats
->imp_updates_ignored
++;
1262 rte_trace_in(D_FILTERS
, c
, new, "ignored [limit]");
1264 if (c
->in_keep_filtered
)
1265 new->flags
|= REF_FILTERED
;
1267 { rte_free_quick(new); new = NULL
; }
1269 /* Note that old && !new could be possible when
1270 c->in_keep_filtered changed in the recent past. */
1281 stats
->imp_updates_accepted
++;
1283 stats
->imp_withdraws_accepted
++;
1285 stats
->imp_withdraws_ignored
++;
1287 if (old_ok
|| new_ok
)
1288 table
->last_rt_change
= current_time();
1293 rte_is_filtered(new) ? stats
->filt_routes
++ : stats
->imp_routes
++;
1295 rte_is_filtered(old
) ? stats
->filt_routes
-- : stats
->imp_routes
--;
1297 if (table
->config
->sorted
)
1299 /* If routes are sorted, just insert new route to appropriate position */
1302 if (before_old
&& !rte_better(new, before_old
))
1303 k
= &before_old
->next
;
1307 for (; *k
; k
=&(*k
)->next
)
1308 if (rte_better(new, *k
))
1319 /* If routes are not sorted, find the best route and move it on
1320 the first position. There are several optimized cases. */
1322 if (src
->proto
->rte_recalculate
&& src
->proto
->rte_recalculate(table
, net
, new, old
, old_best
))
1323 goto do_recalculate
;
1325 if (new && rte_better(new, old_best
))
1327 /* The first case - the new route is cleary optimal,
1328 we link it at the first position */
1330 new->next
= net
->routes
;
1335 else if (old
== old_best
)
1337 /* The second case - the old best route disappeared, we add the
1338 new route (if we have any) to the list (we don't care about
1339 position) and then we elect the new optimal route and relink
1340 that route at the first position and announce it. New optimal
1341 route might be NULL if there is no more routes */
1344 /* Add the new route to the list */
1353 /* Find a new optimal route (if there is any) */
1356 rte
**bp
= &net
->routes
;
1357 for (k
=&(*bp
)->next
; *k
; k
=&(*k
)->next
)
1358 if (rte_better(*k
, *bp
))
1364 best
->next
= net
->routes
;
1370 /* The third case - the new route is not better than the old
1371 best route (therefore old_best != NULL) and the old best
1372 route was not removed (therefore old_best == net->routes).
1373 We just link the new route to the old/last position. */
1380 /* The fourth (empty) case - suboptimal route was removed, nothing to do */
1385 new->lastmod
= current_time();
1389 new->id
= hmap_first_zero(&table
->id_map
);
1390 hmap_set(&table
->id_map
, new->id
);
1396 /* Log the route change */
1397 if ((c
->debug
& D_ROUTES
) || (p
->debug
& D_ROUTES
))
1400 rte_trace(c
, new, '>', new == net
->routes
? "added [best]" : "added");
1403 if (old
!= old_best
)
1404 rte_trace(c
, old
, '>', "removed");
1405 else if (rte_is_ok(net
->routes
))
1406 rte_trace(c
, old
, '>', "removed [replaced]");
1408 rte_trace(c
, old
, '>', "removed [sole]");
1412 /* Propagate the route change */
1413 rte_announce(table
, RA_UNDEF
, net
, new, old
, net
->routes
, old_best
);
1416 (table
->gc_counter
++ >= table
->config
->gc_threshold
))
1417 rt_kick_prune_timer(table
);
1419 if (old_ok
&& p
->rte_remove
)
1420 p
->rte_remove(net
, old
);
1421 if (new_ok
&& p
->rte_insert
)
1422 p
->rte_insert(net
, new);
1427 hmap_clear(&table
->id_map
, old
->id
);
1429 rte_free_quick(old
);
1433 static int rte_update_nest_cnt
; /* Nesting counter to allow recursive updates */
1436 rte_update_lock(void)
1438 rte_update_nest_cnt
++;
1442 rte_update_unlock(void)
1444 if (!--rte_update_nest_cnt
)
1445 lp_flush(rte_update_pool
);
1449 * rte_update - enter a new update to a routing table
1450 * @table: table to be updated
1451 * @c: channel doing the update
1452 * @net: network node
1453 * @p: protocol submitting the update
1454 * @src: protocol originating the update
1455 * @new: a &rte representing the new route or %NULL for route removal.
1457 * This function is called by the routing protocols whenever they discover
1458 * a new route or wish to update/remove an existing route. The right announcement
1459 * sequence is to build route attributes first (either un-cached with @aflags set
1460 * to zero or a cached one using rta_lookup(); in this case please note that
1461 * you need to increase the use count of the attributes yourself by calling
1462 * rta_clone()), call rte_get_temp() to obtain a temporary &rte, fill in all
1463 * the appropriate data and finally submit the new &rte by calling rte_update().
1465 * @src specifies the protocol that originally created the route and the meaning
1466 * of protocol-dependent data of @new. If @new is not %NULL, @src have to be the
1467 * same value as @new->attrs->proto. @p specifies the protocol that called
1468 * rte_update(). In most cases it is the same protocol as @src. rte_update()
1469 * stores @p in @new->sender;
1471 * When rte_update() gets any route, it automatically validates it (checks,
1472 * whether the network and next hop address are valid IP addresses and also
1473 * whether a normal routing protocol doesn't try to smuggle a host or link
1474 * scope route to the table), converts all protocol dependent attributes stored
1475 * in the &rte to temporary extended attributes, consults import filters of the
1476 * protocol to see if the route should be accepted and/or its attributes modified,
1477 * stores the temporary attributes back to the &rte.
1479 * Now, having a "public" version of the route, we
1480 * automatically find any old route defined by the protocol @src
1481 * for network @n, replace it by the new one (or removing it if @new is %NULL),
1482 * recalculate the optimal route for this destination and finally broadcast
1483 * the change (if any) to all routing protocols by calling rte_announce().
1485 * All memory used for attribute lists and other temporary allocations is taken
1486 * from a special linear pool @rte_update_pool and freed when rte_update()
1491 rte_update2(struct channel
*c
, const net_addr
*n
, rte
*new, struct rte_src
*src
)
1493 struct proto
*p
= c
->proto
;
1494 struct proto_stats
*stats
= &c
->stats
;
1495 const struct filter
*filter
= c
->in_filter
;
1496 struct mpls_fec
*fec
= NULL
;
1499 ASSERT(c
->channel_state
== CS_UP
);
1504 /* Create a temporary table node */
1505 nn
= alloca(sizeof(net
) + n
->length
);
1506 memset(nn
, 0, sizeof(net
) + n
->length
);
1507 net_copy(nn
->n
.addr
, n
);
1512 stats
->imp_updates_received
++;
1513 if (!rte_validate(new))
1515 rte_trace_in(D_FILTERS
, c
, new, "invalid");
1516 stats
->imp_updates_invalid
++;
1520 if (filter
== FILTER_REJECT
)
1522 stats
->imp_updates_filtered
++;
1523 rte_trace_in(D_FILTERS
, c
, new, "filtered out");
1525 if (! c
->in_keep_filtered
)
1528 /* new is a private copy, i could modify it */
1529 new->flags
|= REF_FILTERED
;
1533 int fr
= f_run(filter
, &new, rte_update_pool
, 0);
1536 stats
->imp_updates_filtered
++;
1537 rte_trace_in(D_FILTERS
, c
, new, "filtered out");
1539 if (! c
->in_keep_filtered
)
1542 new->flags
|= REF_FILTERED
;
1548 if (mpls_handle_rte(p
->mpls_map
, n
, new, rte_update_pool
, &fec
) < 0)
1550 rte_trace_in(D_FILTERS
, c
, new, "invalid");
1551 stats
->imp_updates_invalid
++;
1556 if (!rta_is_cached(new->attrs
)) /* Need to copy attributes */
1557 new->attrs
= rta_lookup(new->attrs
);
1558 new->flags
|= REF_COW
;
1560 /* Use the actual struct network, not the dummy one */
1561 nn
= net_get(c
->table
, n
);
1566 stats
->imp_withdraws_received
++;
1568 if (!(nn
= net_find(c
->table
, n
)) || !src
)
1570 stats
->imp_withdraws_ignored
++;
1571 rte_update_unlock();
1577 /* And recalculate the best route */
1578 rte_recalculate(c
, nn
, new, src
);
1581 mpls_handle_rte_cleanup(p
->mpls_map
, &fec
);
1583 rte_update_unlock();
1589 if (nn
= net_find(c
->table
, n
))
1592 rte_update_unlock();
1595 /* Independent call to rte_announce(), used from next hop
1596 recalculation, outside of rte_update(). new must be non-NULL */
1598 rte_announce_i(rtable
*tab
, uint type
, net
*net
, rte
*new, rte
*old
,
1599 rte
*new_best
, rte
*old_best
)
1602 rte_announce(tab
, type
, net
, new, old
, new_best
, old_best
);
1603 rte_update_unlock();
1607 rte_discard(rte
*old
) /* Non-filtered route deletion, used during garbage collection */
1610 rte_recalculate(old
->sender
, old
->net
, NULL
, old
->src
);
1611 rte_update_unlock();
1614 /* Modify existing route by protocol hook, used for long-lived graceful restart */
1616 rte_modify(rte
*old
)
1620 rte
*new = old
->sender
->proto
->rte_modify(old
, rte_update_pool
);
1625 if (!rta_is_cached(new->attrs
))
1626 new->attrs
= rta_lookup(new->attrs
);
1627 new->flags
= (old
->flags
& ~REF_MODIFY
) | REF_COW
;
1630 rte_recalculate(old
->sender
, old
->net
, new, old
->src
);
1633 rte_update_unlock();
1636 /* Check rtable for best route to given net whether it would be exported do p */
1638 rt_examine(rtable
*t
, net_addr
*a
, struct channel
*c
, const struct filter
*filter
)
1640 struct proto
*p
= c
->proto
;
1641 net
*n
= net_find(t
, a
);
1642 rte
*rt
= n
? n
->routes
: NULL
;
1644 if (!rte_is_valid(rt
))
1649 /* Rest is stripped down export_filter() */
1650 int v
= p
->preexport
? p
->preexport(c
, rt
) : 0;
1651 if (v
== RIC_PROCESS
)
1652 v
= (f_run(filter
, &rt
, rte_update_pool
, FF_SILENT
) <= F_ACCEPT
);
1654 /* Discard temporary rte */
1655 if (rt
!= n
->routes
)
1658 rte_update_unlock();
1665 * rt_refresh_begin - start a refresh cycle
1666 * @t: related routing table
1667 * @c related channel
1669 * This function starts a refresh cycle for given routing table and announce
1670 * hook. The refresh cycle is a sequence where the protocol sends all its valid
1671 * routes to the routing table (by rte_update()). After that, all protocol
1672 * routes (more precisely routes with @c as @sender) not sent during the
1673 * refresh cycle but still in the table from the past are pruned. This is
1674 * implemented by marking all related routes as stale by REF_STALE flag in
1675 * rt_refresh_begin(), then marking all related stale routes with REF_DISCARD
1676 * flag in rt_refresh_end() and then removing such routes in the prune loop.
1679 rt_refresh_begin(rtable
*t
, struct channel
*c
)
1681 if (c
->debug
& D_EVENTS
)
1682 log(L_TRACE
"%s.%s: Route refresh begin", c
->proto
->name
, c
->name
);
1684 FIB_WALK(&t
->fib
, net
, n
)
1687 for (e
= n
->routes
; e
; e
= e
->next
)
1689 e
->flags
|= REF_STALE
;
1695 * rt_refresh_end - end a refresh cycle
1696 * @t: related routing table
1697 * @c: related channel
1699 * This function ends a refresh cycle for given routing table and announce
1700 * hook. See rt_refresh_begin() for description of refresh cycles.
1703 rt_refresh_end(rtable
*t
, struct channel
*c
)
1705 if (c
->debug
& D_EVENTS
)
1706 log(L_TRACE
"%s.%s: Route refresh end", c
->proto
->name
, c
->name
);
1710 FIB_WALK(&t
->fib
, net
, n
)
1713 for (e
= n
->routes
; e
; e
= e
->next
)
1714 if ((e
->sender
== c
) && (e
->flags
& REF_STALE
))
1716 e
->flags
|= REF_DISCARD
;
1723 rt_schedule_prune(t
);
1727 rt_modify_stale(rtable
*t
, struct channel
*c
)
1731 FIB_WALK(&t
->fib
, net
, n
)
1734 for (e
= n
->routes
; e
; e
= e
->next
)
1735 if ((e
->sender
== c
) && (e
->flags
& REF_STALE
) && !(e
->flags
& REF_FILTERED
))
1737 e
->flags
|= REF_MODIFY
;
1744 rt_schedule_prune(t
);
1748 * rte_dump - dump a route
1749 * @e: &rte to be dumped
1751 * This functions dumps contents of a &rte to debug output.
1754 rte_dump(struct dump_request
*dreq
, rte
*e
)
1757 RDUMP("%-1N ", n
->n
.addr
);
1758 RDUMP("PF=%02x ", e
->pflags
);
1759 rta_dump(dreq
, e
->attrs
);
1764 * rt_dump - dump a routing table
1765 * @t: routing table to be dumped
1767 * This function dumps contents of a given routing table to debug output.
1770 rt_dump(struct dump_request
*dreq
, rtable
*t
)
1772 RDUMP("Dump of routing table <%s>\n", t
->name
);
1776 FIB_WALK(&t
->fib
, net
, n
)
1779 for(e
=n
->routes
; e
; e
=e
->next
)
1787 * rt_dump_all - dump all routing tables
1789 * This function dumps contents of all routing tables to debug output.
1792 rt_dump_all(struct dump_request
*dreq
)
1797 WALK_LIST2(t
, n
, routing_tables
, n
)
1802 rt_schedule_hcu(rtable
*tab
)
1804 if (tab
->hcu_scheduled
)
1807 tab
->hcu_scheduled
= 1;
1808 ev_schedule(tab
->rt_event
);
1812 rt_schedule_nhu(rtable
*tab
)
1814 if (tab
->nhu_state
== NHU_CLEAN
)
1815 ev_schedule(tab
->rt_event
);
1818 * NHU_CLEAN -> NHU_SCHEDULED
1819 * NHU_RUNNING -> NHU_DIRTY
1821 tab
->nhu_state
|= NHU_SCHEDULED
;
1825 rt_schedule_prune(rtable
*tab
)
1827 if (tab
->prune_state
== 0)
1828 ev_schedule(tab
->rt_event
);
1830 /* state change 0->1, 2->3 */
1831 tab
->prune_state
|= 1;
1842 if (tab
->hcu_scheduled
)
1843 rt_update_hostcache(tab
);
1846 rt_next_hop_update(tab
);
1848 if (tab
->prune_state
)
1849 rt_prune_table(tab
);
1851 rt_unlock_table(tab
);
1856 rt_prune_timer(timer
*t
)
1858 rtable
*tab
= t
->data
;
1860 if (tab
->gc_counter
>= tab
->config
->gc_threshold
)
1861 rt_schedule_prune(tab
);
1865 rt_kick_prune_timer(rtable
*tab
)
1867 /* Return if prune is already scheduled */
1868 if (tm_active(tab
->prune_timer
) || (tab
->prune_state
& 1))
1871 /* Randomize GC period to +/- 50% */
1872 btime gc_period
= tab
->config
->gc_period
;
1873 gc_period
= (gc_period
/ 2) + (random_u32() % (uint
) gc_period
);
1874 tm_start(tab
->prune_timer
, gc_period
);
1879 rt_settled_time(rtable
*tab
)
1881 ASSUME(tab
->base_settle_time
!= 0);
1883 return MIN(tab
->last_rt_change
+ tab
->config
->min_settle_time
,
1884 tab
->base_settle_time
+ tab
->config
->max_settle_time
);
1888 rt_settle_timer(timer
*t
)
1890 rtable
*tab
= t
->data
;
1892 if (!tab
->base_settle_time
)
1895 btime settled_time
= rt_settled_time(tab
);
1896 if (current_time() < settled_time
)
1898 tm_set(tab
->settle_timer
, settled_time
);
1903 tab
->base_settle_time
= 0;
1905 struct rt_subscription
*s
;
1906 WALK_LIST(s
, tab
->subscribers
)
1911 rt_kick_settle_timer(rtable
*tab
)
1913 tab
->base_settle_time
= current_time();
1915 if (!tab
->settle_timer
)
1916 tab
->settle_timer
= tm_new_init(tab
->rp
, rt_settle_timer
, tab
, 0, 0);
1918 if (!tm_active(tab
->settle_timer
))
1919 tm_set(tab
->settle_timer
, rt_settled_time(tab
));
1923 rt_schedule_notify(rtable
*tab
)
1925 if (EMPTY_LIST(tab
->subscribers
))
1928 if (tab
->base_settle_time
)
1931 rt_kick_settle_timer(tab
);
1935 rt_subscribe(rtable
*tab
, struct rt_subscription
*s
)
1939 add_tail(&tab
->subscribers
, &s
->n
);
1943 rt_unsubscribe(struct rt_subscription
*s
)
1946 rt_unlock_table(s
->tab
);
1949 static struct rt_flowspec_link
*
1950 rt_flowspec_find_link(rtable
*src
, rtable
*dst
)
1952 struct rt_flowspec_link
*ln
;
1953 WALK_LIST(ln
, src
->flowspec_links
)
1954 if ((ln
->src
== src
) && (ln
->dst
== dst
))
1961 rt_flowspec_link(rtable
*src
, rtable
*dst
)
1963 ASSERT(rt_is_ip(src
));
1964 ASSERT(rt_is_flow(dst
));
1966 struct rt_flowspec_link
*ln
= rt_flowspec_find_link(src
, dst
);
1973 ln
= mb_allocz(src
->rp
, sizeof(struct rt_flowspec_link
));
1976 add_tail(&src
->flowspec_links
, &ln
->n
);
1983 rt_flowspec_unlink(rtable
*src
, rtable
*dst
)
1985 struct rt_flowspec_link
*ln
= rt_flowspec_find_link(src
, dst
);
1987 ASSERT(ln
&& (ln
->uc
> 0));
1996 rt_unlock_table(src
);
1997 rt_unlock_table(dst
);
2002 rt_flowspec_notify(rtable
*src
, net
*net
)
2004 /* Only IP tables are src links */
2005 ASSERT(rt_is_ip(src
));
2007 struct rt_flowspec_link
*ln
;
2008 WALK_LIST(ln
, src
->flowspec_links
)
2010 rtable
*dst
= ln
->dst
;
2011 ASSERT(rt_is_flow(dst
));
2013 /* No need to inspect it further if recalculation is already active */
2014 if ((dst
->nhu_state
== NHU_SCHEDULED
) || (dst
->nhu_state
== NHU_DIRTY
))
2017 if (trie_match_net(dst
->flowspec_trie
, net
->n
.addr
))
2018 rt_schedule_nhu(dst
);
2023 rt_flowspec_reset_trie(rtable
*tab
)
2025 linpool
*lp
= tab
->flowspec_trie
->lp
;
2026 int ipv4
= tab
->flowspec_trie
->ipv4
;
2029 tab
->flowspec_trie
= f_new_trie(lp
, 0);
2030 tab
->flowspec_trie
->ipv4
= ipv4
;
2034 rt_free(resource
*_r
)
2036 rtable
*r
= (rtable
*) _r
;
2038 DBG("Deleting routing table %s\n", r
->name
);
2039 ASSERT_DIE(r
->use_count
== 0);
2044 r
->config
->table
= NULL
;
2048 rt_free_hostcache(r
);
2050 /* Freed automagically by the resource pool
2052 hmap_free(&r->id_map);
2054 rfree(r->settle_timer);
2060 rt_res_dump(struct dump_request
*dreq
, resource
*_r
)
2062 rtable
*r
= (rtable
*) _r
;
2063 RDUMP("name \"%s\", addr_type=%s, rt_count=%u, use_count=%d\n",
2064 r
->name
, net_label
[r
->addr_type
], r
->rt_count
, r
->use_count
);
2067 static struct resclass rt_class
= {
2068 .name
= "Routing table",
2069 .size
= sizeof(struct rtable
),
2071 .dump
= rt_res_dump
,
2077 rt_setup(pool
*pp
, struct rtable_config
*cf
)
2079 pool
*p
= rp_newf(pp
, "Routing table %s", cf
->name
);
2081 rtable
*t
= ralloc(p
, &rt_class
);
2086 t
->addr_type
= cf
->addr_type
;
2087 t
->debug
= cf
->debug
;
2089 fib_init(&t
->fib
, p
, t
->addr_type
, sizeof(net
), OFFSETOF(net
, n
), 0, NULL
);
2093 t
->trie
= f_new_trie(lp_new_default(p
), 0);
2094 t
->trie
->ipv4
= net_val_match(t
->addr_type
, NB_IP4
| NB_VPN4
| NB_ROA4
);
2096 t
->fib
.init
= net_init_with_trie
;
2099 init_list(&t
->channels
);
2100 init_list(&t
->flowspec_links
);
2101 init_list(&t
->subscribers
);
2103 hmap_init(&t
->id_map
, p
, 1024);
2104 hmap_set(&t
->id_map
, 0);
2106 if (!(t
->internal
= cf
->internal
))
2108 t
->rt_event
= ev_new_init(p
, rt_event
, t
);
2109 t
->prune_timer
= tm_new_init(p
, rt_prune_timer
, t
, 0, 0);
2110 t
->last_rt_change
= t
->gc_time
= current_time();
2114 t
->flowspec_trie
= f_new_trie(lp_new_default(p
), 0);
2115 t
->flowspec_trie
->ipv4
= (t
->addr_type
== NET_FLOW4
);
2123 * rt_init - initialize routing tables
2125 * This function is called during BIRD startup. It initializes the
2126 * routing table module.
2132 rt_table_pool
= rp_new(&root_pool
, "Routing tables");
2133 rte_update_pool
= lp_new_default(rt_table_pool
);
2134 rte_slab
= sl_new(rt_table_pool
, sizeof(rte
));
2135 init_list(&routing_tables
);
2140 * rt_prune_table - prune a routing table
2142 * The prune loop scans routing tables and removes routes belonging to flushing
2143 * protocols, discarded routes and also stale network entries. It is called from
2144 * rt_event(). The event is rescheduled if the current iteration do not finish
2145 * the table. The pruning is directed by the prune state (@prune_state),
2146 * specifying whether the prune cycle is scheduled or running, and there
2147 * is also a persistent pruning iterator (@prune_fit).
2149 * The prune loop is used also for channel flushing. For this purpose, the
2150 * channels to flush are marked before the iteration and notified after the
2154 rt_prune_table(rtable
*tab
)
2156 struct fib_iterator
*fit
= &tab
->prune_fit
;
2162 DBG("Pruning route table %s\n", tab
->name
);
2164 fib_check(&tab
->fib
);
2167 if (tab
->prune_state
== 0)
2170 if (tab
->prune_state
== 1)
2172 /* Mark channels to flush */
2173 WALK_LIST2(c
, n
, tab
->channels
, table_node
)
2174 if (c
->channel_state
== CS_FLUSHING
)
2175 c
->flush_active
= 1;
2177 FIB_ITERATE_INIT(fit
, &tab
->fib
);
2178 tab
->prune_state
= 2;
2180 tab
->gc_counter
= 0;
2181 tab
->gc_time
= current_time();
2183 if (tab
->prune_trie
)
2185 /* Init prefix trie pruning */
2186 tab
->trie_new
= f_new_trie(lp_new_default(tab
->rp
), 0);
2187 tab
->trie_new
->ipv4
= tab
->trie
->ipv4
;
2192 FIB_ITERATE_START(&tab
->fib
, fit
, net
, n
)
2199 FIB_ITERATE_PUT(fit
);
2200 ev_schedule(tab
->rt_event
);
2204 for (e
=n
->routes
; e
; e
=e
->next
)
2206 if (e
->sender
->flush_active
|| (e
->flags
& REF_DISCARD
))
2214 if (e
->flags
& REF_MODIFY
)
2223 if (!n
->routes
) /* Orphaned FIB entry */
2225 FIB_ITERATE_PUT(fit
);
2226 fib_delete(&tab
->fib
, n
);
2232 trie_add_prefix(tab
->trie_new
, n
->n
.addr
, n
->n
.addr
->pxlen
, n
->n
.addr
->pxlen
);
2239 fib_check(&tab
->fib
);
2242 /* state change 2->0, 3->1 */
2243 tab
->prune_state
&= 1;
2247 /* Finish prefix trie pruning */
2249 if (!tab
->trie_lock_count
)
2251 rfree(tab
->trie
->lp
);
2255 ASSERT(!tab
->trie_old
);
2256 tab
->trie_old
= tab
->trie
;
2257 tab
->trie_old_lock_count
= tab
->trie_lock_count
;
2258 tab
->trie_lock_count
= 0;
2261 tab
->trie
= tab
->trie_new
;
2262 tab
->trie_new
= NULL
;
2263 tab
->prune_trie
= 0;
2267 /* Schedule prefix trie pruning */
2268 if (tab
->trie
&& !tab
->trie_old
&& (tab
->trie
->prefix_count
> (2 * tab
->fib
.entries
)))
2270 /* state change 0->1, 2->3 */
2271 tab
->prune_state
|= 1;
2272 tab
->prune_trie
= 1;
2276 if (tab
->prune_state
> 0)
2277 ev_schedule(tab
->rt_event
);
2279 /* FIXME: This should be handled in a better way */
2282 /* Close flushed channels */
2283 WALK_LIST2_DELSAFE(c
, n
, x
, tab
->channels
, table_node
)
2284 if (c
->flush_active
)
2286 c
->flush_active
= 0;
2287 channel_set_state(c
, CS_DOWN
);
2294 * rt_lock_trie - lock a prefix trie of a routing table
2295 * @tab: routing table with prefix trie to be locked
2297 * The prune loop may rebuild the prefix trie and invalidate f_trie_walk_state
2298 * structures. Therefore, asynchronous walks should lock the prefix trie using
2299 * this function. That allows the prune loop to rebuild the trie, but postpones
2300 * its freeing until all walks are done (unlocked by rt_unlock_trie()).
2302 * Return a current trie that will be locked, the value should be passed back to
2303 * rt_unlock_trie() for unlocking.
2307 rt_lock_trie(rtable
*tab
)
2311 tab
->trie_lock_count
++;
2316 * rt_unlock_trie - unlock a prefix trie of a routing table
2317 * @tab: routing table with prefix trie to be locked
2318 * @trie: value returned by matching rt_lock_trie()
2320 * Done for trie locked by rt_lock_trie() after walk over the trie is done.
2321 * It may free the trie and schedule next trie pruning.
2324 rt_unlock_trie(rtable
*tab
, struct f_trie
*trie
)
2328 if (trie
== tab
->trie
)
2330 /* Unlock the current prefix trie */
2331 ASSERT(tab
->trie_lock_count
);
2332 tab
->trie_lock_count
--;
2334 else if (trie
== tab
->trie_old
)
2336 /* Unlock the old prefix trie */
2337 ASSERT(tab
->trie_old_lock_count
);
2338 tab
->trie_old_lock_count
--;
2340 /* Free old prefix trie that is no longer needed */
2341 if (!tab
->trie_old_lock_count
)
2343 rfree(tab
->trie_old
->lp
);
2344 tab
->trie_old
= NULL
;
2346 /* Kick prefix trie pruning that was postponed */
2347 if (tab
->trie
&& (tab
->trie
->prefix_count
> (2 * tab
->fib
.entries
)))
2349 tab
->prune_trie
= 1;
2350 rt_schedule_prune(tab
);
2355 log(L_BUG
"Invalid arg to rt_unlock_trie()");
2360 rt_preconfig(struct config
*c
)
2362 init_list(&c
->tables
);
2364 rt_new_table(cf_get_symbol(c
, "master4"), NET_IP4
);
2365 rt_new_table(cf_get_symbol(c
, "master6"), NET_IP6
);
2369 rt_postconfig(struct config
*c
)
2371 uint num_tables
= list_length(&c
->tables
);
2372 btime def_gc_period
= 400 MS
* num_tables
;
2373 def_gc_period
= MAX(def_gc_period
, 10 S
);
2374 def_gc_period
= MIN(def_gc_period
, 600 S
);
2376 struct rtable_config
*rc
;
2377 WALK_LIST(rc
, c
->tables
)
2378 if (rc
->gc_period
== (uint
) -1)
2379 rc
->gc_period
= (uint
) def_gc_period
;
2384 * Some functions for handing internal next hop updates
2385 * triggered by rt_schedule_nhu().
2389 rta_apply_hostentry(rta
*a
, struct hostentry
*he
, mpls_label_stack
*mls
)
2393 a
->igp_metric
= he
->igp_metric
;
2395 if (a
->dest
!= RTD_UNICAST
)
2399 a
->nh
= (struct nexthop
) {};
2401 { /* Store the label stack for later changes */
2402 a
->nh
.labels_orig
= a
->nh
.labels
= mls
->len
;
2403 memcpy(a
->nh
.label
, mls
->stack
, mls
->len
* sizeof(u32
));
2408 if (((!mls
) || (!mls
->len
)) && he
->nexthop_linkable
)
2409 { /* Just link the nexthop chain, no label append happens. */
2410 memcpy(&(a
->nh
), &(he
->src
->nh
), nexthop_size(&(he
->src
->nh
)));
2414 struct nexthop
*nhp
= NULL
, *nhr
= NULL
;
2415 int skip_nexthop
= 0;
2417 for (struct nexthop
*nh
= &(he
->src
->nh
); nh
; nh
= nh
->next
)
2424 nhp
= (nhp
? (nhp
->next
= lp_alloc(rte_update_pool
, NEXTHOP_MAX_SIZE
)) : &(a
->nh
));
2427 memset(nhp
, 0, NEXTHOP_MAX_SIZE
);
2428 nhp
->iface
= nh
->iface
;
2429 nhp
->weight
= nh
->weight
;
2433 nhp
->labels
= nh
->labels
+ mls
->len
;
2434 nhp
->labels_orig
= mls
->len
;
2435 if (nhp
->labels
<= MPLS_MAX_LABEL_STACK
)
2437 memcpy(nhp
->label
, nh
->label
, nh
->labels
* sizeof(u32
)); /* First the hostentry labels */
2438 memcpy(&(nhp
->label
[nh
->labels
]), mls
->stack
, mls
->len
* sizeof(u32
)); /* Then the bottom labels */
2442 log(L_WARN
"Sum of label stack sizes %d + %d = %d exceedes allowed maximum (%d)",
2443 nh
->labels
, mls
->len
, nhp
->labels
, MPLS_MAX_LABEL_STACK
);
2448 else if (nh
->labels
)
2450 nhp
->labels
= nh
->labels
;
2451 nhp
->labels_orig
= 0;
2452 memcpy(nhp
->label
, nh
->label
, nh
->labels
* sizeof(u32
));
2455 if (ipa_nonzero(nh
->gw
))
2457 nhp
->gw
= nh
->gw
; /* Router nexthop */
2458 nhp
->flags
|= (nh
->flags
& RNF_ONLINK
);
2460 else if (!(nh
->iface
->flags
& IF_MULTIACCESS
) || (nh
->iface
->flags
& IF_LOOPBACK
))
2461 nhp
->gw
= IPA_NONE
; /* PtP link - no need for nexthop */
2462 else if (ipa_nonzero(he
->link
))
2463 nhp
->gw
= he
->link
; /* Device nexthop with link-local address known */
2465 nhp
->gw
= he
->addr
; /* Device nexthop with link-local address unknown */
2473 a
->dest
= RTD_UNREACHABLE
;
2474 log(L_WARN
"No valid nexthop remaining, setting route unreachable");
static inline int
rta_next_hop_outdated(rta *a)
{
  struct hostentry *he = a->hostentry;

  if (!he)
    return 0;

  if (!he->src)
    return a->dest != RTD_UNREACHABLE;

  return (a->dest != he->dest) || (a->igp_metric != he->igp_metric) ||
    (!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh));
}
static rte *
rt_next_hop_update_rte(rtable *tab UNUSED, rte *old)
{
  if (!rta_next_hop_outdated(old->attrs))
    return NULL;

  rta *a = alloca(RTA_MAX_SIZE);
  memcpy(a, old->attrs, rta_size(old->attrs));

  mpls_label_stack mls = { .len = a->nh.labels_orig };
  memcpy(mls.stack, &a->nh.label[a->nh.labels - mls.len], mls.len * sizeof(u32));

  rta_apply_hostentry(a, old->attrs->hostentry, &mls);
  a->cached = 0;

  rte *e = sl_alloc(rte_slab);
  memcpy(e, old, sizeof(rte));
  e->attrs = rta_lookup(a);
  rt_lock_source(e->src);

  return e;
}
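
/*
 * Illustrative note with assumed numbers (not upstream code): labels_orig
 * records how many bottom-of-stack labels belong to the route itself;
 * the rest came from the hostentry. With a->nh.labels == 3 and
 * a->nh.labels_orig == 1, the route's single label sits in a->nh.label[2],
 * so the memcpy() above rebuilds mls = { .len = 1 } from it, and
 * rta_apply_hostentry() then re-applies the (possibly changed) hostentry
 * labels on top, skipping the nexthop only if the combined stack would
 * exceed MPLS_MAX_LABEL_STACK.
 */
static inline void
example_extract_orig_labels(const rta *a, mpls_label_stack *mls)
{
  mls->len = a->nh.labels_orig;
  memcpy(mls->stack, &a->nh.label[a->nh.labels - mls->len],
         mls->len * sizeof(u32));
}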
#ifdef CONFIG_BGP

static inline int
net_flow_has_dst_prefix(const net_addr *n)
{
  ASSUME(net_is_flow(n));

  if (n->pxlen)
    return 1;

  if (n->type == NET_FLOW4)
  {
    const net_addr_flow4 *n4 = (void *) n;
    return (n4->length > sizeof(net_addr_flow4)) && (n4->data[0] == FLOW_TYPE_DST_PREFIX);
  }
  else
  {
    const net_addr_flow6 *n6 = (void *) n;
    return (n6->length > sizeof(net_addr_flow6)) && (n6->data[0] == FLOW_TYPE_DST_PREFIX);
  }
}
static inline int
rta_as_path_is_empty(rta *a)
{
  eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
  return !e || (as_path_getlen(e->u.ptr) == 0);
}
static u32
rta_get_first_asn(rta *a)
{
  eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
  u32 asn;

  return (e && as_path_get_first_regular(e->u.ptr, &asn)) ? asn : 0;
}
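
/*
 * Illustrative sketch (an assumption about how the helper is meant to be
 * used, not upstream code): for an AS_PATH like "65001 65002 65003" the
 * helper above returns 65001, the neighboring AS that sent the route; an
 * empty path or one starting with an AS_SET yields 0 and therefore fails
 * the origin comparisons in rt_flowspec_check() below.
 */
static inline int
example_same_origin_as(rta *flow_attrs, rta *best_attrs)
{
  u32 a = rta_get_first_asn(flow_attrs);
  u32 b = rta_get_first_asn(best_attrs);
  return a && b && (a == b);	/* both known and equal */
}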
static int
rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, rta *a, int interior)
{
  ASSERT(rt_is_ip(tab_ip));
  ASSERT(rt_is_flow(tab_flow));
  ASSERT(tab_ip->trie);

  /* RFC 8955 6. a) Flowspec has defined dst prefix */
  if (!net_flow_has_dst_prefix(n))
    return 0;

  /* RFC 9117 4.1. Accept when AS_PATH is empty (i.e. locally originated) */
  if (interior && rta_as_path_is_empty(a))
    return 1;

  /* RFC 8955 6. b) Flowspec and its best-match route have the same originator */

  /* Find flowspec dst prefix */
  net_addr dst;
  if (n->type == NET_FLOW4)
    net_fill_ip4(&dst, net4_prefix(n), net4_pxlen(n));
  else
    net_fill_ip6(&dst, net6_prefix(n), net6_pxlen(n));

  /* Find best-match BGP unicast route for flowspec dst prefix */
  net *nb = net_route(tab_ip, &dst);
  rte *rb = nb ? nb->routes : NULL;

  /* Register prefix to trie for tracking further changes */
  int max_pxlen = (n->type == NET_FLOW4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH;
  trie_add_prefix(tab_flow->flowspec_trie, &dst, (nb ? nb->n.addr->pxlen : 0), max_pxlen);

  /* No best-match BGP route -> no flowspec */
  if (!rb || (rb->attrs->source != RTS_BGP))
    return 0;

  /* Find ORIGINATOR_ID values */
  u32 orig_a = ea_get_int(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);
  u32 orig_b = ea_get_int(rb->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);

  /* Originator is either ORIGINATOR_ID (if present), or BGP neighbor address (if not) */
  if ((orig_a != orig_b) || (!orig_a && !orig_b && !ipa_equal(a->from, rb->attrs->from)))
    return 0;

  /* Find ASN of the best-match route, for use in next checks */
  u32 asn_b = rta_get_first_asn(rb->attrs);
  if (!asn_b)
    return 0;

  /* RFC 9117 4.2. For EBGP, flowspec and its best-match route are from the same AS */
  if (!interior && (rta_get_first_asn(a) != asn_b))
    return 0;

  /* RFC 8955 6. c) More-specific routes are from the same AS as the best-match route */
  TRIE_WALK(tab_ip->trie, subnet, &dst)
  {
    net *nc = net_find_valid(tab_ip, &subnet);
    if (!nc)
      continue;

    rte *rc = nc->routes;
    if (rc->attrs->source != RTS_BGP)
      return 0;

    if (rta_get_first_asn(rc->attrs) != asn_b)
      return 0;
  }
  TRIE_WALK_END;

  return 1;
}

#endif /* CONFIG_BGP */
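
/*
 * Illustrative recap (not upstream code): the checks above boil down to
 * "does the flowspec rule come from whoever legitimately originates the
 * unicast destination prefix". A hedged helper showing the call shape,
 * guarded the same way as the function itself:
 */
#ifdef CONFIG_BGP
static inline int
example_flowspec_valid(rtable *tab_ip, rtable *tab_flow,
                       const net_addr *n, rta *a, int interior)
{
  /* 1 = valid per RFC 8955 sec. 6 and RFC 9117, 0 = invalid */
  return rt_flowspec_check(tab_ip, tab_flow, n, a, interior);
}
#endif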
static rte *
rt_flowspec_update_rte(rtable *tab, rte *r)
{
#ifdef CONFIG_BGP
  if ((r->attrs->source != RTS_BGP) || (r->sender->proto != r->src->proto))
    return NULL;

  struct bgp_channel *bc = (struct bgp_channel *) r->sender;
  if (!bc->base_table)
    return NULL;

  const net_addr *n = r->net->n.addr;
  struct bgp_proto *p = (void *) r->src->proto;
  int valid = rt_flowspec_check(bc->base_table, tab, n, r->attrs, p->is_interior);
  int dest = valid ? RTD_NONE : RTD_UNREACHABLE;

  if (dest == r->attrs->dest)
    return NULL;

  rta *a = alloca(RTA_MAX_SIZE);
  memcpy(a, r->attrs, rta_size(r->attrs));
  a->dest = dest;
  a->cached = 0;

  rte *new = sl_alloc(rte_slab);
  memcpy(new, r, sizeof(rte));
  new->attrs = rta_lookup(a);
  rt_lock_source(new->src);

  return new;
#else
  return NULL;
#endif
}
static int
rt_next_hop_update_net(rtable *tab, net *n)
{
  rte **k, *e, *new, *old_best, **new_best;
  int count = 0;
  int free_old_best = 0;

  old_best = n->routes;
  if (!old_best)
    return 0;

  for (k = &n->routes; e = *k; k = &e->next)
  {
    if (!net_is_flow(n->n.addr))
      new = rt_next_hop_update_rte(tab, e);
    else
      new = rt_flowspec_update_rte(tab, e);

    if (new)
    {
      *k = new;

      rte_trace_in(D_ROUTES, new->sender, new, "updated");
      rte_announce_i(tab, RA_ANY, n, new, e, NULL, NULL);

      /* Call a pre-comparison hook */
      /* Not really an efficient way to compute this */
      if (e->src->proto->rte_recalculate)
        e->src->proto->rte_recalculate(tab, n, new, e, NULL);

      if (e != old_best)
        rte_free_quick(e);
      else /* Freeing of the old best rte is postponed */
        free_old_best = 1;

      e = new;
      count++;
    }
  }

  if (!count)
    return 0;

  /* Find the new best route */
  new_best = NULL;
  for (k = &n->routes; e = *k; k = &e->next)
  {
    if (!new_best || rte_better(e, *new_best))
      new_best = k;
  }

  /* Relink the new best route to the first position */
  new = *new_best;
  if (new != n->routes)
  {
    *new_best = new->next;
    new->next = n->routes;
    n->routes = new;
  }

  /* Announce the new best route */
  if (new != old_best)
    rte_trace_in(D_ROUTES, new->sender, new, "updated [best]");

  /* Propagate changes */
  rte_announce_i(tab, RA_UNDEF, n, NULL, NULL, n->routes, old_best);

  if (free_old_best)
    rte_free_quick(old_best);

  return count;
}
static void
rt_next_hop_update(rtable *tab)
{
  struct fib_iterator *fit = &tab->nhu_fit;
  int max_feed = 32;

  if (tab->nhu_state == NHU_CLEAN)
    return;

  if (tab->nhu_state == NHU_SCHEDULED)
  {
    FIB_ITERATE_INIT(fit, &tab->fib);
    tab->nhu_state = NHU_RUNNING;

    if (tab->flowspec_trie)
      rt_flowspec_reset_trie(tab);
  }

  FIB_ITERATE_START(&tab->fib, fit, net, n)
  {
    if (max_feed <= 0)
    {
      FIB_ITERATE_PUT(fit);
      ev_schedule(tab->rt_event);
      return;
    }
    max_feed -= rt_next_hop_update_net(tab, n);
  }
  FIB_ITERATE_END;

  /* State change:
   *   NHU_DIRTY   -> NHU_SCHEDULED
   *   NHU_RUNNING -> NHU_CLEAN
   */
  tab->nhu_state &= 1;

  if (tab->nhu_state != NHU_CLEAN)
    ev_schedule(tab->rt_event);
}
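
/*
 * Illustrative sketch of the state mask above (values as defined in
 * nest/route.h, repeated here only for the example): NHU_CLEAN = 0,
 * NHU_SCHEDULED = 1, NHU_RUNNING = 2, NHU_DIRTY = 3. Masking with 1
 * drops the "running" bit, so RUNNING (2) becomes CLEAN (0), while
 * DIRTY (3, i.e. running and re-requested) becomes SCHEDULED (1).
 */
static inline uint
example_nhu_next_state(uint state)
{
  return state & 1;	/* 2 -> 0, 3 -> 1 */
}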
struct rtable_config *
rt_new_table(struct symbol *s, uint addr_type)
{
  /* Hack that allows one to 'redefine' the master table */
  if ((s->class == SYM_TABLE) &&
      (s->table == new_config->def_tables[addr_type]) &&
      ((addr_type == NET_IP4) || (addr_type == NET_IP6)))
    return s->table;

  struct rtable_config *c = cfg_allocz(sizeof(struct rtable_config));

  cf_define_symbol(new_config, s, SYM_TABLE, table, c);
  c->name = s->name;
  c->addr_type = addr_type;
  c->gc_threshold = 1000;
  c->gc_period = (uint) -1;	/* set in rt_postconfig() */
  c->min_settle_time = 1 S;
  c->max_settle_time = 20 S;
  c->debug = new_config->table_default_debug;

  add_tail(&new_config->tables, &c->n);

  /* First table of each type is kept as default */
  if (!new_config->def_tables[addr_type])
    new_config->def_tables[addr_type] = c;

  return c;
}
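
/*
 * Usage sketch (hypothetical helper, mirroring rt_preconfig() above):
 * a `table mytab4;` statement in the configuration effectively reduces
 * to this call once the parser has resolved the symbol. The first table
 * of each network type also becomes the default in def_tables[], which
 * is what makes master4/master6 special.
 */
static inline struct rtable_config *
example_define_table(struct config *c, const char *name, uint addr_type)
{
  return rt_new_table(cf_get_symbol(c, name), addr_type);
}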
/**
 * rt_lock_table - lock a routing table
 * @r: routing table to be locked
 *
 * Lock a routing table, because it's in use by a protocol,
 * preventing it from being freed when it gets undefined in a new
 * configuration.
 */
void
rt_lock_table(rtable *r)
{
  r->use_count++;
}
/**
 * rt_unlock_table - unlock a routing table
 * @r: routing table to be unlocked
 *
 * Unlock a routing table formerly locked by rt_lock_table(),
 * that is, decrease its use count and delete it if it's scheduled
 * for deletion by configuration changes.
 */
void
rt_unlock_table(rtable *r)
{
  if (!--r->use_count && r->deleted)
  {
    struct config *conf = r->deleted;

    /* Delete the routing table by freeing its pool */
    rt_shutdown(r);
    config_del_obstacle(conf);
  }
}
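
/*
 * Usage sketch (illustrative only): every user of a table pairs these
 * calls, locking the table when it starts using it and unlocking when
 * it stops, so a table removed from the configuration survives exactly
 * until its last user disconnects.
 */
static inline void
example_table_usage(rtable *t)
{
  rt_lock_table(t);
  /* ... use the table: lookups, imports, exports ... */
  rt_unlock_table(t);	/* may free the table if it was undefined meanwhile */
}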
static int
rt_reconfigure(rtable *tab, struct rtable_config *new, struct rtable_config *old)
{
  if ((new->addr_type != old->addr_type) ||
      (new->sorted != old->sorted) ||
      (new->trie_used != old->trie_used))
    return 0;

  DBG("\t%s: same\n", new->name);
  new->table = tab;
  tab->name = new->name;
  tab->config = new;
  tab->debug = new->debug;

  return 1;
}
static struct rtable_config *
rt_find_table_config(struct config *cf, char *name)
{
  struct symbol *sym = cf_find_symbol(cf, name);
  return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL;
}
/**
 * rt_commit - commit new routing table configuration
 * @new: new configuration
 * @old: original configuration or %NULL if it's boot time config
 *
 * Scan differences between @old and @new configuration and modify
 * the routing tables according to these changes. If @new defines a
 * previously unknown table, create it; if it omits a table existing
 * in @old, schedule it for deletion (it gets deleted when all protocols
 * disconnect from it by calling rt_unlock_table()); if it exists
 * in both configurations, leave it unchanged.
 */
void
rt_commit(struct config *new, struct config *old)
{
  struct rtable_config *o, *r;

  DBG("rt_commit:\n");
  if (old)
  {
    WALK_LIST(o, old->tables)
    {
      rtable *tab = o->table;
      if (tab->deleted)
        continue;

      r = rt_find_table_config(new, o->name);
      if (r && !new->shutdown && rt_reconfigure(tab, r, o))
        continue;

      DBG("\t%s: deleted\n", o->name);
      tab->deleted = old;
      config_add_obstacle(old);
      rt_lock_table(tab);
      rt_unlock_table(tab);
    }
  }

  WALK_LIST(r, new->tables)
    if (!r->table)
    {
      r->table = rt_setup(rt_table_pool, r);
      DBG("\t%s: created\n", r->name);
      add_tail(&routing_tables, &r->table->n);
    }
  DBG("\tdone\n");
}
static inline void
do_feed_channel(struct channel *c, net *n, rte *e)
{
  rte_update_lock();
  if (c->ra_mode == RA_ACCEPTED)
    rt_notify_accepted(c, n, NULL, NULL, c->refeeding);
  else if (c->ra_mode == RA_MERGED)
    rt_notify_merged(c, n, NULL, NULL, e, e, c->refeeding);
  else
    rt_notify_basic(c, n, e, e, c->refeeding);
  rte_update_unlock();
}
/**
 * rt_feed_channel - advertise all routes to a channel
 * @c: channel to be fed
 *
 * This function performs one pass of advertisement of routes to a channel that
 * is in the ES_FEEDING state. It is called by the protocol code as long as it
 * has something to do. (We avoid transferring all the routes in a single pass
 * in order not to monopolize CPU time.)
 */
int
rt_feed_channel(struct channel *c)
{
  struct fib_iterator *fit = &c->feed_fit;
  int max_feed = 256;

  ASSERT(c->export_state == ES_FEEDING);

  if (!c->feed_active)
  {
    FIB_ITERATE_INIT(fit, &c->table->fib);
    c->feed_active = 1;
  }

  FIB_ITERATE_START(&c->table->fib, fit, net, n)
  {
    rte *e = n->routes;
    if (max_feed <= 0)
    {
      FIB_ITERATE_PUT(fit);
      return 0;
    }

    if ((c->ra_mode == RA_OPTIMAL) ||
        (c->ra_mode == RA_ACCEPTED) ||
        (c->ra_mode == RA_MERGED))
      if (rte_is_valid(e))
      {
        /* In the meantime, the protocol may have gone down */
        if (c->export_state != ES_FEEDING)
          goto done;

        do_feed_channel(c, n, e);
        max_feed--;
      }

    if (c->ra_mode == RA_ANY)
      for (e = n->routes; e; e = e->next)
      {
        /* In the meantime, the protocol may have gone down */
        if (c->export_state != ES_FEEDING)
          goto done;

        if (!rte_is_valid(e))
          continue;

        do_feed_channel(c, n, e);
        max_feed--;
      }
  }
  FIB_ITERATE_END;

done:
  c->feed_active = 0;
  return 1;
}
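
/*
 * Caller sketch (hypothetical, simplified from the protocol code): the
 * channel's feed loop just keeps invoking rt_feed_channel() from its
 * event until the function reports completion by returning 1.
 */
static inline void
example_feed_loop(struct channel *c)
{
  if (!rt_feed_channel(c))
    return;	/* pass was split; re-run from the next event */

  /* feeding finished; the real code advances the export state here */
}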
/**
 * rt_feed_channel_abort - abort protocol feeding
 * @c: channel
 *
 * This function is called by the protocol code when the protocol stops or
 * ceases to exist during the feeding.
 */
void
rt_feed_channel_abort(struct channel *c)
{
  if (c->feed_active)
  {
    /* Unlink the iterator */
    fit_get(&c->table->fib, &c->feed_fit);
    c->feed_active = 0;
  }
}
int
rte_update_in(struct channel *c, const net_addr *n, rte *new, struct rte_src *src)
{
  struct rtable *tab = c->in_table;
  rte *old, **pos;
  net *net;

  if (new)
  {
    net = net_get(tab, n);

    if (!rta_is_cached(new->attrs))
      new->attrs = rta_lookup(new->attrs);
  }
  else
  {
    net = net_find(tab, n);

    if (!net)
      goto drop_withdraw;
  }

  /* Find the old rte */
  for (pos = &net->routes; old = *pos; pos = &old->next)
    if (old->src == src)
    {
      if (new && rte_same(old, new))
      {
        /* Refresh the old rte, continue with update to main rtable */
        if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
        {
          old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
          return 1;
        }

        goto drop_update;
      }

      /* Move iterator if needed */
      if (old == c->reload_next_rte)
        c->reload_next_rte = old->next;

      /* Remove the old rte */
      *pos = old->next;
      tab->rt_count--;
      break;
    }

  if (!new && !old)
    goto drop_withdraw;

  struct channel_limit *l = &c->rx_limit;
  if (l->action && !old && new)
  {
    if (tab->rt_count >= l->limit)
      channel_notify_limit(c, l, PLD_RX, tab->rt_count);

    if (l->state == PLS_BLOCKED)
    {
      /* Required by rte_trace_in() */
      new->net = net;

      rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
      goto drop_update;
    }
  }

  if (new)
  {
    /* Insert the new rte */
    rte *e = rte_do_cow(new);
    e->flags |= REF_COW;
    e->net = net;
    e->src = src;
    e->lastmod = current_time();

    /* Allocate a new route id */
    new->id = hmap_first_zero(&tab->id_map);
    hmap_set(&tab->id_map, new->id);
    e->id = new->id;

    /* Link the new rte */
    e->next = *pos;
    *pos = e;
    tab->rt_count++;
  }

  /* Announce the change to subscribers of the import table */
  rte_announce(tab, RA_ANY, net, new, old, NULL, NULL);

  /* Free the old rte */
  if (old)
  {
    hmap_clear(&tab->id_map, old->id);
    rte_free_quick(old);
  }

  if (!net->routes)
    fib_delete(&tab->fib, net);

  return 1;

drop_update:
  c->stats.imp_updates_received++;
  c->stats.imp_updates_ignored++;

  if (!net->routes)
    fib_delete(&tab->fib, net);

  return 0;

drop_withdraw:
  c->stats.imp_withdraws_received++;
  c->stats.imp_withdraws_ignored++;
  return 0;
}
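
/*
 * Caller sketch (a paraphrased assumption about the update path, not a
 * verbatim quote): channels with an import table route every update
 * through rte_update_in() first, and only continue to the main table
 * when it reports a real change.
 */
static inline int
example_import_table_gate(struct channel *c, const net_addr *n,
                          rte *new, struct rte_src *src)
{
  if (c->in_table && !rte_update_in(c, n, new, src))
    return 0;	/* ignored or unchanged, stop here */

  return 1;	/* continue with update to the main table */
}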
int
rt_reload_channel(struct channel *c)
{
  struct rtable *tab = c->in_table;
  struct fib_iterator *fit = &c->reload_fit;
  int max_feed = 64;

  ASSERT(c->channel_state == CS_UP);

  if (!c->reload_active)
  {
    FIB_ITERATE_INIT(fit, &tab->fib);
    c->reload_active = 1;
  }

  do
  {
    for (rte *e = c->reload_next_rte; e; e = e->next)
    {
      if (max_feed-- <= 0)
      {
        c->reload_next_rte = e;
        debug("%s channel reload burst split (max_feed=%d)", c->proto->name, max_feed);
        return 0;
      }

      rte_update2(c, e->net->n.addr, rte_do_cow(e), e->src);
    }

    c->reload_next_rte = NULL;

    FIB_ITERATE_START(&tab->fib, fit, net, n)
    {
      if (c->reload_next_rte = n->routes)
      {
        FIB_ITERATE_PUT_NEXT(fit, &tab->fib);
        break;
      }
    }
    FIB_ITERATE_END;
  }
  while (c->reload_next_rte);

  c->reload_active = 0;
  return 1;
}
void
rt_reload_channel_abort(struct channel *c)
{
  if (c->reload_active)
  {
    /* Unlink the iterator */
    fit_get(&c->in_table->fib, &c->reload_fit);
    c->reload_next_rte = NULL;
    c->reload_active = 0;
  }
}
void
rt_prune_sync(rtable *t, int all)
{
  struct fib_iterator fit;

  FIB_ITERATE_INIT(&fit, &t->fib);

again:
  FIB_ITERATE_START(&t->fib, &fit, net, n)
  {
    rte *e, **ee = &n->routes;

    while (e = *ee)
    {
      if (all || (e->flags & (REF_STALE | REF_DISCARD)))
      {
        *ee = e->next;
        rte_free_quick(e);
        t->rt_count--;
      }
      else
        ee = &e->next;
    }

    if (all || !n->routes)
    {
      FIB_ITERATE_PUT(&fit);
      fib_delete(&t->fib, n);
      goto again;
    }
  }
  FIB_ITERATE_END;
}
int
rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, int refeed)
{
  struct rtable *tab = c->out_table;
  struct rte_src *src;
  rte *old, **pos;
  net *net;

  if (new)
  {
    net = net_get(tab, n);
    src = new->src;

    if (!rta_is_cached(new->attrs))
      new->attrs = rta_lookup(new->attrs);
  }
  else
  {
    net = net_find(tab, n);
    src = old0->src;

    if (!net)
      goto drop_withdraw;
  }

  /* Find the old rte */
  for (pos = &net->routes; old = *pos; pos = &old->next)
    if ((c->ra_mode != RA_ANY) || (old->src == src))
    {
      if (new && rte_same(old, new))
      {
        /* REF_STALE / REF_DISCARD not used in export table */
        /*
        if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
        {
          old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
          return 1;
        }
        */

        goto drop_update;
      }

      /* Remove the old rte */
      *pos = old->next;
      rte_free_quick(old);
      tab->rt_count--;
      break;
    }

  if (!new)
  {
    if (!old)
      goto drop_withdraw;

    if (!net->routes)
      fib_delete(&tab->fib, net);

    return 1;
  }

  /* Insert the new rte */
  rte *e = rte_do_cow(new);
  e->flags |= REF_COW;
  e->net = net;
  e->src = src;
  e->lastmod = current_time();
  e->next = *pos;
  *pos = e;
  tab->rt_count++;
  return 1;

drop_update:
  return refeed;

drop_withdraw:
  return 0;
}
static inline u32
hc_hash(ip_addr a, rtable *dep)
{
  return ipa_hash(a) ^ ptr_hash(dep);
}
static inline void
hc_insert(struct hostcache *hc, struct hostentry *he)
{
  uint k = he->hash_key >> hc->hash_shift;
  he->next = hc->hash_table[k];
  hc->hash_table[k] = he;
}
static inline void
hc_remove(struct hostcache *hc, struct hostentry *he)
{
  struct hostentry **hep;
  uint k = he->hash_key >> hc->hash_shift;

  for (hep = &hc->hash_table[k]; *hep != he; hep = &(*hep)->next);
  *hep = he->next;
}
#define HC_DEF_ORDER 10
#define HC_HI_MARK *4
#define HC_HI_STEP 2
#define HC_HI_ORDER 16			/* Must be at most 16 */
#define HC_LO_MARK /5
#define HC_LO_STEP 2
#define HC_LO_ORDER 10
static void
hc_alloc_table(struct hostcache *hc, pool *p, unsigned order)
{
  uint hsize = 1 << order;
  hc->hash_order = order;
  hc->hash_shift = 32 - order;
  hc->hash_max = (order >= HC_HI_ORDER) ? ~0U : (hsize HC_HI_MARK);
  hc->hash_min = (order <= HC_LO_ORDER) ?  0U : (hsize HC_LO_MARK);

  hc->hash_table = mb_allocz(p, hsize * sizeof(struct hostentry *));
}
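
/*
 * Worked example of the sizing rules above (illustrative only): at the
 * default order 10 the table has 1024 buckets, grows once it holds more
 * than 1024 * 4 = 4096 entries, and never shrinks (hash_min is 0 at the
 * minimum order). After one HC_HI_STEP grow to order 12, the shrink
 * threshold becomes 4096 / 5 = 819 entries. Buckets are selected by the
 * top bits of the hash key: k >> hash_shift, i.e. k >> 22 for order 10.
 */
static inline void
example_hc_sizing(void)
{
  ASSERT(((1 << 10) HC_HI_MARK) == 4096);
  ASSERT(((1 << 12) HC_LO_MARK) == 819);
}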
static void
hc_resize(struct hostcache *hc, pool *p, unsigned new_order)
{
  struct hostentry **old_table = hc->hash_table;
  struct hostentry *he, *hen;
  uint old_size = 1 << hc->hash_order;
  uint i;

  hc_alloc_table(hc, p, new_order);
  for (i = 0; i < old_size; i++)
    for (he = old_table[i]; he != NULL; he = hen)
    {
      hen = he->next;
      hc_insert(hc, he);
    }
  mb_free(old_table);
}
static struct hostentry *
hc_new_hostentry(struct hostcache *hc, pool *p, ip_addr a, ip_addr ll, rtable *dep, unsigned k)
{
  struct hostentry *he = sl_alloc(hc->slab);

  *he = (struct hostentry) {
    .addr = a,
    .link = ll,
    .tab = dep,
    .hash_key = k,
  };

  add_tail(&hc->hostentries, &he->ln);
  hc_insert(hc, he);

  hc->hash_items++;
  if (hc->hash_items > hc->hash_max)
    hc_resize(hc, p, hc->hash_order + HC_HI_STEP);

  return he;
}
static void
hc_delete_hostentry(struct hostcache *hc, pool *p, struct hostentry *he)
{
  rta_free(he->src);

  rem_node(&he->ln);
  hc_remove(hc, he);
  sl_free(hc->slab, he);

  hc->hash_items--;
  if (hc->hash_items < hc->hash_min)
    hc_resize(hc, p, hc->hash_order - HC_LO_STEP);
}
static void
rt_init_hostcache(rtable *tab)
{
  struct hostcache *hc = mb_allocz(tab->rp, sizeof(struct hostcache));
  init_list(&hc->hostentries);

  hc->hash_items = 0;
  hc_alloc_table(hc, tab->rp, HC_DEF_ORDER);
  hc->slab = sl_new(tab->rp, sizeof(struct hostentry));

  hc->lp = lp_new(tab->rp);
  hc->trie = f_new_trie(hc->lp, 0);

  tab->hostcache = hc;
}
static void
rt_free_hostcache(rtable *tab)
{
  struct hostcache *hc = tab->hostcache;

  node *n;
  WALK_LIST(n, hc->hostentries)
  {
    struct hostentry *he = SKIP_BACK(struct hostentry, ln, n);
    rta_free(he->src);

    if (he->uc)
      log(L_ERR "Hostcache is not empty in table %s", tab->name);
  }

  /* Freed automagically by the resource pool
  rfree(hc->slab);
  rfree(hc->lp);
  mb_free(hc->hash_table);
  mb_free(hc);
  */
}
static void
rt_notify_hostcache(rtable *tab, net *net)
{
  if (tab->hcu_scheduled)
    return;

  if (trie_match_net(tab->hostcache->trie, net->n.addr))
    rt_schedule_hcu(tab);
}
static int
if_local_addr(ip_addr a, struct iface *i)
{
  struct ifa *b;

  WALK_LIST(b, i->addrs)
    if (ipa_equal(a, b->ip))
      return 1;

  return 0;
}
u32
rt_get_igp_metric(rte *rt)
{
  eattr *ea = ea_find(rt->attrs->eattrs, EA_GEN_IGP_METRIC);

  if (ea)
    return ea->u.data;

  if (rt->attrs->source == RTS_DEVICE)
    return 0;

  if (rt->src->proto->rte_igp_metric)
    return rt->src->proto->rte_igp_metric(rt);

  return IGP_METRIC_UNKNOWN;
}
static int
rt_update_hostentry(rtable *tab, struct hostentry *he)
{
  rta *old_src = he->src;
  int direct = 0;
  int pxlen = 0;

  /* Reset the hostentry */
  he->src = NULL;
  he->dest = RTD_UNREACHABLE;
  he->nexthop_linkable = 0;
  he->igp_metric = 0;

  net_addr he_addr;
  net_fill_ip_host(&he_addr, he->addr);
  net *n = net_route(tab, &he_addr);
  if (n)
  {
    rte *e = n->routes;
    rta *a = e->attrs;
    word pref = a->pref;

    for (rte *ee = n->routes; ee; ee = ee->next)
      if ((ee->attrs->pref >= pref) && ee->attrs->hostentry)
      {
        /* Recursive route should not depend on another recursive route */
        log(L_WARN "Next hop address %I resolvable through recursive route for %N",
            he->addr, n->n.addr);
        goto done;
      }

    pxlen = n->n.addr->pxlen;

    if (a->dest == RTD_UNICAST)
    {
      for (struct nexthop *nh = &(a->nh); nh; nh = nh->next)
        if (ipa_zero(nh->gw))
        {
          if (if_local_addr(he->addr, nh->iface))
          {
            /* The host address is a local address, this is not valid */
            log(L_WARN "Next hop address %I is a local address of iface %s",
                he->addr, nh->iface->name);
            goto done;
          }

          direct++;
        }
    }

    he->src = rta_clone(a);
    he->dest = a->dest;
    he->nexthop_linkable = !direct;
    he->igp_metric = rt_get_igp_metric(e);
  }

done:
  /* Add a prefix range to the trie */
  trie_add_prefix(tab->hostcache->trie, &he_addr, pxlen, he_addr.pxlen);

  rta_free(old_src);
  return old_src != he->src;
}
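
/*
 * Illustrative example with assumed addresses (not upstream code): if
 * next hop 192.0.2.1 currently resolves through a /24 route, the range
 * 24..32 for 192.0.2.1/32 is put into the trie, so both a change of the
 * /24 itself and the later appearance of any covering more-specific
 * route match trie_match_net() in rt_notify_hostcache() and retrigger
 * the resolution.
 */
static inline void
example_track_next_hop(struct hostcache *hc)
{
  net_addr_ip4 host = NET_ADDR_IP4(ip4_build(192, 0, 2, 1), 32);
  trie_add_prefix(hc->trie, (net_addr *) &host, 24, 32);
}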
static void
rt_update_hostcache(rtable *tab)
{
  struct hostcache *hc = tab->hostcache;
  struct hostentry *he;
  node *n, *x;

  /* Reset the trie */
  lp_flush(hc->lp);
  hc->trie = f_new_trie(hc->lp, 0);

  WALK_LIST_DELSAFE(n, x, hc->hostentries)
  {
    he = SKIP_BACK(struct hostentry, ln, n);
    if (!he->uc)
    {
      hc_delete_hostentry(hc, tab->rp, he);
      continue;
    }

    if (rt_update_hostentry(tab, he))
      rt_schedule_nhu(he->tab);
  }

  tab->hcu_scheduled = 0;
}
struct hostentry *
rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep)
{
  ip_addr link = ipa_zero(ll) ? a : ll;
  struct hostentry *he;

  if (!tab->hostcache)
    rt_init_hostcache(tab);

  u32 k = hc_hash(a, dep);
  struct hostcache *hc = tab->hostcache;
  for (he = hc->hash_table[k >> hc->hash_shift]; he != NULL; he = he->next)
    if (ipa_equal(he->addr, a) && ipa_equal(he->link, link) && (he->tab == dep))
      return he;

  he = hc_new_hostentry(hc, tab->rp, a, link, dep, k);
  rt_update_hostentry(tab, he);
  return he;
}
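
/*
 * Usage sketch (hedged: this mirrors the rta_set_recursive_next_hop()
 * helper that protocols such as BGP use; shown here only as an
 * illustration): a recursive route destined for table @dep resolves its
 * gateway @gw through table @tab by attaching a hostentry to its
 * attributes.
 */
static inline void
example_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab,
                               ip_addr gw, ip_addr ll, mpls_label_stack *mls)
{
  rta_apply_hostentry(a, rt_get_hostentry(tab, gw, ll, dep), mls);
}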
/*
 *  Documentation for functions declared inline in route.h
 */
#if 0

/**
 * net_find - find a network entry
 * @tab: a routing table
 * @addr: address of the network
 *
 * net_find() looks up the given network in routing table @tab and
 * returns a pointer to its &net entry or %NULL if no such network
 * exists.
 */
static inline net *net_find(rtable *tab, net_addr *addr)
{ DUMMY; }

/**
 * net_get - obtain a network entry
 * @tab: a routing table
 * @addr: address of the network
 *
 * net_get() looks up the given network in routing table @tab and
 * returns a pointer to its &net entry. If no such entry exists, it's
 * created.
 */
static inline net *net_get(rtable *tab, net_addr *addr)
{ DUMMY; }

/**
 * rte_cow - copy a route for writing
 * @r: a route entry to be copied
 *
 * rte_cow() takes a &rte and prepares it for modification. The exact action
 * taken depends on the flags of the &rte -- if it's a temporary entry, it's
 * just returned unchanged, else a new temporary entry with the same contents
 * is created.
 *
 * The primary use of this function is inside the filter machinery -- when
 * a filter wants to modify &rte contents (to change the preference or to
 * attach another set of attributes), it must ensure that the &rte is not
 * shared with anyone else (and especially that it isn't stored in any routing
 * table).
 *
 * Result: a pointer to the new writable &rte.
 */
static inline rte * rte_cow(rte *r)
{ DUMMY; }

#endif
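
/*
 * Usage sketch (hypothetical, not part of the original file): code that
 * wants to scribble on a possibly-shared &rte first detaches it with
 * rte_cow(). Setting a bit in the protocol-specific pflags is used here
 * only as an example of a cheap mutation.
 */
static inline rte *
example_modify_route(rte *r)
{
  rte *w = rte_cow(r);	/* no-op if r is already a private temporary */
  w->pflags |= 1;	/* now safe: w is not stored in any routing table */
  return w;
}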