/*
 * BIRD -- Routing Tables
 *
 * (c) 1998--2000 Martin Mares <mj@ucw.cz>
 *
 * Can be freely distributed and used under the terms of the GNU GPL.
 */

/**
 * DOC: Routing tables
 *
 * Routing tables are probably the most important structures BIRD uses. They
 * hold all the information about known networks, the associated routes and
 * their attributes.
 *
 * There are multiple routing tables (a primary one together with any
 * number of secondary ones if requested by the configuration). Each table
 * is basically a FIB containing entries describing the individual
 * destination networks. For each network (represented by structure &net),
 * there is a one-way linked list of route entries (&rte), the first entry
 * on the list being the best one (i.e., the one we currently use
 * for routing); the order of the other ones is undetermined.
 *
 * The &rte contains information about the route. There are net and src, which
 * together form a key identifying the route in a routing table. There is a
 * pointer to a &rta structure (see the route attribute module for a precise
 * explanation) holding the route attributes, which are primary data about the
 * route. There are several technical fields used by routing table code (route
 * id, REF_* flags). There is also the pflags field, holding protocol-specific
 * flags. They are not used by routing table code, but by protocol-specific
 * hooks. In contrast to route attributes, they are not primary data and their
 * validity is also limited to the routing table.
 *
 * There are several mechanisms that allow automatic update of routes in one
 * routing table (dst) as a result of changes in another routing table (src).
 * They handle issues of recursive next hop resolving, flowspec validation and
 * RPKI validation.
 *
 * The first such mechanism is handling of recursive next hops. A route in the
 * dst table has an indirect next hop address, which is resolved through a route
 * in the src table (which may also be the same table) to get an immediate next
 * hop. This is implemented using structure &hostcache attached to the src
 * table, which contains &hostentry structures for each tracked next hop
 * address. These structures are linked from recursive routes in dst tables,
 * possibly multiple routes sharing one hostentry (as many routes may have the
 * same indirect next hop). There is also a trie in the hostcache, which matches
 * all prefixes that may influence resolving of tracked next hops.
 *
 * When a best route changes in the src table, the hostcache is notified using
 * rt_notify_hostcache(), which immediately checks using the trie whether the
 * change is relevant, and if it is, it schedules asynchronous hostcache
 * recomputation. The recomputation is done by rt_update_hostcache() (called
 * from rt_event() of the src table); it walks through all hostentries and
 * resolves them (by rt_update_hostentry()). It also updates the trie. If a
 * change in hostentry resolution was found, it schedules asynchronous nexthop
 * recomputation of the associated dst table. That is done by rt_next_hop_update()
 * (called from rt_event() of the dst table); it iterates over all routes in the
 * dst table and re-examines their hostentries for changes. Note that in contrast
 * to the hostcache update, the next hop update can be interrupted by the main
 * loop. These two full-table walks (over hostcache and dst table) are necessary
 * due to the absence of direct lookups (route -> affected nexthop, nexthop ->
 * its route).
 *
 * The second mechanism is for flowspec validation, where validity of flowspec
 * routes depends on resolving their network prefixes in IP routing tables. This
 * is similar to the recursive next hop mechanism, but simpler as there are no
 * intermediate hostcache and hostentries (because flows are less likely to
 * share a common net prefix than routes sharing a common next hop). In the src
 * table, there is a list of dst tables (list flowspec_links); this list is
 * updated by flowspec channels (by rt_flowspec_link() and rt_flowspec_unlink()
 * during channel start/stop). Each dst table has its own trie of prefixes that
 * may influence validation of flowspec routes in it (flowspec_trie).
 *
 * When a best route changes in the src table, rt_flowspec_notify() immediately
 * checks all dst tables from the list using their tries to see whether the
 * change is relevant for them. If it is, then an asynchronous re-validation of
 * flowspec routes in the dst table is scheduled. That is also done by function
 * rt_next_hop_update(), like the nexthop recomputation above. It iterates over
 * all flowspec routes and re-validates them. It also recalculates the trie.
 *
 * Note that in contrast to the hostcache update, here the trie is recalculated
 * during the rt_next_hop_update(), which may be interleaved with IP route
 * updates. The trie is flushed at the beginning of recalculation, which means
 * that such updates may use a partial trie to see if they are relevant. But it
 * works anyway! Either the affected flowspec route was already re-validated and
 * added to the trie, in which case the IP route change matches the trie and
 * triggers a next round of re-validation, or it was not yet re-validated and
 * added to the trie, but will be re-validated later in this round anyway.
 *
 * The third mechanism is used for RPKI re-validation of IP routes and it is the
 * simplest. It is just a list of subscribers in the src table, which are
 * notified whenever any change happens, but only after a settle time. Also, in
 * the RPKI case the dst is not a table but a channel, which refeeds routes
 * through a filter.
 */
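
/*
 * Example (a hedged sketch, not part of BIRD): walking the per-network
 * route list described above. net->routes points at the best route; the
 * remaining entries follow in undetermined order. The helper name
 * count_valid_routes() is hypothetical.
 */
#if 0
static uint
count_valid_routes(net *n)
{
  uint cnt = 0;

  /* The first valid entry is the one currently used for routing */
  for (rte *e = n->routes; e; e = e->next)
    if (rte_is_valid(e))
      cnt++;

  return cnt;
}
#endif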

#undef LOCAL_DEBUG

#include "nest/bird.h"
#include "nest/route.h"
#include "nest/protocol.h"
#include "nest/iface.h"
#include "lib/resource.h"
#include "lib/event.h"
#include "lib/timer.h"
#include "lib/string.h"
#include "conf/conf.h"
#include "filter/filter.h"
#include "filter/data.h"
#include "lib/hash.h"
#include "lib/string.h"
#include "lib/alloca.h"
#include "lib/flowspec.h"

#ifdef CONFIG_BGP
#include "proto/bgp/bgp.h"
#endif

pool *rt_table_pool;

static slab *rte_slab;
static linpool *rte_update_pool;

list routing_tables;

static void rt_free_hostcache(rtable *tab);
static void rt_notify_hostcache(rtable *tab, net *net);
static void rt_update_hostcache(rtable *tab);
static void rt_next_hop_update(rtable *tab);
static inline void rt_prune_table(rtable *tab);
static inline void rt_schedule_notify(rtable *tab);
static void rt_flowspec_notify(rtable *tab, net *net);
static void rt_kick_prune_timer(rtable *tab);


static void
net_init_with_trie(struct fib *f, void *N)
{
  rtable *tab = SKIP_BACK(rtable, fib, f);
  net *n = N;

  if (tab->trie)
    trie_add_prefix(tab->trie, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen);

  if (tab->trie_new)
    trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen);
}

static inline net *
net_route_ip4_trie(rtable *t, const net_addr_ip4 *n0)
{
  TRIE_WALK_TO_ROOT_IP4(t->trie, n0, n)
  {
    net *r;
    if (r = net_find_valid(t, (net_addr *) &n))
      return r;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}

static inline net *
net_route_vpn4_trie(rtable *t, const net_addr_vpn4 *n0)
{
  TRIE_WALK_TO_ROOT_IP4(t->trie, (const net_addr_ip4 *) n0, px)
  {
    net_addr_vpn4 n = NET_ADDR_VPN4(px.prefix, px.pxlen, n0->rd);

    net *r;
    if (r = net_find_valid(t, (net_addr *) &n))
      return r;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}

static inline net *
net_route_ip6_trie(rtable *t, const net_addr_ip6 *n0)
{
  TRIE_WALK_TO_ROOT_IP6(t->trie, n0, n)
  {
    net *r;
    if (r = net_find_valid(t, (net_addr *) &n))
      return r;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}

static inline net *
net_route_vpn6_trie(rtable *t, const net_addr_vpn6 *n0)
{
  TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px)
  {
    net_addr_vpn6 n = NET_ADDR_VPN6(px.prefix, px.pxlen, n0->rd);

    net *r;
    if (r = net_find_valid(t, (net_addr *) &n))
      return r;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}

static inline void *
net_route_ip6_sadr_trie(rtable *t, const net_addr_ip6_sadr *n0)
{
  TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px)
  {
    net_addr_ip6_sadr n = NET_ADDR_IP6_SADR(px.prefix, px.pxlen, n0->src_prefix, n0->src_pxlen);
    net *best = NULL;
    int best_pxlen = 0;

    /* We need to do dst first matching. Since sadr addresses are hashed on dst
       prefix only, find the hash table chain and go through it to find the
       match with the longest matching src prefix. */
    for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next)
    {
      net_addr_ip6_sadr *a = (void *) fn->addr;

      if (net_equal_dst_ip6_sadr(&n, a) &&
          net_in_net_src_ip6_sadr(&n, a) &&
          (a->src_pxlen >= best_pxlen))
      {
        best = fib_node_to_user(&t->fib, fn);
        best_pxlen = a->src_pxlen;
      }
    }

    if (best)
      return best;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}

static inline net *
net_route_ip4_fib(rtable *t, const net_addr_ip4 *n0)
{
  net_addr_ip4 n;
  net_copy_ip4(&n, n0);

  net *r;
  while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
  {
    n.pxlen--;
    ip4_clrbit(&n.prefix, n.pxlen);
  }

  return r;
}

static inline net *
net_route_vpn4_fib(rtable *t, const net_addr_vpn4 *n0)
{
  net_addr_vpn4 n;
  net_copy_vpn4(&n, n0);

  net *r;
  while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
  {
    n.pxlen--;
    ip4_clrbit(&n.prefix, n.pxlen);
  }

  return r;
}

static inline net *
net_route_ip6_fib(rtable *t, const net_addr_ip6 *n0)
{
  net_addr_ip6 n;
  net_copy_ip6(&n, n0);

  net *r;
  while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
  {
    n.pxlen--;
    ip6_clrbit(&n.prefix, n.pxlen);
  }

  return r;
}

static inline net *
net_route_vpn6_fib(rtable *t, const net_addr_vpn6 *n0)
{
  net_addr_vpn6 n;
  net_copy_vpn6(&n, n0);

  net *r;
  while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
  {
    n.pxlen--;
    ip6_clrbit(&n.prefix, n.pxlen);
  }

  return r;
}

static inline void *
net_route_ip6_sadr_fib(rtable *t, const net_addr_ip6_sadr *n0)
{
  net_addr_ip6_sadr n;
  net_copy_ip6_sadr(&n, n0);

  while (1)
  {
    net *best = NULL;
    int best_pxlen = 0;

    /* We need to do dst first matching. Since sadr addresses are hashed on dst
       prefix only, find the hash table chain and go through it to find the
       match with the longest matching src prefix. */
    for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next)
    {
      net_addr_ip6_sadr *a = (void *) fn->addr;

      if (net_equal_dst_ip6_sadr(&n, a) &&
          net_in_net_src_ip6_sadr(&n, a) &&
          (a->src_pxlen >= best_pxlen))
      {
        best = fib_node_to_user(&t->fib, fn);
        best_pxlen = a->src_pxlen;
      }
    }

    if (best)
      return best;

    if (!n.dst_pxlen)
      break;

    n.dst_pxlen--;
    ip6_clrbit(&n.dst_prefix, n.dst_pxlen);
  }

  return NULL;
}

net *
net_route(rtable *tab, const net_addr *n)
{
  ASSERT(tab->addr_type == n->type);

  switch (n->type)
  {
  case NET_IP4:
    if (tab->trie)
      return net_route_ip4_trie(tab, (net_addr_ip4 *) n);
    else
      return net_route_ip4_fib (tab, (net_addr_ip4 *) n);

  case NET_VPN4:
    if (tab->trie)
      return net_route_vpn4_trie(tab, (net_addr_vpn4 *) n);
    else
      return net_route_vpn4_fib (tab, (net_addr_vpn4 *) n);

  case NET_IP6:
    if (tab->trie)
      return net_route_ip6_trie(tab, (net_addr_ip6 *) n);
    else
      return net_route_ip6_fib (tab, (net_addr_ip6 *) n);

  case NET_VPN6:
    if (tab->trie)
      return net_route_vpn6_trie(tab, (net_addr_vpn6 *) n);
    else
      return net_route_vpn6_fib (tab, (net_addr_vpn6 *) n);

  case NET_IP6_SADR:
    if (tab->trie)
      return net_route_ip6_sadr_trie(tab, (net_addr_ip6_sadr *) n);
    else
      return net_route_ip6_sadr_fib (tab, (net_addr_ip6_sadr *) n);

  default:
    return NULL;
  }
}
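
/*
 * Usage sketch for net_route() (hypothetical, not part of BIRD): find the
 * most specific valid covering route for an IPv4 host address.
 */
#if 0
static net *
lookup_covering_ip4(rtable *tab, ip4_addr a)
{
  net_addr_ip4 n;
  net_fill_ip4((net_addr *) &n, a, IP4_MAX_PREFIX_LENGTH);

  /* net_route() walks from the given prefix towards shorter ones,
     using the trie when available, otherwise bit by bit via the FIB */
  return net_route(tab, (net_addr *) &n);
}
#endif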


static int
net_roa_check_ip4_trie(rtable *tab, const net_addr_ip4 *px, u32 asn)
{
  int anything = 0;

  TRIE_WALK_TO_ROOT_IP4(tab->trie, px, px0)
  {
    net_addr_roa4 roa0 = NET_ADDR_ROA4(px0.prefix, px0.pxlen, 0, 0);

    struct fib_node *fn;
    for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next)
    {
      net_addr_roa4 *roa = (void *) fn->addr;
      net *r = fib_node_to_user(&tab->fib, fn);

      if (net_equal_prefix_roa4(roa, &roa0) && rte_is_valid(r->routes))
      {
        anything = 1;
        if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
          return ROA_VALID;
      }
    }
  }
  TRIE_WALK_TO_ROOT_END;

  return anything ? ROA_INVALID : ROA_UNKNOWN;
}

static int
net_roa_check_ip4_fib(rtable *tab, const net_addr_ip4 *px, u32 asn)
{
  struct net_addr_roa4 n = NET_ADDR_ROA4(px->prefix, px->pxlen, 0, 0);
  struct fib_node *fn;
  int anything = 0;

  while (1)
  {
    for (fn = fib_get_chain(&tab->fib, (net_addr *) &n); fn; fn = fn->next)
    {
      net_addr_roa4 *roa = (void *) fn->addr;
      net *r = fib_node_to_user(&tab->fib, fn);

      if (net_equal_prefix_roa4(roa, &n) && rte_is_valid(r->routes))
      {
        anything = 1;
        if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
          return ROA_VALID;
      }
    }

    if (n.pxlen == 0)
      break;

    n.pxlen--;
    ip4_clrbit(&n.prefix, n.pxlen);
  }

  return anything ? ROA_INVALID : ROA_UNKNOWN;
}

static int
net_roa_check_ip6_trie(rtable *tab, const net_addr_ip6 *px, u32 asn)
{
  int anything = 0;

  TRIE_WALK_TO_ROOT_IP6(tab->trie, px, px0)
  {
    net_addr_roa6 roa0 = NET_ADDR_ROA6(px0.prefix, px0.pxlen, 0, 0);

    struct fib_node *fn;
    for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next)
    {
      net_addr_roa6 *roa = (void *) fn->addr;
      net *r = fib_node_to_user(&tab->fib, fn);

      if (net_equal_prefix_roa6(roa, &roa0) && rte_is_valid(r->routes))
      {
        anything = 1;
        if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
          return ROA_VALID;
      }
    }
  }
  TRIE_WALK_TO_ROOT_END;

  return anything ? ROA_INVALID : ROA_UNKNOWN;
}

static int
net_roa_check_ip6_fib(rtable *tab, const net_addr_ip6 *px, u32 asn)
{
  struct net_addr_roa6 n = NET_ADDR_ROA6(px->prefix, px->pxlen, 0, 0);
  struct fib_node *fn;
  int anything = 0;

  while (1)
  {
    for (fn = fib_get_chain(&tab->fib, (net_addr *) &n); fn; fn = fn->next)
    {
      net_addr_roa6 *roa = (void *) fn->addr;
      net *r = fib_node_to_user(&tab->fib, fn);

      if (net_equal_prefix_roa6(roa, &n) && rte_is_valid(r->routes))
      {
        anything = 1;
        if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
          return ROA_VALID;
      }
    }

    if (n.pxlen == 0)
      break;

    n.pxlen--;
    ip6_clrbit(&n.prefix, n.pxlen);
  }

  return anything ? ROA_INVALID : ROA_UNKNOWN;
}
/**
 * roa_check - check validity of route origination in a ROA table
 * @tab: ROA table
 * @n: network prefix to check
 * @asn: AS number of network prefix
 *
 * Implements RFC 6483 route validation for the given network prefix. The
 * procedure is to find all candidate ROAs - ROAs whose prefixes cover the given
 * network prefix. If there is no candidate ROA, return ROA_UNKNOWN. If there is
 * a candidate ROA with matching ASN and a maxlen field greater than or equal to
 * the given prefix length, return ROA_VALID. Otherwise, return ROA_INVALID. If
 * the caller cannot determine the origin AS, 0 may be used (in that case
 * ROA_VALID cannot happen). Table @tab must have type NET_ROA4 or NET_ROA6,
 * network @n must have type NET_IP4 or NET_IP6, respectively.
 */
int
net_roa_check(rtable *tab, const net_addr *n, u32 asn)
{
  if ((tab->addr_type == NET_ROA4) && (n->type == NET_IP4))
  {
    if (tab->trie)
      return net_roa_check_ip4_trie(tab, (const net_addr_ip4 *) n, asn);
    else
      return net_roa_check_ip4_fib (tab, (const net_addr_ip4 *) n, asn);
  }
  else if ((tab->addr_type == NET_ROA6) && (n->type == NET_IP6))
  {
    if (tab->trie)
      return net_roa_check_ip6_trie(tab, (const net_addr_ip6 *) n, asn);
    else
      return net_roa_check_ip6_fib (tab, (const net_addr_ip6 *) n, asn);
  }
  else
    return ROA_UNKNOWN; /* Should not happen */
}
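
/*
 * Usage sketch for net_roa_check() (hypothetical): classify a received
 * IPv4 prefix against a ROA table, e.g. with an origin ASN extracted from
 * an AS path.
 */
#if 0
static const char *
roa_state_str(rtable *roa_tab, const net_addr *n, u32 origin_asn)
{
  switch (net_roa_check(roa_tab, n, origin_asn))
  {
  case ROA_VALID:   return "valid";    /* a covering ROA matches ASN and maxlen */
  case ROA_INVALID: return "invalid";  /* covering ROAs exist, none matches */
  default:          return "unknown";  /* no covering ROA at all */
  }
}
#endif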

/**
 * rte_find - find a route
 * @net: network node
 * @src: route source
 *
 * The rte_find() function returns a route for destination @net
 * which is from route source @src.
 */
rte *
rte_find(net *net, struct rte_src *src)
{
  rte *e = net->routes;

  while (e && e->src != src)
    e = e->next;
  return e;
}

/**
 * rte_get_temp - get a temporary &rte
 * @a: attributes to assign to the new route (a &rta; in case it's
 * un-cached, rte_update() will create a cached copy automatically)
 * @src: route source
 *
 * Create a temporary &rte and bind it with the attributes @a.
 */
rte *
rte_get_temp(rta *a, struct rte_src *src)
{
  rte *e = sl_alloc(rte_slab);

  e->attrs = a;
  e->id = 0;
  e->flags = 0;
  e->pflags = 0;
  rt_lock_source(e->src = src);
  return e;
}

rte *
rte_do_cow(rte *r)
{
  rte *e = sl_alloc(rte_slab);

  memcpy(e, r, sizeof(rte));

  rt_lock_source(e->src);
  e->attrs = rta_clone(r->attrs);
  e->flags = 0;
  return e;
}
/**
 * rte_cow_rta - get a private writable copy of &rte with writable &rta
 * @r: a route entry to be copied
 * @lp: a linpool from which to allocate &rta
 *
 * rte_cow_rta() takes a &rte and prepares it and the associated &rta for
 * modification. There are three possibilities: First, both &rte and &rta are
 * private copies, in which case they are returned unchanged. Second, &rte is a
 * private copy, but &rta is cached, in which case &rta is duplicated using
 * rta_do_cow(). Third, &rte is shared and &rta is cached, in which case
 * both structures are duplicated by rte_do_cow() and rta_do_cow().
 *
 * Note that in the second case, the cached &rta loses one reference, while the
 * private copy created by rta_do_cow() is a shallow copy sharing indirect data
 * (eattrs, nexthops, ...) with it. To work properly, the original shared &rta
 * should have another reference during the life of the created private copy.
 *
 * Result: a pointer to the new writable &rte with writable &rta.
 */
rte *
rte_cow_rta(rte *r, linpool *lp)
{
  if (!rta_is_cached(r->attrs))
    return r;

  r = rte_cow(r);
  rta *a = rta_do_cow(r->attrs, lp);
  rta_free(r->attrs);
  r->attrs = a;
  return r;
}
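
/*
 * Usage sketch for rte_cow_rta() (hypothetical): obtain a writable
 * rte/rta pair before touching an attribute. Assumes @lp outlives the
 * use of the returned route, per the note above.
 */
#if 0
static rte *
bump_pref(rte *r, linpool *lp, u32 pref)
{
  r = rte_cow_rta(r, lp);   /* now both r and r->attrs are private */
  r->attrs->pref = pref;
  return r;
}
#endif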

static int /* Actually better or at least as good as */
rte_better(rte *new, rte *old)
{
  int (*better)(rte *, rte *);

  if (!rte_is_valid(old))
    return 1;
  if (!rte_is_valid(new))
    return 0;

  if (new->attrs->pref > old->attrs->pref)
    return 1;
  if (new->attrs->pref < old->attrs->pref)
    return 0;
  if (new->src->proto->proto != old->src->proto->proto)
  {
    /*
     * If the user has configured protocol preferences so that two different
     * protocols have the same preference, try to break the tie by comparing
     * addresses. Not too useful, but it keeps the ordering of routes unambiguous.
     */
    return new->src->proto->proto > old->src->proto->proto;
  }
  if (better = new->src->proto->rte_better)
    return better(new, old);
  return 0;
}

static int
rte_mergable(rte *pri, rte *sec)
{
  int (*mergable)(rte *, rte *);

  if (!rte_is_valid(pri) || !rte_is_valid(sec))
    return 0;

  if (pri->attrs->pref != sec->attrs->pref)
    return 0;

  if (pri->src->proto->proto != sec->src->proto->proto)
    return 0;

  if (mergable = pri->src->proto->rte_mergable)
    return mergable(pri, sec);

  return 0;
}

static void
rte_trace(struct channel *c, rte *e, int dir, char *msg)
{
  log(L_TRACE "%s.%s %c %s %N %uL %uG %s",
      c->proto->name, c->name ?: "?", dir, msg, e->net->n.addr, e->src->private_id, e->src->global_id,
      rta_dest_name(e->attrs->dest));
}

static inline void
rte_trace_in(uint flag, struct channel *c, rte *e, char *msg)
{
  if ((c->debug & flag) || (c->proto->debug & flag))
    rte_trace(c, e, '>', msg);
}

static inline void
rte_trace_out(uint flag, struct channel *c, rte *e, char *msg)
{
  if ((c->debug & flag) || (c->proto->debug & flag))
    rte_trace(c, e, '<', msg);
}

static rte *
export_filter_(struct channel *c, rte *rt0, rte **rt_free, linpool *pool, int silent)
{
  struct proto *p = c->proto;
  const struct filter *filter = c->out_filter;
  struct proto_stats *stats = &c->stats;
  rte *rt;
  int v;

  rt = rt0;
  *rt_free = NULL;

  v = p->preexport ? p->preexport(c, rt) : 0;
  if (v < 0)
  {
    if (silent)
      goto reject;

    stats->exp_updates_rejected++;
    if (v == RIC_REJECT)
      rte_trace_out(D_FILTERS, c, rt, "rejected by protocol");
    goto reject;
  }
  if (v > 0)
  {
    if (!silent)
      rte_trace_out(D_FILTERS, c, rt, "forced accept by protocol");
    goto accept;
  }

  v = filter && ((filter == FILTER_REJECT) ||
                 (f_run(filter, &rt, pool,
                        (silent ? FF_SILENT : 0)) > F_ACCEPT));
  if (v)
  {
    if (silent)
      goto reject;

    stats->exp_updates_filtered++;
    rte_trace_out(D_FILTERS, c, rt, "filtered out");
    goto reject;
  }

 accept:
  if (rt != rt0)
    *rt_free = rt;
  return rt;

 reject:
  /* Discard temporary rte */
  if (rt != rt0)
    rte_free(rt);
  return NULL;
}

static inline rte *
export_filter(struct channel *c, rte *rt0, rte **rt_free, int silent)
{
  return export_filter_(c, rt0, rt_free, rte_update_pool, silent);
}

static void
do_rt_notify(struct channel *c, net *net, rte *new, rte *old, int refeed)
{
  struct proto *p = c->proto;
  struct proto_stats *stats = &c->stats;

  if (refeed && new)
    c->refeed_count++;

  /* Apply export limit */
  struct channel_limit *l = &c->out_limit;
  if (l->action && !old && new)
  {
    if (stats->exp_routes >= l->limit)
      channel_notify_limit(c, l, PLD_OUT, stats->exp_routes);

    if (l->state == PLS_BLOCKED)
    {
      stats->exp_updates_rejected++;
      rte_trace_out(D_FILTERS, c, new, "rejected [limit]");
      return;
    }
  }

  /* Apply export table */
  if (c->out_table && !rte_update_out(c, net->n.addr, new, old, refeed))
    return;

  if (new)
    stats->exp_updates_accepted++;
  else
    stats->exp_withdraws_accepted++;

  if (old)
  {
    bmap_clear(&c->export_map, old->id);
    stats->exp_routes--;
  }

  if (new)
  {
    bmap_set(&c->export_map, new->id);
    stats->exp_routes++;
  }

  if (p->debug & D_ROUTES)
  {
    if (new && old)
      rte_trace_out(D_ROUTES, c, new, "replaced");
    else if (new)
      rte_trace_out(D_ROUTES, c, new, "added");
    else if (old)
      rte_trace_out(D_ROUTES, c, old, "removed");
  }

  p->rt_notify(p, c, net, new, old);
}

static void
rt_notify_basic(struct channel *c, net *net, rte *new, rte *old, int refeed)
{
  // struct proto *p = c->proto;
  rte *new_free = NULL;

  if (new)
    c->stats.exp_updates_received++;
  else
    c->stats.exp_withdraws_received++;

  if (new)
    new = export_filter(c, new, &new_free, 0);

  if (old && !bmap_test(&c->export_map, old->id))
    old = NULL;

  if (!new && !old)
    return;

  do_rt_notify(c, net, new, old, refeed);

  /* Discard temporary rte */
  if (new_free)
    rte_free(new_free);
}

static void
rt_notify_accepted(struct channel *c, net *net, rte *new_changed, rte *old_changed, int refeed)
{
  // struct proto *p = c->proto;
  rte *new_best = NULL;
  rte *old_best = NULL;
  rte *new_free = NULL;
  int new_first = 0;

  /*
   * We assume that there are no changes in net route order except (added)
   * new_changed and (removed) old_changed. Therefore, the function is not
   * compatible with deterministic_med (where nontrivial reordering can happen
   * as a result of a route change) and with recomputation of recursive routes
   * due to next hop update (where many routes can be changed in one step).
   *
   * Note that we need this assumption just for optimizations; we could just
   * run full new_best recomputation otherwise.
   *
   * There are three cases:
   * feed or old_best is old_changed -> we need to recompute new_best
   * old_best is before new_changed -> new_best is old_best, ignore
   * old_best is after new_changed -> try new_changed, otherwise old_best
   */

  if (net->routes)
    c->stats.exp_updates_received++;
  else
    c->stats.exp_withdraws_received++;

  /* Find old_best - either old_changed, or route for net->routes */
  if (old_changed && bmap_test(&c->export_map, old_changed->id))
    old_best = old_changed;
  else
  {
    for (rte *r = net->routes; rte_is_valid(r); r = r->next)
    {
      if (bmap_test(&c->export_map, r->id))
      {
        old_best = r;
        break;
      }

      /* Note if new_changed found before old_best */
      if (r == new_changed)
        new_first = 1;
    }
  }

  /* Find new_best */
  if ((new_changed == old_changed) || (old_best == old_changed))
  {
    /* Feed or old_best changed -> find first accepted by filters */
    for (rte *r = net->routes; rte_is_valid(r); r = r->next)
      if (new_best = export_filter(c, r, &new_free, 0))
        break;
  }
  else
  {
    /* Other cases -> either new_changed, or old_best (and nothing changed) */
    if (new_first && (new_changed = export_filter(c, new_changed, &new_free, 0)))
      new_best = new_changed;
    else
      return;
  }

  if (!new_best && !old_best)
    return;

  do_rt_notify(c, net, new_best, old_best, refeed);

  /* Discard temporary rte */
  if (new_free)
    rte_free(new_free);
}


static struct nexthop *
nexthop_merge_rta(struct nexthop *nhs, rta *a, linpool *pool, int max)
{
  return nexthop_merge(nhs, &(a->nh), 1, 0, max, pool);
}

rte *
rt_export_merged(struct channel *c, net *net, rte **rt_free, linpool *pool, int silent)
{
  // struct proto *p = c->proto;
  struct nexthop *nhs = NULL;
  rte *best0, *best, *rt0, *rt, *tmp;

  best0 = net->routes;
  *rt_free = NULL;

  if (!rte_is_valid(best0))
    return NULL;

  best = export_filter_(c, best0, rt_free, pool, silent);

  if (!best || !rte_is_reachable(best))
    return best;

  for (rt0 = best0->next; rt0; rt0 = rt0->next)
  {
    if (!rte_mergable(best0, rt0))
      continue;

    rt = export_filter_(c, rt0, &tmp, pool, 1);

    if (!rt)
      continue;

    if (rte_is_reachable(rt))
      nhs = nexthop_merge_rta(nhs, rt->attrs, pool, c->merge_limit);

    if (tmp)
      rte_free(tmp);
  }

  if (nhs)
  {
    nhs = nexthop_merge_rta(nhs, best->attrs, pool, c->merge_limit);

    if (nhs->next)
    {
      best = rte_cow_rta(best, pool);
      nexthop_link(best->attrs, nhs);
    }
  }

  if (best != best0)
    *rt_free = best;

  return best;
}


static void
rt_notify_merged(struct channel *c, net *net, rte *new_changed, rte *old_changed,
                 rte *new_best, rte *old_best, int refeed)
{
  // struct proto *p = c->proto;
  rte *new_free = NULL;

  /* We assume that all rte arguments are either NULL or rte_is_valid() */

  /* This check should be done by the caller */
  if (!new_best && !old_best)
    return;

  /* Check whether the change is relevant to the merged route */
  if ((new_best == old_best) &&
      (new_changed != old_changed) &&
      !rte_mergable(new_best, new_changed) &&
      !rte_mergable(old_best, old_changed))
    return;

  if (new_best)
    c->stats.exp_updates_received++;
  else
    c->stats.exp_withdraws_received++;

  /* Prepare new merged route */
  if (new_best)
    new_best = rt_export_merged(c, net, &new_free, rte_update_pool, 0);

  /* Check old merged route */
  if (old_best && !bmap_test(&c->export_map, old_best->id))
    old_best = NULL;

  if (!new_best && !old_best)
    return;

  do_rt_notify(c, net, new_best, old_best, refeed);

  /* Discard temporary rte */
  if (new_free)
    rte_free(new_free);
}


/**
 * rte_announce - announce a routing table change
 * @tab: table the route has been added to
 * @type: type of route announcement (RA_UNDEF or RA_ANY)
 * @net: network in question
 * @new: the new or changed route
 * @old: the previous route replaced by the new one
 * @new_best: the new best route for the same network
 * @old_best: the previous best route for the same network
 *
 * This function gets a routing table update and announces it to all protocols
 * that are connected to the same table by their channels.
 *
 * There are two ways in which routing table changes are announced. First, there
 * is a change of just one route in @net (which may cause a change of the best
 * route of the network). In this case @new and @old describe the changed route,
 * while @new_best and @old_best describe the best routes. Other routes are not
 * affected, but in a sorted table the order of other routes might change.
 *
 * Second, there is a bulk change of multiple routes in @net, with shared best
 * route selection. In such a case separate route changes are described using
 * @type of %RA_ANY, with @new and @old specifying the changed route, while
 * @new_best and @old_best are NULL. After that, another notification is done
 * where @new_best and @old_best are filled (they may be the same), but @new and
 * @old are NULL.
 *
 * The function announces the change to all associated channels. For each
 * channel, an appropriate preprocessing is done according to the channel's
 * &ra_mode. For example, %RA_OPTIMAL channels receive just changes of best
 * routes.
 *
 * In general, we first call the preexport() hook of a protocol, which performs
 * basic checks on the route (each protocol has a right to veto or force accept
 * of the route before any filter is asked). Then we consult an export filter
 * of the channel and verify the old route in an export map of the channel.
 * Finally, the rt_notify() hook of the protocol gets called.
 *
 * Note that there are also calls of rt_notify() hooks due to feed, but that is
 * done outside of the scope of rte_announce().
 */
static void
rte_announce(rtable *tab, uint type, net *net, rte *new, rte *old,
             rte *new_best, rte *old_best)
{
  if (!rte_is_valid(new))
    new = NULL;

  if (!rte_is_valid(old))
    old = NULL;

  if (!rte_is_valid(new_best))
    new_best = NULL;

  if (!rte_is_valid(old_best))
    old_best = NULL;

  if (!new && !old && !new_best && !old_best)
    return;

  if (new_best != old_best)
  {
    if (new_best)
      new_best->sender->stats.pref_routes++;
    if (old_best)
      old_best->sender->stats.pref_routes--;

    if (tab->hostcache)
      rt_notify_hostcache(tab, net);

    if (!EMPTY_LIST(tab->flowspec_links))
      rt_flowspec_notify(tab, net);
  }

  rt_schedule_notify(tab);

  struct channel *c; node *n;
  WALK_LIST2(c, n, tab->channels, table_node)
  {
    if (c->export_state == ES_DOWN)
      continue;

    if (type && (type != c->ra_mode))
      continue;

    switch (c->ra_mode)
    {
    case RA_OPTIMAL:
      if (new_best != old_best)
        rt_notify_basic(c, net, new_best, old_best, 0);
      break;

    case RA_ANY:
      if (new != old)
        rt_notify_basic(c, net, new, old, 0);
      break;

    case RA_ACCEPTED:
      /*
       * The (new != old) condition is problematic here, as it would break
       * the second usage pattern (announcement after bulk change, used in
       * rt_next_hop_update_net(), which sends both new and old as NULL).
       *
       * But recursive next hops do not work with sorted tables anyway;
       * such a configuration is forbidden in BGP and not supported in
       * rt_notify_accepted().
       *
       * The condition is needed to eliminate spurious announcements where
       * both old and new routes are not valid (so they are NULL).
       */
      if (new != old)
        rt_notify_accepted(c, net, new, old, 0);
      break;

    case RA_MERGED:
      rt_notify_merged(c, net, new, old, new_best, old_best, 0);
      break;
    }
  }
}

static inline int
rte_validate(rte *e)
{
  int c;
  net *n = e->net;

  if (!net_validate(n->n.addr))
  {
    log(L_WARN "Ignoring bogus prefix %N received via %s",
        n->n.addr, e->sender->proto->name);
    return 0;
  }

  /* FIXME: better handling of different nettypes */
  c = !net_is_flow(n->n.addr) ?
    net_classify(n->n.addr) : (IADDR_HOST | SCOPE_UNIVERSE);
  if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
  {
    log(L_WARN "Ignoring bogus route %N received via %s",
        n->n.addr, e->sender->proto->name);
    return 0;
  }

  if (net_type_match(n->n.addr, NB_DEST) == !e->attrs->dest)
  {
    /* Exception for flowspec that failed validation */
    if (net_is_flow(n->n.addr) && (e->attrs->dest == RTD_UNREACHABLE))
      return 1;

    log(L_WARN "Ignoring route %N with invalid dest %d received via %s",
        n->n.addr, e->attrs->dest, e->sender->proto->name);
    return 0;
  }

  if ((e->attrs->dest == RTD_UNICAST) && !nexthop_is_sorted(&(e->attrs->nh)))
  {
    log(L_WARN "Ignoring unsorted multipath route %N received via %s",
        n->n.addr, e->sender->proto->name);
    return 0;
  }

  return 1;
}

/**
 * rte_free - delete a &rte
 * @e: &rte to be deleted
 *
 * rte_free() deletes the given &rte from the routing table it's linked to.
 */
void
rte_free(rte *e)
{
  rt_unlock_source(e->src);
  if (rta_is_cached(e->attrs))
    rta_free(e->attrs);
  sl_free(e);
}

static inline void
rte_free_quick(rte *e)
{
  rt_unlock_source(e->src);
  rta_free(e->attrs);
  sl_free(e);
}

static int
rte_same(rte *x, rte *y)
{
  /* rte.flags / rte.pflags are not checked, as they are internal to rtable */
  return
    x->attrs == y->attrs &&
    x->src == y->src &&
    rte_is_filtered(x) == rte_is_filtered(y);
}

static inline int rte_is_ok(rte *e) { return e && !rte_is_filtered(e); }

static void
rte_recalculate(struct channel *c, net *net, rte *new, struct rte_src *src)
{
  struct proto *p = c->proto;
  struct rtable *table = c->table;
  struct proto_stats *stats = &c->stats;
  static struct tbf rl_pipe = TBF_DEFAULT_LOG_LIMITS;
  rte *before_old = NULL;
  rte *old_best = net->routes;
  rte *old = NULL;
  rte **k;

  k = &net->routes; /* Find and remove original route from the same protocol */
  while (old = *k)
  {
    if (old->src == src)
    {
      /* If there is the same route in the routing table but from
       * a different sender, then there are two paths from the
       * source protocol to this routing table through transparent
       * pipes, which is not allowed.
       *
       * We log that and ignore the route. If it is a withdraw, we
       * ignore it completely (there might be 'spurious withdraws',
       * see FIXME in do_rte_announce())
       */
      if (old->sender->proto != p)
      {
        if (new)
        {
          log_rl(&rl_pipe, L_ERR "Pipe collision detected when sending %N to table %s",
                 net->n.addr, table->name);
          rte_free_quick(new);
        }
        return;
      }

      if (new && rte_same(old, new))
      {
        /* No changes, ignore the new route and refresh the old one */

        old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);

        if (!rte_is_filtered(new))
        {
          stats->imp_updates_ignored++;
          rte_trace_in(D_ROUTES, c, new, "ignored");
        }

        rte_free_quick(new);
        return;
      }
      *k = old->next;
      table->rt_count--;
      break;
    }
    k = &old->next;
    before_old = old;
  }

  /* Save the last accessed position */
  rte **pos = k;

  if (!old)
    before_old = NULL;

  if (!old && !new)
  {
    stats->imp_withdraws_ignored++;
    return;
  }

  int new_ok = rte_is_ok(new);
  int old_ok = rte_is_ok(old);

  struct channel_limit *l = &c->rx_limit;
  if (l->action && !old && new && !c->in_table)
  {
    u32 all_routes = stats->imp_routes + stats->filt_routes;

    if (all_routes >= l->limit)
      channel_notify_limit(c, l, PLD_RX, all_routes);

    if (l->state == PLS_BLOCKED)
    {
      /* With the receive limit, the situation is simple: old is NULL, so
         we just free new and exit as if nothing happened */

      stats->imp_updates_ignored++;
      rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
      rte_free_quick(new);
      return;
    }
  }

  l = &c->in_limit;
  if (l->action && !old_ok && new_ok)
  {
    if (stats->imp_routes >= l->limit)
      channel_notify_limit(c, l, PLD_IN, stats->imp_routes);

    if (l->state == PLS_BLOCKED)
    {
      /* With the import limit, the situation is more complicated. We
         shouldn't just drop the route; we should handle it as if it
         was filtered. We also have to continue route processing if
         old or new is non-NULL, but we should exit if both are NULL,
         as this case is probably assumed to be already handled. */

      stats->imp_updates_ignored++;
      rte_trace_in(D_FILTERS, c, new, "ignored [limit]");

      if (c->in_keep_filtered)
        new->flags |= REF_FILTERED;
      else
        { rte_free_quick(new); new = NULL; }

      /* Note that old && !new could be possible when
         c->in_keep_filtered changed in the recent past. */

      if (!old && !new)
        return;

      new_ok = 0;
      goto skip_stats1;
    }
  }

  if (new_ok)
    stats->imp_updates_accepted++;
  else if (old_ok)
    stats->imp_withdraws_accepted++;
  else
    stats->imp_withdraws_ignored++;

  if (old_ok || new_ok)
    table->last_rt_change = current_time();

 skip_stats1:

  if (new)
    rte_is_filtered(new) ? stats->filt_routes++ : stats->imp_routes++;
  if (old)
    rte_is_filtered(old) ? stats->filt_routes-- : stats->imp_routes--;

  if (table->config->sorted)
  {
    /* If routes are sorted, just insert the new route at the appropriate position */
    if (new)
    {
      if (before_old && !rte_better(new, before_old))
        k = &before_old->next;
      else
        k = &net->routes;

      for (; *k; k = &(*k)->next)
        if (rte_better(new, *k))
          break;

      new->next = *k;
      *k = new;

      table->rt_count++;
    }
  }
  else
  {
    /* If routes are not sorted, find the best route and move it to
       the first position. There are several optimized cases. */

    if (src->proto->rte_recalculate && src->proto->rte_recalculate(table, net, new, old, old_best))
      goto do_recalculate;

    if (new && rte_better(new, old_best))
    {
      /* The first case - the new route is clearly optimal,
         we link it at the first position */

      new->next = net->routes;
      net->routes = new;

      table->rt_count++;
    }
    else if (old == old_best)
    {
      /* The second case - the old best route disappeared, we add the
         new route (if we have any) to the list (we don't care about
         position) and then we elect the new optimal route and relink
         that route at the first position and announce it. The new optimal
         route might be NULL if there are no more routes */

    do_recalculate:
      /* Add the new route to the list */
      if (new)
      {
        new->next = *pos;
        *pos = new;

        table->rt_count++;
      }

      /* Find a new optimal route (if there is any) */
      if (net->routes)
      {
        rte **bp = &net->routes;
        for (k = &(*bp)->next; *k; k = &(*k)->next)
          if (rte_better(*k, *bp))
            bp = k;

        /* And relink it */
        rte *best = *bp;
        *bp = best->next;
        best->next = net->routes;
        net->routes = best;
      }
    }
    else if (new)
    {
      /* The third case - the new route is not better than the old
         best route (therefore old_best != NULL) and the old best
         route was not removed (therefore old_best == net->routes).
         We just link the new route to the old/last position. */

      new->next = *pos;
      *pos = new;

      table->rt_count++;
    }
    /* The fourth (empty) case - a suboptimal route was removed, nothing to do */
  }

  if (new)
  {
    new->lastmod = current_time();

    if (!old)
    {
      new->id = hmap_first_zero(&table->id_map);
      hmap_set(&table->id_map, new->id);
    }
    else
      new->id = old->id;
  }

  /* Log the route change */
  if ((c->debug & D_ROUTES) || (p->debug & D_ROUTES))
  {
    if (new_ok)
      rte_trace(c, new, '>', new == net->routes ? "added [best]" : "added");
    else if (old_ok)
    {
      if (old != old_best)
        rte_trace(c, old, '>', "removed");
      else if (rte_is_ok(net->routes))
        rte_trace(c, old, '>', "removed [replaced]");
      else
        rte_trace(c, old, '>', "removed [sole]");
    }
  }

  /* Propagate the route change */
  rte_announce(table, RA_UNDEF, net, new, old, net->routes, old_best);

  if (!net->routes &&
      (table->gc_counter++ >= table->config->gc_threshold))
    rt_kick_prune_timer(table);

  if (old_ok && p->rte_remove)
    p->rte_remove(net, old);
  if (new_ok && p->rte_insert)
    p->rte_insert(net, new);

  if (old)
  {
    if (!new)
      hmap_clear(&table->id_map, old->id);

    rte_free_quick(old);
  }
}

static int rte_update_nest_cnt; /* Nesting counter to allow recursive updates */

static inline void
rte_update_lock(void)
{
  rte_update_nest_cnt++;
}

static inline void
rte_update_unlock(void)
{
  if (!--rte_update_nest_cnt)
    lp_flush(rte_update_pool);
}

/**
 * rte_update - enter a new update to a routing table
 * @table: table to be updated
 * @c: channel doing the update
 * @net: network node
 * @p: protocol submitting the update
 * @src: protocol originating the update
 * @new: a &rte representing the new route or %NULL for route removal.
 *
 * This function is called by the routing protocols whenever they discover
 * a new route or wish to update/remove an existing route. The right announcement
 * sequence is to build route attributes first (either un-cached with @aflags set
 * to zero or a cached one using rta_lookup(); in that case please note that
 * you need to increase the use count of the attributes yourself by calling
 * rta_clone()), call rte_get_temp() to obtain a temporary &rte, fill in all
 * the appropriate data and finally submit the new &rte by calling rte_update().
 *
 * @src specifies the protocol that originally created the route and the meaning
 * of protocol-dependent data of @new. If @new is not %NULL, @src has to be the
 * same value as @new->attrs->proto. @p specifies the protocol that called
 * rte_update(). In most cases it is the same protocol as @src. rte_update()
 * stores @p in @new->sender.
 *
 * When rte_update() gets any route, it automatically validates it (checks
 * whether the network and next hop address are valid IP addresses and also
 * whether a normal routing protocol doesn't try to smuggle a host or link
 * scope route to the table), converts all protocol dependent attributes stored
 * in the &rte to temporary extended attributes, consults import filters of the
 * protocol to see if the route should be accepted and/or its attributes modified,
 * and stores the temporary attributes back to the &rte.
 *
 * Now, having a "public" version of the route, we
 * automatically find any old route defined by the protocol @src
 * for network @n, replace it with the new one (or remove it if @new is %NULL),
 * recalculate the optimal route for this destination and finally broadcast
 * the change (if any) to all routing protocols by calling rte_announce().
 *
 * All memory used for attribute lists and other temporary allocations is taken
 * from a special linear pool @rte_update_pool and freed when rte_update()
 * finishes.
 */
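
/*
 * A condensed sketch of the announcement sequence described above
 * (hypothetical helper, not part of BIRD): build attributes, obtain a
 * temporary &rte and submit it.
 */
#if 0
static void
announce_one_route(struct channel *c, const net_addr *n,
                   struct rte_src *src, rta *a0)
{
  rta *a = rta_lookup(a0);        /* get a cached copy of the attributes */
  rte *e = rte_get_temp(a, src);  /* temporary rte bound to them */
  rte_update2(c, n, e, src);      /* insert/replace; a NULL rte would withdraw */
}
#endif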

void
rte_update2(struct channel *c, const net_addr *n, rte *new, struct rte_src *src)
{
  // struct proto *p = c->proto;
  struct proto_stats *stats = &c->stats;
  const struct filter *filter = c->in_filter;
  net *nn;

  ASSERT(c->channel_state == CS_UP);

  rte_update_lock();
  if (new)
  {
    /* Create a temporary table node */
    nn = alloca(sizeof(net) + n->length);
    memset(nn, 0, sizeof(net) + n->length);
    net_copy(nn->n.addr, n);

    new->net = nn;
    new->sender = c;

    stats->imp_updates_received++;
    if (!rte_validate(new))
    {
      rte_trace_in(D_FILTERS, c, new, "invalid");
      stats->imp_updates_invalid++;
      goto drop;
    }

    if (filter == FILTER_REJECT)
    {
      stats->imp_updates_filtered++;
      rte_trace_in(D_FILTERS, c, new, "filtered out");

      if (!c->in_keep_filtered)
        goto drop;

      /* new is a private copy, we can modify it */
      new->flags |= REF_FILTERED;
    }
    else if (filter)
    {
      int fr = f_run(filter, &new, rte_update_pool, 0);
      if (fr > F_ACCEPT)
      {
        stats->imp_updates_filtered++;
        rte_trace_in(D_FILTERS, c, new, "filtered out");

        if (!c->in_keep_filtered)
          goto drop;

        new->flags |= REF_FILTERED;
      }
    }
    if (!rta_is_cached(new->attrs)) /* Need to copy attributes */
      new->attrs = rta_lookup(new->attrs);
    new->flags |= REF_COW;

    /* Use the actual struct network, not the dummy one */
    nn = net_get(c->table, n);
    new->net = nn;
  }
  else
  {
    stats->imp_withdraws_received++;

    if (!(nn = net_find(c->table, n)) || !src)
    {
      stats->imp_withdraws_ignored++;
      rte_update_unlock();
      return;
    }
  }

 recalc:
  /* And recalculate the best route */
  rte_recalculate(c, nn, new, src);

  rte_update_unlock();
  return;

 drop:
  rte_free(new);
  new = NULL;
  if (nn = net_find(c->table, n))
    goto recalc;

  rte_update_unlock();
}

/* Independent call to rte_announce(), used from next hop
   recalculation, outside of rte_update(); @new must be non-NULL */
static inline void
rte_announce_i(rtable *tab, uint type, net *net, rte *new, rte *old,
               rte *new_best, rte *old_best)
{
  rte_update_lock();
  rte_announce(tab, type, net, new, old, new_best, old_best);
  rte_update_unlock();
}

static inline void
rte_discard(rte *old) /* Non-filtered route deletion, used during garbage collection */
{
  rte_update_lock();
  rte_recalculate(old->sender, old->net, NULL, old->src);
  rte_update_unlock();
}

/* Modify an existing route by protocol hook, used for long-lived graceful restart */
static inline void
rte_modify(rte *old)
{
  rte_update_lock();

  rte *new = old->sender->proto->rte_modify(old, rte_update_pool);
  if (new != old)
  {
    if (new)
    {
      if (!rta_is_cached(new->attrs))
        new->attrs = rta_lookup(new->attrs);
      new->flags = (old->flags & ~REF_MODIFY) | REF_COW;
    }

    rte_recalculate(old->sender, old->net, new, old->src);
  }

  rte_update_unlock();
}

/* Check the rtable for the best route to a given net, i.e. whether it would be exported to p */
int
rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter)
{
  struct proto *p = c->proto;
  net *n = net_find(t, a);
  rte *rt = n ? n->routes : NULL;

  if (!rte_is_valid(rt))
    return 0;

  rte_update_lock();

  /* Rest is stripped down export_filter() */
  int v = p->preexport ? p->preexport(c, rt) : 0;
  if (v == RIC_PROCESS)
    v = (f_run(filter, &rt, rte_update_pool, FF_SILENT) <= F_ACCEPT);

  /* Discard temporary rte */
  if (rt != n->routes)
    rte_free(rt);

  rte_update_unlock();

  return v > 0;
}


/**
 * rt_refresh_begin - start a refresh cycle
 * @t: related routing table
 * @c: related channel
 *
 * This function starts a refresh cycle for the given routing table and announce
 * hook. The refresh cycle is a sequence where the protocol sends all its valid
 * routes to the routing table (by rte_update()). After that, all protocol
 * routes (more precisely routes with @c as @sender) not sent during the
 * refresh cycle but still in the table from the past are pruned. This is
 * implemented by marking all related routes as stale by the REF_STALE flag in
 * rt_refresh_begin(), then marking all related stale routes with the REF_DISCARD
 * flag in rt_refresh_end() and then removing such routes in the prune loop.
 */
void
rt_refresh_begin(rtable *t, struct channel *c)
{
  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for (e = n->routes; e; e = e->next)
      if (e->sender == c)
        e->flags |= REF_STALE;
  }
  FIB_WALK_END;
}

/**
 * rt_refresh_end - end a refresh cycle
 * @t: related routing table
 * @c: related channel
 *
 * This function ends a refresh cycle for the given routing table and announce
 * hook. See rt_refresh_begin() for a description of refresh cycles.
 */
void
rt_refresh_end(rtable *t, struct channel *c)
{
  int prune = 0;

  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for (e = n->routes; e; e = e->next)
      if ((e->sender == c) && (e->flags & REF_STALE))
      {
        e->flags |= REF_DISCARD;
        prune = 1;
      }
  }
  FIB_WALK_END;

  if (prune)
    rt_schedule_prune(t);
}
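
/*
 * The refresh cycle from a protocol's perspective (hypothetical sketch,
 * not actual BIRD code): mark everything stale, resend the valid routes,
 * discard the leftovers. Resending a route via rte_update2() clears its
 * REF_STALE flag, so only routes not seen during the cycle get
 * REF_DISCARD in rt_refresh_end().
 */
#if 0
static void
resend_all_routes(struct channel *c)
{
  rt_refresh_begin(c->table, c);  /* mark this channel's routes REF_STALE */
  /* ... call rte_update2() here for every route the protocol still has ... */
  rt_refresh_end(c->table, c);    /* stale leftovers get REF_DISCARD */
}
#endif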

void
rt_modify_stale(rtable *t, struct channel *c)
{
  int prune = 0;

  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for (e = n->routes; e; e = e->next)
      if ((e->sender == c) && (e->flags & REF_STALE) && !(e->flags & REF_FILTERED))
      {
        e->flags |= REF_MODIFY;
        prune = 1;
      }
  }
  FIB_WALK_END;

  if (prune)
    rt_schedule_prune(t);
}

/**
 * rte_dump - dump a route
 * @e: &rte to be dumped
 *
 * This function dumps the contents of a &rte to the debug output.
 */
void
rte_dump(rte *e)
{
  net *n = e->net;
  debug("%-1N ", n->n.addr);
  debug("PF=%02x ", e->pflags);
  rta_dump(e->attrs);
  debug("\n");
}

/**
 * rt_dump - dump a routing table
 * @t: routing table to be dumped
 *
 * This function dumps the contents of a given routing table to the debug output.
 */
void
rt_dump(rtable *t)
{
  debug("Dump of routing table <%s>\n", t->name);
#ifdef DEBUGGING
  fib_check(&t->fib);
#endif
  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for (e = n->routes; e; e = e->next)
      rte_dump(e);
  }
  FIB_WALK_END;
  debug("\n");
}

/**
 * rt_dump_all - dump all routing tables
 *
 * This function dumps the contents of all routing tables to the debug output.
 */
void
rt_dump_all(void)
{
  rtable *t;
  node *n;

  WALK_LIST2(t, n, routing_tables, n)
    rt_dump(t);
}

static inline void
rt_schedule_hcu(rtable *tab)
{
  if (tab->hcu_scheduled)
    return;

  tab->hcu_scheduled = 1;
  ev_schedule(tab->rt_event);
}

static inline void
rt_schedule_nhu(rtable *tab)
{
  if (tab->nhu_state == NHU_CLEAN)
    ev_schedule(tab->rt_event);

  /* state change:
   *   NHU_CLEAN   -> NHU_SCHEDULED
   *   NHU_RUNNING -> NHU_DIRTY
   */
  tab->nhu_state |= NHU_SCHEDULED;
}

void
rt_schedule_prune(rtable *tab)
{
  if (tab->prune_state == 0)
    ev_schedule(tab->rt_event);

  /* state change 0->1, 2->3 */
  tab->prune_state |= 1;
}


static void
rt_event(void *ptr)
{
  rtable *tab = ptr;

  rt_lock_table(tab);

  if (tab->hcu_scheduled)
    rt_update_hostcache(tab);

  if (tab->nhu_state)
    rt_next_hop_update(tab);

  if (tab->prune_state)
    rt_prune_table(tab);

  rt_unlock_table(tab);
}


static void
rt_prune_timer(timer *t)
{
  rtable *tab = t->data;

  if (tab->gc_counter >= tab->config->gc_threshold)
    rt_schedule_prune(tab);
}

static void
rt_kick_prune_timer(rtable *tab)
{
  /* Return if prune is already scheduled */
  if (tm_active(tab->prune_timer) || (tab->prune_state & 1))
    return;

  /* Randomize GC period to +/- 50% */
  btime gc_period = tab->config->gc_period;
  gc_period = (gc_period / 2) + (random_u32() % (uint) gc_period);
  tm_start(tab->prune_timer, gc_period);
}
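
/*
 * For example, with a configured gc_period of 10 s, the expression above
 * yields a delay uniformly distributed in [5 s, 15 s), i.e. the period
 * randomized to +/- 50%.
 */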


static inline btime
rt_settled_time(rtable *tab)
{
  ASSUME(tab->base_settle_time != 0);

  return MIN(tab->last_rt_change + tab->config->min_settle_time,
             tab->base_settle_time + tab->config->max_settle_time);
}
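
/*
 * Worked example (the values are assumptions, not defaults defined here):
 * with min_settle_time = 1 s and max_settle_time = 20 s, the table counts
 * as settled 1 s after the last route change, but no later than 20 s
 * after the first change since the previous settled state.
 */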
1935
1936 static void
1937 rt_settle_timer(timer *t)
1938 {
1939 rtable *tab = t->data;
1940
1941 if (!tab->base_settle_time)
1942 return;
1943
1944 btime settled_time = rt_settled_time(tab);
1945 if (current_time() < settled_time)
1946 {
1947 tm_set(tab->settle_timer, settled_time);
1948 return;
1949 }
1950
1951 /* Settled */
1952 tab->base_settle_time = 0;
1953
1954 struct rt_subscription *s;
1955 WALK_LIST(s, tab->subscribers)
1956 s->hook(s);
1957 }
1958
1959 static void
1960 rt_kick_settle_timer(rtable *tab)
1961 {
1962 tab->base_settle_time = current_time();
1963
1964 if (!tab->settle_timer)
1965 tab->settle_timer = tm_new_init(tab->rp, rt_settle_timer, tab, 0, 0);
1966
1967 if (!tm_active(tab->settle_timer))
1968 tm_set(tab->settle_timer, rt_settled_time(tab));
1969 }
1970
1971 static inline void
1972 rt_schedule_notify(rtable *tab)
1973 {
1974 if (EMPTY_LIST(tab->subscribers))
1975 return;
1976
1977 if (tab->base_settle_time)
1978 return;
1979
1980 rt_kick_settle_timer(tab);
1981 }
1982
1983 void
1984 rt_subscribe(rtable *tab, struct rt_subscription *s)
1985 {
1986 s->tab = tab;
1987 rt_lock_table(tab);
1988 add_tail(&tab->subscribers, &s->n);
1989 }
1990
1991 void
1992 rt_unsubscribe(struct rt_subscription *s)
1993 {
1994 rem_node(&s->n);
1995 rt_unlock_table(s->tab);
1996 }
1997
1998 static struct rt_flowspec_link *
1999 rt_flowspec_find_link(rtable *src, rtable *dst)
2000 {
2001 struct rt_flowspec_link *ln;
2002 WALK_LIST(ln, src->flowspec_links)
2003 if ((ln->src == src) && (ln->dst == dst))
2004 return ln;
2005
2006 return NULL;
2007 }
2008
2009 void
2010 rt_flowspec_link(rtable *src, rtable *dst)
2011 {
2012 ASSERT(rt_is_ip(src));
2013 ASSERT(rt_is_flow(dst));
2014
2015 struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst);
2016
2017 if (!ln)
2018 {
2019 rt_lock_table(src);
2020 rt_lock_table(dst);
2021
2022 ln = mb_allocz(src->rp, sizeof(struct rt_flowspec_link));
2023 ln->src = src;
2024 ln->dst = dst;
2025 add_tail(&src->flowspec_links, &ln->n);
2026 }
2027
2028 ln->uc++;
2029 }
2030
2031 void
2032 rt_flowspec_unlink(rtable *src, rtable *dst)
2033 {
2034 struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst);
2035
2036 ASSERT(ln && (ln->uc > 0));
2037
2038 ln->uc--;
2039
2040 if (!ln->uc)
2041 {
2042 rem_node(&ln->n);
2043 mb_free(ln);
2044
2045 rt_unlock_table(src);
2046 rt_unlock_table(dst);
2047 }
2048 }
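/*
 * The link is reference-counted: a repeated rt_flowspec_link() for the
 * same (src, dst) pair just increments ln->uc, and the link (together
 * with the table locks it holds) is freed only when the last
 * rt_flowspec_unlink() drops the count back to zero.
 */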
2049
2050 static void
2051 rt_flowspec_notify(rtable *src, net *net)
2052 {
2053 /* Only IP tables can be flowspec link sources */
2054 ASSERT(rt_is_ip(src));
2055
2056 struct rt_flowspec_link *ln;
2057 WALK_LIST(ln, src->flowspec_links)
2058 {
2059 rtable *dst = ln->dst;
2060 ASSERT(rt_is_flow(dst));
2061
2062 /* No need to inspect it further if recalculation is already active */
2063 if ((dst->nhu_state == NHU_SCHEDULED) || (dst->nhu_state == NHU_DIRTY))
2064 continue;
2065
2066 if (trie_match_net(dst->flowspec_trie, net->n.addr))
2067 rt_schedule_nhu(dst);
2068 }
2069 }
2070
2071 static void
2072 rt_flowspec_reset_trie(rtable *tab)
2073 {
2074 linpool *lp = tab->flowspec_trie->lp;
2075 int ipv4 = tab->flowspec_trie->ipv4;
2076
2077 lp_flush(lp);
2078 tab->flowspec_trie = f_new_trie(lp, 0);
2079 tab->flowspec_trie->ipv4 = ipv4;
2080 }
2081
2082 static void
2083 rt_free(resource *_r)
2084 {
2085 rtable *r = (rtable *) _r;
2086
2087 DBG("Deleting routing table %s\n", r->name);
2088 ASSERT_DIE(r->use_count == 0);
2089
2090 if (r->internal)
2091 return;
2092
2093 r->config->table = NULL;
2094 rem_node(&r->n);
2095
2096 if (r->hostcache)
2097 rt_free_hostcache(r);
2098
2099 /* Freed automagically by the resource pool
2100 fib_free(&r->fib);
2101 hmap_free(&r->id_map);
2102 rfree(r->rt_event);
2103 rfree(r->settle_timer);
2104 mb_free(r);
2105 */
2106 }
2107
2108 static void
2109 rt_res_dump(resource *_r)
2110 {
2111 rtable *r = (rtable *) _r;
2112 debug("name \"%s\", addr_type=%s, rt_count=%u, use_count=%d\n",
2113 r->name, net_label[r->addr_type], r->rt_count, r->use_count);
2114 }
2115
2116 static struct resclass rt_class = {
2117 .name = "Routing table",
2118 .size = sizeof(struct rtable),
2119 .free = rt_free,
2120 .dump = rt_res_dump,
2121 .lookup = NULL,
2122 .memsize = NULL,
2123 };
2124
2125 rtable *
2126 rt_setup(pool *pp, struct rtable_config *cf)
2127 {
2128 pool *p = rp_newf(pp, "Routing table %s", cf->name);
2129
2130 rtable *t = ralloc(p, &rt_class);
2131 t->rp = p;
2132
2133 t->name = cf->name;
2134 t->config = cf;
2135 t->addr_type = cf->addr_type;
2136
2137 fib_init(&t->fib, p, t->addr_type, sizeof(net), OFFSETOF(net, n), 0, NULL);
2138
2139 if (cf->trie_used)
2140 {
2141 t->trie = f_new_trie(lp_new_default(p), 0);
2142 t->trie->ipv4 = net_val_match(t->addr_type, NB_IP4 | NB_VPN4 | NB_ROA4);
2143
2144 t->fib.init = net_init_with_trie;
2145 }
2146
2147 init_list(&t->channels);
2148 init_list(&t->flowspec_links);
2149 init_list(&t->subscribers);
2150
2151 if (!(t->internal = cf->internal))
2152 {
2153 hmap_init(&t->id_map, p, 1024);
2154 hmap_set(&t->id_map, 0);
2155
2156 t->rt_event = ev_new_init(p, rt_event, t);
2157 t->prune_timer = tm_new_init(p, rt_prune_timer, t, 0, 0);
2158 t->last_rt_change = t->gc_time = current_time();
2159
2160 if (rt_is_flow(t))
2161 {
2162 t->flowspec_trie = f_new_trie(lp_new_default(p), 0);
2163 t->flowspec_trie->ipv4 = (t->addr_type == NET_FLOW4);
2164 }
2165 }
2166
2167 return t;
2168 }
2169
2170 /**
2171 * rt_init - initialize routing tables
2172 *
2173 * This function is called during BIRD startup. It initializes the
2174 * routing table module.
2175 */
2176 void
2177 rt_init(void)
2178 {
2179 rta_init();
2180 rt_table_pool = rp_new(&root_pool, "Routing tables");
2181 rte_update_pool = lp_new_default(rt_table_pool);
2182 rte_slab = sl_new(rt_table_pool, sizeof(rte));
2183 init_list(&routing_tables);
2184 }
2185
2186
2187 /**
2188 * rt_prune_table - prune a routing table
2189 *
2190 * The prune loop scans routing tables and removes routes belonging to flushing
2191 * protocols, discarded routes and also stale network entries. It is called from
2192 * rt_event(). The event is rescheduled if the current iteration does not finish
2193 * the table. The pruning is directed by the prune state (@prune_state),
2194 * specifying whether the prune cycle is scheduled or running, and there
2195 * is also a persistent pruning iterator (@prune_fit).
2196 *
2197 * The prune loop is also used for channel flushing. For this purpose, the
2198 * channels to flush are marked before the iteration and notified after the
2199 * iteration.
2200 */
2201 static void
2202 rt_prune_table(rtable *tab)
2203 {
2204 struct fib_iterator *fit = &tab->prune_fit;
2205 int limit = 2000;
2206
2207 struct channel *c;
2208 node *n, *x;
2209
2210 DBG("Pruning route table %s\n", tab->name);
2211 #ifdef DEBUGGING
2212 fib_check(&tab->fib);
2213 #endif
2214
2215 if (tab->prune_state == 0)
2216 return;
2217
2218 if (tab->prune_state == 1)
2219 {
2220 /* Mark channels to flush */
2221 WALK_LIST2(c, n, tab->channels, table_node)
2222 if (c->channel_state == CS_FLUSHING)
2223 c->flush_active = 1;
2224
2225 FIB_ITERATE_INIT(fit, &tab->fib);
2226 tab->prune_state = 2;
2227
2228 tab->gc_counter = 0;
2229 tab->gc_time = current_time();
2230
2231 if (tab->prune_trie)
2232 {
2233 /* Init prefix trie pruning */
2234 tab->trie_new = f_new_trie(lp_new_default(tab->rp), 0);
2235 tab->trie_new->ipv4 = tab->trie->ipv4;
2236 }
2237 }
2238
2239 again:
2240 FIB_ITERATE_START(&tab->fib, fit, net, n)
2241 {
2242 rte *e;
2243
2244 rescan:
2245 if (limit <= 0)
2246 {
2247 FIB_ITERATE_PUT(fit);
2248 ev_schedule(tab->rt_event);
2249 return;
2250 }
2251
2252 for (e=n->routes; e; e=e->next)
2253 {
2254 if (e->sender->flush_active || (e->flags & REF_DISCARD))
2255 {
2256 rte_discard(e);
2257 limit--;
2258
2259 goto rescan;
2260 }
2261
2262 if (e->flags & REF_MODIFY)
2263 {
2264 rte_modify(e);
2265 limit--;
2266
2267 goto rescan;
2268 }
2269 }
2270
2271 if (!n->routes) /* Orphaned FIB entry */
2272 {
2273 FIB_ITERATE_PUT(fit);
2274 fib_delete(&tab->fib, n);
2275 goto again;
2276 }
2277
2278 if (tab->trie_new)
2279 {
2280 trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen);
2281 limit--;
2282 }
2283 }
2284 FIB_ITERATE_END;
2285
2286 #ifdef DEBUGGING
2287 fib_check(&tab->fib);
2288 #endif
2289
2290 /* state change 2->0, 3->1 */
2291 tab->prune_state &= 1;
2292
2293 if (tab->trie_new)
2294 {
2295 /* Finish prefix trie pruning */
2296
2297 if (!tab->trie_lock_count)
2298 {
2299 rfree(tab->trie->lp);
2300 }
2301 else
2302 {
2303 ASSERT(!tab->trie_old);
2304 tab->trie_old = tab->trie;
2305 tab->trie_old_lock_count = tab->trie_lock_count;
2306 tab->trie_lock_count = 0;
2307 }
2308
2309 tab->trie = tab->trie_new;
2310 tab->trie_new = NULL;
2311 tab->prune_trie = 0;
2312 }
2313 else
2314 {
2315 /* Schedule prefix trie pruning */
2316 if (tab->trie && !tab->trie_old && (tab->trie->prefix_count > (2 * tab->fib.entries)))
2317 {
2318 /* state change 0->1, 2->3 */
2319 tab->prune_state |= 1;
2320 tab->prune_trie = 1;
2321 }
2322 }
2323
2324 if (tab->prune_state > 0)
2325 ev_schedule(tab->rt_event);
2326
2327 /* FIXME: This should be handled in a better way */
2328 rt_prune_sources();
2329
2330 /* Close flushed channels */
2331 WALK_LIST2_DELSAFE(c, n, x, tab->channels, table_node)
2332 if (c->flush_active)
2333 {
2334 c->flush_active = 0;
2335 channel_set_state(c, CS_DOWN);
2336 }
2337
2338 return;
2339 }
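/*
 * Summary sketch of the @prune_state encoding used above: bit 0 means
 * "prune scheduled", bit 1 means "prune running":
 *
 *   0 .. idle
 *   1 .. scheduled        (rt_schedule_prune() does |= 1, i.e. 0->1, 2->3)
 *   2 .. running          (set when the iteration is initialized)
 *   3 .. running, rescheduled
 *
 * The final 'prune_state &= 1' therefore maps 2->0 (done) and 3->1
 * (another prune cycle starts from the rescheduled event).
 */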
2340
2341 /**
2342 * rt_lock_trie - lock a prefix trie of a routing table
2343 * @tab: routing table with prefix trie to be locked
2344 *
2345 * The prune loop may rebuild the prefix trie and invalidate f_trie_walk_state
2346 * structures. Therefore, asynchronous walks should lock the prefix trie using
2347 * this function. That allows the prune loop to rebuild the trie, but postpones
2348 * its freeing until all walks are done (unlocked by rt_unlock_trie()).
2349 *
2350 * Returns the current trie that gets locked; the value should be passed back to
2351 * rt_unlock_trie() for unlocking.
2352 *
2353 */
2354 struct f_trie *
2355 rt_lock_trie(rtable *tab)
2356 {
2357 ASSERT(tab->trie);
2358
2359 tab->trie_lock_count++;
2360 return tab->trie;
2361 }
2362
2363 /**
2364 * rt_unlock_trie - unlock a prefix trie of a routing table
2365 * @tab: routing table with prefix trie to be unlocked
2366 * @trie: value returned by matching rt_lock_trie()
2367 *
2368 * Called for a trie locked by rt_lock_trie() after the walk over the trie is done.
2369 * It may free the trie and schedule next trie pruning.
2370 */
2371 void
2372 rt_unlock_trie(rtable *tab, struct f_trie *trie)
2373 {
2374 ASSERT(trie);
2375
2376 if (trie == tab->trie)
2377 {
2378 /* Unlock the current prefix trie */
2379 ASSERT(tab->trie_lock_count);
2380 tab->trie_lock_count--;
2381 }
2382 else if (trie == tab->trie_old)
2383 {
2384 /* Unlock the old prefix trie */
2385 ASSERT(tab->trie_old_lock_count);
2386 tab->trie_old_lock_count--;
2387
2388 /* Free old prefix trie that is no longer needed */
2389 if (!tab->trie_old_lock_count)
2390 {
2391 rfree(tab->trie_old->lp);
2392 tab->trie_old = NULL;
2393
2394 /* Kick prefix trie pruning that was postponed */
2395 if (tab->trie && (tab->trie->prefix_count > (2 * tab->fib.entries)))
2396 {
2397 tab->prune_trie = 1;
2398 rt_schedule_prune(tab);
2399 }
2400 }
2401 }
2402 else
2403 log(L_BUG "Invalid arg to rt_unlock_trie()");
2404 }
2405
2406
2407 void
2408 rt_preconfig(struct config *c)
2409 {
2410 init_list(&c->tables);
2411
2412 rt_new_table(cf_get_symbol("master4"), NET_IP4);
2413 rt_new_table(cf_get_symbol("master6"), NET_IP6);
2414 }
2415
2416 void
2417 rt_postconfig(struct config *c)
2418 {
2419 uint num_tables = list_length(&c->tables);
2420 btime def_gc_period = 400 MS * num_tables;
2421 def_gc_period = MAX(def_gc_period, 10 S);
2422 def_gc_period = MIN(def_gc_period, 600 S);
2423
2424 struct rtable_config *rc;
2425 WALK_LIST(rc, c->tables)
2426 if (rc->gc_period == (uint) -1)
2427 rc->gc_period = (uint) def_gc_period;
2428 }
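/*
 * Worked example: a configuration with 50 tables gets a default gc period
 * of MIN(MAX(400 MS * 50, 10 S), 600 S) = 20 s, while small configurations
 * are clamped to the 10 s minimum and huge ones to the 600 s maximum.
 */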
2429
2430
2431 /*
2432 * Some functions for handing internal next hop updates
2433 * triggered by rt_schedule_nhu().
2434 */
2435
2436 void
2437 rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls)
2438 {
2439 a->hostentry = he;
2440 a->dest = he->dest;
2441 a->igp_metric = he->igp_metric;
2442
2443 if (a->dest != RTD_UNICAST)
2444 {
2445 /* No nexthop */
2446 no_nexthop:
2447 a->nh = (struct nexthop) {};
2448 if (mls)
2449 { /* Store the label stack for later changes */
2450 a->nh.labels_orig = a->nh.labels = mls->len;
2451 memcpy(a->nh.label, mls->stack, mls->len * sizeof(u32));
2452 }
2453 return;
2454 }
2455
2456 if (((!mls) || (!mls->len)) && he->nexthop_linkable)
2457 { /* Just link the nexthop chain, no label append happens. */
2458 memcpy(&(a->nh), &(he->src->nh), nexthop_size(&(he->src->nh)));
2459 return;
2460 }
2461
2462 struct nexthop *nhp = NULL, *nhr = NULL;
2463 int skip_nexthop = 0;
2464
2465 for (struct nexthop *nh = &(he->src->nh); nh; nh = nh->next)
2466 {
2467 if (skip_nexthop)
2468 skip_nexthop--;
2469 else
2470 {
2471 nhr = nhp;
2472 nhp = (nhp ? (nhp->next = lp_alloc(rte_update_pool, NEXTHOP_MAX_SIZE)) : &(a->nh));
2473 }
2474
2475 memset(nhp, 0, NEXTHOP_MAX_SIZE);
2476 nhp->iface = nh->iface;
2477 nhp->weight = nh->weight;
2478
2479 if (mls)
2480 {
2481 nhp->labels = nh->labels + mls->len;
2482 nhp->labels_orig = mls->len;
2483 if (nhp->labels <= MPLS_MAX_LABEL_STACK)
2484 {
2485 memcpy(nhp->label, nh->label, nh->labels * sizeof(u32)); /* First the hostentry labels */
2486 memcpy(&(nhp->label[nh->labels]), mls->stack, mls->len * sizeof(u32)); /* Then the bottom labels */
2487 }
2488 else
2489 {
2490 log(L_WARN "Sum of label stack sizes %d + %d = %d exceeds allowed maximum (%d)",
2491 nh->labels, mls->len, nhp->labels, MPLS_MAX_LABEL_STACK);
2492 skip_nexthop++;
2493 continue;
2494 }
2495 }
2496 else if (nh->labels)
2497 {
2498 nhp->labels = nh->labels;
2499 nhp->labels_orig = 0;
2500 memcpy(nhp->label, nh->label, nh->labels * sizeof(u32));
2501 }
2502
2503 if (ipa_nonzero(nh->gw))
2504 {
2505 nhp->gw = nh->gw; /* Router nexthop */
2506 nhp->flags |= (nh->flags & RNF_ONLINK);
2507 }
2508 else if (!(nh->iface->flags & IF_MULTIACCESS) || (nh->iface->flags & IF_LOOPBACK))
2509 nhp->gw = IPA_NONE; /* PtP link - no need for nexthop */
2510 else if (ipa_nonzero(he->link))
2511 nhp->gw = he->link; /* Device nexthop with link-local address known */
2512 else
2513 nhp->gw = he->addr; /* Device nexthop with link-local address unknown */
2514 }
2515
2516 if (skip_nexthop)
2517 if (nhr)
2518 nhr->next = NULL;
2519 else
2520 {
2521 a->dest = RTD_UNREACHABLE;
2522 log(L_WARN "No valid nexthop remaining, setting route unreachable");
2523 goto no_nexthop;
2524 }
2525 }
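/*
 * Label merging example: if the hostentry's next hop already carries
 * labels [10, 20] (nh->labels == 2) and the route supplies mls->stack
 * [30] (mls->len == 1), the result has nhp->labels == 3 with stack
 * [10, 20, 30] -- hostentry labels first, the route's bottom labels
 * last -- and nhp->labels_orig == 1, so rt_next_hop_update_rte() can
 * later strip the bottom part and re-apply it after a new resolution.
 */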
2526
2527 static inline int
2528 rta_next_hop_outdated(rta *a)
2529 {
2530 struct hostentry *he = a->hostentry;
2531
2532 if (!he)
2533 return 0;
2534
2535 if (!he->src)
2536 return a->dest != RTD_UNREACHABLE;
2537
2538 return (a->dest != he->dest) || (a->igp_metric != he->igp_metric) ||
2539 (!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh));
2540 }
2541
2542 static inline rte *
2543 rt_next_hop_update_rte(rtable *tab UNUSED, rte *old)
2544 {
2545 if (!rta_next_hop_outdated(old->attrs))
2546 return NULL;
2547
2548 rta *a = alloca(RTA_MAX_SIZE);
2549 memcpy(a, old->attrs, rta_size(old->attrs));
2550
2551 mpls_label_stack mls = { .len = a->nh.labels_orig };
2552 memcpy(mls.stack, &a->nh.label[a->nh.labels - mls.len], mls.len * sizeof(u32));
2553
2554 rta_apply_hostentry(a, old->attrs->hostentry, &mls);
2555 a->cached = 0;
2556
2557 rte *e = sl_alloc(rte_slab);
2558 memcpy(e, old, sizeof(rte));
2559 e->attrs = rta_lookup(a);
2560 rt_lock_source(e->src);
2561
2562 return e;
2563 }
2564
2565
2566 #ifdef CONFIG_BGP
2567
2568 static inline int
2569 net_flow_has_dst_prefix(const net_addr *n)
2570 {
2571 ASSUME(net_is_flow(n));
2572
2573 if (n->pxlen)
2574 return 1;
2575
2576 if (n->type == NET_FLOW4)
2577 {
2578 const net_addr_flow4 *n4 = (void *) n;
2579 return (n4->length > sizeof(net_addr_flow4)) && (n4->data[0] == FLOW_TYPE_DST_PREFIX);
2580 }
2581 else
2582 {
2583 const net_addr_flow6 *n6 = (void *) n;
2584 return (n6->length > sizeof(net_addr_flow6)) && (n6->data[0] == FLOW_TYPE_DST_PREFIX);
2585 }
2586 }
2587
2588 static inline int
2589 rta_as_path_is_empty(rta *a)
2590 {
2591 eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
2592 return !e || (as_path_getlen(e->u.ptr) == 0);
2593 }
2594
2595 static inline u32
2596 rta_get_first_asn(rta *a)
2597 {
2598 eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
2599 u32 asn;
2600
2601 return (e && as_path_get_first_regular(e->u.ptr, &asn)) ? asn : 0;
2602 }
2603
2604 int
2605 rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, rta *a, int interior)
2606 {
2607 ASSERT(rt_is_ip(tab_ip));
2608 ASSERT(rt_is_flow(tab_flow));
2609 ASSERT(tab_ip->trie);
2610
2611 /* RFC 8955 6. a) Flowspec has defined dst prefix */
2612 if (!net_flow_has_dst_prefix(n))
2613 return 0;
2614
2615 /* RFC 9117 4.1. Accept if AS_PATH is empty */
2616 if (interior && rta_as_path_is_empty(a))
2617 return 1;
2618
2619
2620 /* RFC 8955 6. b) Flowspec and its best-match route have the same originator */
2621
2622 /* Find flowspec dst prefix */
2623 net_addr dst;
2624 if (n->type == NET_FLOW4)
2625 net_fill_ip4(&dst, net4_prefix(n), net4_pxlen(n));
2626 else
2627 net_fill_ip6(&dst, net6_prefix(n), net6_pxlen(n));
2628
2629 /* Find best-match BGP unicast route for flowspec dst prefix */
2630 net *nb = net_route(tab_ip, &dst);
2631 rte *rb = nb ? nb->routes : NULL;
2632
2633 /* Register prefix to trie for tracking further changes */
2634 int max_pxlen = (n->type == NET_FLOW4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH;
2635 trie_add_prefix(tab_flow->flowspec_trie, &dst, (nb ? nb->n.addr->pxlen : 0), max_pxlen);
2636
2637 /* No best-match BGP route -> no flowspec */
2638 if (!rb || (rb->attrs->source != RTS_BGP))
2639 return 0;
2640
2641 /* Find ORIGINATOR_ID values */
2642 u32 orig_a = ea_get_int(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);
2643 u32 orig_b = ea_get_int(rb->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);
2644
2645 /* Originator is either ORIGINATOR_ID (if present), or BGP neighbor address (if not) */
2646 if ((orig_a != orig_b) || (!orig_a && !orig_b && !ipa_equal(a->from, rb->attrs->from)))
2647 return 0;
2648
2649
2650 /* Find ASN of the best-match route, for use in next checks */
2651 u32 asn_b = rta_get_first_asn(rb->attrs);
2652 if (!asn_b)
2653 return 0;
2654
2655 /* RFC 9117 4.2. For EBGP, flowspec and its best-match route are from the same AS */
2656 if (!interior && (rta_get_first_asn(a) != asn_b))
2657 return 0;
2658
2659 /* RFC 8955 6. c) More-specific routes are from the same AS as the best-match route */
2660 TRIE_WALK(tab_ip->trie, subnet, &dst)
2661 {
2662 net *nc = net_find_valid(tab_ip, &subnet);
2663 if (!nc)
2664 continue;
2665
2666 rte *rc = nc->routes;
2667 if (rc->attrs->source != RTS_BGP)
2668 return 0;
2669
2670 if (rta_get_first_asn(rc->attrs) != asn_b)
2671 return 0;
2672 }
2673 TRIE_WALK_END;
2674
2675 return 1;
2676 }
2677
2678 #endif /* CONFIG_BGP */
2679
2680 static rte *
2681 rt_flowspec_update_rte(rtable *tab, rte *r)
2682 {
2683 #ifdef CONFIG_BGP
2684 if ((r->attrs->source != RTS_BGP) || (r->sender->proto != r->src->proto))
2685 return NULL;
2686
2687 struct bgp_channel *bc = (struct bgp_channel *) r->sender;
2688 if (!bc->base_table)
2689 return NULL;
2690
2691 const net_addr *n = r->net->n.addr;
2692 struct bgp_proto *p = (void *) r->src->proto;
2693 int valid = rt_flowspec_check(bc->base_table, tab, n, r->attrs, p->is_interior);
2694 int dest = valid ? RTD_NONE : RTD_UNREACHABLE;
2695
2696 if (dest == r->attrs->dest)
2697 return NULL;
2698
2699 rta *a = alloca(RTA_MAX_SIZE);
2700 memcpy(a, r->attrs, rta_size(r->attrs));
2701 a->dest = dest;
2702 a->cached = 0;
2703
2704 rte *new = sl_alloc(rte_slab);
2705 memcpy(new, r, sizeof(rte));
2706 new->attrs = rta_lookup(a);
2707 rt_lock_source(new->src);
2708
2709 return new;
2710 #else
2711 return NULL;
2712 #endif
2713 }
2714
2715
2716 static inline int
2717 rt_next_hop_update_net(rtable *tab, net *n)
2718 {
2719 rte **k, *e, *new, *old_best, **new_best;
2720 int count = 0;
2721 int free_old_best = 0;
2722
2723 old_best = n->routes;
2724 if (!old_best)
2725 return 0;
2726
2727 for (k = &n->routes; e = *k; k = &e->next)
2728 {
2729 if (!net_is_flow(n->n.addr))
2730 new = rt_next_hop_update_rte(tab, e);
2731 else
2732 new = rt_flowspec_update_rte(tab, e);
2733
2734 if (new)
2735 {
2736 *k = new;
2737
2738 rte_trace_in(D_ROUTES, new->sender, new, "updated");
2739 rte_announce_i(tab, RA_ANY, n, new, e, NULL, NULL);
2740
2741 /* Call a pre-comparison hook */
2742 /* Not really an efficient way to compute this */
2743 if (e->src->proto->rte_recalculate)
2744 e->src->proto->rte_recalculate(tab, n, new, e, NULL);
2745
2746 if (e != old_best)
2747 rte_free_quick(e);
2748 else /* Freeing of the old best rte is postponed */
2749 free_old_best = 1;
2750
2751 e = new;
2752 count++;
2753 }
2754 }
2755
2756 if (!count)
2757 return 0;
2758
2759 /* Find the new best route */
2760 new_best = NULL;
2761 for (k = &n->routes; e = *k; k = &e->next)
2762 {
2763 if (!new_best || rte_better(e, *new_best))
2764 new_best = k;
2765 }
2766
2767 /* Relink the new best route to the first position */
2768 new = *new_best;
2769 if (new != n->routes)
2770 {
2771 *new_best = new->next;
2772 new->next = n->routes;
2773 n->routes = new;
2774 }
2775
2776 /* Announce the new best route */
2777 if (new != old_best)
2778 rte_trace_in(D_ROUTES, new->sender, new, "updated [best]");
2779
2780 /* Propagate changes */
2781 rte_announce_i(tab, RA_UNDEF, n, NULL, NULL, n->routes, old_best);
2782
2783 if (free_old_best)
2784 rte_free_quick(old_best);
2785
2786 return count;
2787 }
2788
2789 static void
2790 rt_next_hop_update(rtable *tab)
2791 {
2792 struct fib_iterator *fit = &tab->nhu_fit;
2793 int max_feed = 32;
2794
2795 if (tab->nhu_state == NHU_CLEAN)
2796 return;
2797
2798 if (tab->nhu_state == NHU_SCHEDULED)
2799 {
2800 FIB_ITERATE_INIT(fit, &tab->fib);
2801 tab->nhu_state = NHU_RUNNING;
2802
2803 if (tab->flowspec_trie)
2804 rt_flowspec_reset_trie(tab);
2805 }
2806
2807 FIB_ITERATE_START(&tab->fib, fit, net, n)
2808 {
2809 if (max_feed <= 0)
2810 {
2811 FIB_ITERATE_PUT(fit);
2812 ev_schedule(tab->rt_event);
2813 return;
2814 }
2815 max_feed -= rt_next_hop_update_net(tab, n);
2816 }
2817 FIB_ITERATE_END;
2818
2819 /* State change:
2820 * NHU_DIRTY -> NHU_SCHEDULED
2821 * NHU_RUNNING -> NHU_CLEAN
2822 */
2823 tab->nhu_state &= 1;
2824
2825 if (tab->nhu_state != NHU_CLEAN)
2826 ev_schedule(tab->rt_event);
2827 }
2828
2829
2830 struct rtable_config *
2831 rt_new_table(struct symbol *s, uint addr_type)
2832 {
2833 /* Hack that allows one to 'redefine' the master table */
2834 if ((s->class == SYM_TABLE) &&
2835 (s->table == new_config->def_tables[addr_type]) &&
2836 ((addr_type == NET_IP4) || (addr_type == NET_IP6)))
2837 return s->table;
2838
2839 struct rtable_config *c = cfg_allocz(sizeof(struct rtable_config));
2840
2841 cf_define_symbol(s, SYM_TABLE, table, c);
2842 c->name = s->name;
2843 c->addr_type = addr_type;
2844 c->gc_threshold = 1000;
2845 c->gc_period = (uint) -1; /* set in rt_postconfig() */
2846 c->min_settle_time = 1 S;
2847 c->max_settle_time = 20 S;
2848
2849 add_tail(&new_config->tables, &c->n);
2850
2851 /* First table of each type is kept as default */
2852 if (! new_config->def_tables[addr_type])
2853 new_config->def_tables[addr_type] = c;
2854
2855 return c;
2856 }
2857
2858 /**
2859 * rt_lock_table - lock a routing table
2860 * @r: routing table to be locked
2861 *
2862 * Lock a routing table, because it's in use by a protocol,
2863 * preventing it from being freed when it gets undefined in a new
2864 * configuration.
2865 */
2866 void
2867 rt_lock_table(rtable *r)
2868 {
2869 r->use_count++;
2870 }
2871
2872 /**
2873 * rt_unlock_table - unlock a routing table
2874 * @r: routing table to be unlocked
2875 *
2876 * Unlock a routing table formerly locked by rt_lock_table(),
2877 * that is decrease its use count and delete it if it's scheduled
2878 * for deletion by configuration changes.
2879 */
2880 void
2881 rt_unlock_table(rtable *r)
2882 {
2883 if (!--r->use_count && r->deleted)
2884 {
2885 struct config *conf = r->deleted;
2886
2887 /* Delete the routing table by freeing its pool */
2888 rt_shutdown(r);
2889 config_del_obstacle(conf);
2890 }
2891 }
2892
2893 static int
2894 rt_reconfigure(rtable *tab, struct rtable_config *new, struct rtable_config *old)
2895 {
2896 if ((new->addr_type != old->addr_type) ||
2897 (new->sorted != old->sorted) ||
2898 (new->trie_used != old->trie_used))
2899 return 0;
2900
2901 DBG("\t%s: same\n", new->name);
2902 new->table = tab;
2903 tab->name = new->name;
2904 tab->config = new;
2905
2906 return 1;
2907 }
2908
2909 static struct rtable_config *
2910 rt_find_table_config(struct config *cf, char *name)
2911 {
2912 struct symbol *sym = cf_find_symbol(cf, name);
2913 return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL;
2914 }
2915
2916 /**
2917 * rt_commit - commit new routing table configuration
2918 * @new: new configuration
2919 * @old: original configuration or %NULL if it's boot time config
2920 *
2921 * Scan differences between @old and @new configuration and modify
2922 * the routing tables according to these changes. If @new defines a
2923 * previously unknown table, create it; if it omits a table existing
2924 * in @old, schedule it for deletion (it gets deleted when all protocols
2925 * disconnect from it by calling rt_unlock_table()); if it exists
2926 * in both configurations, leave it unchanged.
2927 */
2928 void
2929 rt_commit(struct config *new, struct config *old)
2930 {
2931 struct rtable_config *o, *r;
2932
2933 DBG("rt_commit:\n");
2934 if (old)
2935 {
2936 WALK_LIST(o, old->tables)
2937 {
2938 rtable *tab = o->table;
2939 if (tab->deleted)
2940 continue;
2941
2942 r = rt_find_table_config(new, o->name);
2943 if (r && !new->shutdown && rt_reconfigure(tab, r, o))
2944 continue;
2945
2946 DBG("\t%s: deleted\n", o->name);
2947 tab->deleted = old;
2948 config_add_obstacle(old);
2949 rt_lock_table(tab);
2950 rt_unlock_table(tab);
2951 }
2952 }
2953
2954 WALK_LIST(r, new->tables)
2955 if (!r->table)
2956 {
2957 r->table = rt_setup(rt_table_pool, r);
2958 DBG("\t%s: created\n", r->name);
2959 add_tail(&routing_tables, &r->table->n);
2960 }
2961 DBG("\tdone\n");
2962 }
2963
2964 static inline void
2965 do_feed_channel(struct channel *c, net *n, rte *e)
2966 {
2967 rte_update_lock();
2968 if (c->ra_mode == RA_ACCEPTED)
2969 rt_notify_accepted(c, n, NULL, NULL, c->refeeding);
2970 else if (c->ra_mode == RA_MERGED)
2971 rt_notify_merged(c, n, NULL, NULL, e, e, c->refeeding);
2972 else /* RA_BASIC */
2973 rt_notify_basic(c, n, e, e, c->refeeding);
2974 rte_update_unlock();
2975 }
2976
2977 /**
2978 * rt_feed_channel - advertise all routes to a channel
2979 * @c: channel to be fed
2980 *
2981 * This function performs one pass of advertisement of routes to a channel that
2982 * is in the ES_FEEDING state. It is called by the protocol code as long as it
2983 * has something to do. (We avoid transferring all the routes in a single pass in
2984 * order not to monopolize CPU time.)
2985 */
2986 int
2987 rt_feed_channel(struct channel *c)
2988 {
2989 struct fib_iterator *fit = &c->feed_fit;
2990 int max_feed = 256;
2991
2992 ASSERT(c->export_state == ES_FEEDING);
2993
2994 if (!c->feed_active)
2995 {
2996 FIB_ITERATE_INIT(fit, &c->table->fib);
2997 c->feed_active = 1;
2998 }
2999
3000 FIB_ITERATE_START(&c->table->fib, fit, net, n)
3001 {
3002 rte *e = n->routes;
3003 if (max_feed <= 0)
3004 {
3005 FIB_ITERATE_PUT(fit);
3006 return 0;
3007 }
3008
3009 if ((c->ra_mode == RA_OPTIMAL) ||
3010 (c->ra_mode == RA_ACCEPTED) ||
3011 (c->ra_mode == RA_MERGED))
3012 if (rte_is_valid(e))
3013 {
3014 /* In the meantime, the protocol may have gone down */
3015 if (c->export_state != ES_FEEDING)
3016 goto done;
3017
3018 do_feed_channel(c, n, e);
3019 max_feed--;
3020 }
3021
3022 if (c->ra_mode == RA_ANY)
3023 for(e = n->routes; e; e = e->next)
3024 {
3025 /* In the meantime, the protocol may have gone down */
3026 if (c->export_state != ES_FEEDING)
3027 goto done;
3028
3029 if (!rte_is_valid(e))
3030 continue;
3031
3032 do_feed_channel(c, n, e);
3033 max_feed--;
3034 }
3035 }
3036 FIB_ITERATE_END;
3037
3038 done:
3039 c->feed_active = 0;
3040 return 1;
3041 }
3042
3043 /**
3044 * rt_feed_channel_abort - abort protocol feeding
3045 * @c: channel
3046 *
3047 * This function is called by the protocol code when the protocol stops or
3048 * ceases to exist during the feeding.
3049 */
3050 void
3051 rt_feed_channel_abort(struct channel *c)
3052 {
3053 if (c->feed_active)
3054 {
3055 /* Unlink the iterator */
3056 fit_get(&c->table->fib, &c->feed_fit);
3057 c->feed_active = 0;
3058 }
3059 }
3060
3061
3062 /*
3063 * Import table
3064 */
3065
3066 int
3067 rte_update_in(struct channel *c, const net_addr *n, rte *new, struct rte_src *src)
3068 {
3069 struct rtable *tab = c->in_table;
3070 rte *old, **pos;
3071 net *net;
3072
3073 if (new)
3074 {
3075 net = net_get(tab, n);
3076
3077 if (!rta_is_cached(new->attrs))
3078 new->attrs = rta_lookup(new->attrs);
3079 }
3080 else
3081 {
3082 net = net_find(tab, n);
3083
3084 if (!net)
3085 goto drop_withdraw;
3086 }
3087
3088 /* Find the old rte */
3089 for (pos = &net->routes; old = *pos; pos = &old->next)
3090 if (old->src == src)
3091 {
3092 if (new && rte_same(old, new))
3093 {
3094 /* Refresh the old rte, continue with update to main rtable */
3095 if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
3096 {
3097 old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
3098
3099 if (c->proto->rte_update_in_notify)
3100 c->proto->rte_update_in_notify(c, n, old, src);
3101
3102 return 1;
3103 }
3104
3105 goto drop_update;
3106 }
3107
3108 /* Move iterator if needed */
3109 if (old == c->reload_next_rte)
3110 c->reload_next_rte = old->next;
3111
3112 /* Remove the old rte */
3113 *pos = old->next;
3114 rte_free_quick(old);
3115 tab->rt_count--;
3116
3117 break;
3118 }
3119
3120 if (!new)
3121 {
3122 if (!old)
3123 goto drop_withdraw;
3124
3125 if (!net->routes)
3126 fib_delete(&tab->fib, net);
3127
3128 if (c->proto->rte_update_in_notify)
3129 c->proto->rte_update_in_notify(c, n, NULL, src);
3130
3131 return 1;
3132 }
3133
3134 struct channel_limit *l = &c->rx_limit;
3135 if (l->action && !old)
3136 {
3137 if (tab->rt_count >= l->limit)
3138 channel_notify_limit(c, l, PLD_RX, tab->rt_count);
3139
3140 if (l->state == PLS_BLOCKED)
3141 {
3142 /* Required by rte_trace_in() */
3143 new->net = net;
3144
3145 rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
3146 goto drop_update;
3147 }
3148 }
3149
3150 /* Insert the new rte */
3151 rte *e = rte_do_cow(new);
3152 e->flags |= REF_COW;
3153 e->net = net;
3154 e->sender = c;
3155 e->lastmod = current_time();
3156 e->next = *pos;
3157 *pos = e;
3158 tab->rt_count++;
3159
3160 if (c->proto->rte_update_in_notify)
3161 c->proto->rte_update_in_notify(c, n, e, src);
3162
3163 return 1;
3164
3165 drop_update:
3166 c->stats.imp_updates_received++;
3167 c->stats.imp_updates_ignored++;
3168 rte_free(new);
3169
3170 if (!net->routes)
3171 fib_delete(&tab->fib, net);
3172
3173 return 0;
3174
3175 drop_withdraw:
3176 c->stats.imp_withdraws_received++;
3177 c->stats.imp_withdraws_ignored++;
3178 return 0;
3179 }
3180
3181 int
3182 rt_reload_channel(struct channel *c)
3183 {
3184 struct rtable *tab = c->in_table;
3185 struct fib_iterator *fit = &c->reload_fit;
3186 int max_feed = 64;
3187
3188 ASSERT(c->channel_state == CS_UP);
3189
3190 if (!c->reload_active)
3191 {
3192 FIB_ITERATE_INIT(fit, &tab->fib);
3193 c->reload_active = 1;
3194 }
3195
3196 do {
3197 for (rte *e = c->reload_next_rte; e; e = e->next)
3198 {
3199 if (max_feed-- <= 0)
3200 {
3201 c->reload_next_rte = e;
3202 debug("%s channel reload burst split (max_feed=%d)", c->proto->name, max_feed);
3203 return 0;
3204 }
3205
3206 rte_update2(c, e->net->n.addr, rte_do_cow(e), e->src);
3207 }
3208
3209 c->reload_next_rte = NULL;
3210
3211 FIB_ITERATE_START(&tab->fib, fit, net, n)
3212 {
3213 if (c->reload_next_rte = n->routes)
3214 {
3215 FIB_ITERATE_PUT_NEXT(fit, &tab->fib);
3216 break;
3217 }
3218 }
3219 FIB_ITERATE_END;
3220 }
3221 while (c->reload_next_rte);
3222
3223 c->reload_active = 0;
3224 return 1;
3225 }
3226
3227 void
3228 rt_reload_channel_abort(struct channel *c)
3229 {
3230 if (c->reload_active)
3231 {
3232 /* Unlink the iterator */
3233 fit_get(&c->in_table->fib, &c->reload_fit);
3234 c->reload_next_rte = NULL;
3235 c->reload_active = 0;
3236 }
3237 }
3238
3239 void
3240 rt_prune_sync(rtable *t, int all)
3241 {
3242 struct fib_iterator fit;
3243
3244 FIB_ITERATE_INIT(&fit, &t->fib);
3245
3246 again:
3247 FIB_ITERATE_START(&t->fib, &fit, net, n)
3248 {
3249 rte *e, **ee = &n->routes;
3250
3251 while (e = *ee)
3252 {
3253 if (all || (e->flags & (REF_STALE | REF_DISCARD)))
3254 {
3255 *ee = e->next;
3256 rte_free_quick(e);
3257 t->rt_count--;
3258 }
3259 else
3260 ee = &e->next;
3261 }
3262
3263 if (all || !n->routes)
3264 {
3265 FIB_ITERATE_PUT(&fit);
3266 fib_delete(&t->fib, n);
3267 goto again;
3268 }
3269 }
3270 FIB_ITERATE_END;
3271 }
3272
3273
3274 /*
3275 * Export table
3276 */
3277
3278 int
3279 rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, int refeed)
3280 {
3281 struct rtable *tab = c->out_table;
3282 struct rte_src *src;
3283 rte *old, **pos;
3284 net *net;
3285
3286 if (new)
3287 {
3288 net = net_get(tab, n);
3289 src = new->src;
3290
3291 if (!rta_is_cached(new->attrs))
3292 new->attrs = rta_lookup(new->attrs);
3293 }
3294 else
3295 {
3296 net = net_find(tab, n);
3297 src = old0->src;
3298
3299 if (!net)
3300 goto drop_withdraw;
3301 }
3302
3303 /* Find the old rte */
3304 for (pos = &net->routes; old = *pos; pos = &old->next)
3305 if ((c->ra_mode != RA_ANY) || (old->src == src))
3306 {
3307 if (new && rte_same(old, new))
3308 {
3309 /* REF_STALE / REF_DISCARD not used in export table */
3310 /*
3311 if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
3312 {
3313 old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
3314 return 1;
3315 }
3316 */
3317
3318 goto drop_update;
3319 }
3320
3321 /* Remove the old rte */
3322 *pos = old->next;
3323 rte_free_quick(old);
3324 tab->rt_count--;
3325
3326 break;
3327 }
3328
3329 if (!new)
3330 {
3331 if (!old)
3332 goto drop_withdraw;
3333
3334 if (!net->routes)
3335 fib_delete(&tab->fib, net);
3336
3337 return 1;
3338 }
3339
3340 /* Insert the new rte */
3341 rte *e = rte_do_cow(new);
3342 e->flags |= REF_COW;
3343 e->net = net;
3344 e->sender = c;
3345 e->lastmod = current_time();
3346 e->next = *pos;
3347 *pos = e;
3348 tab->rt_count++;
3349 return 1;
3350
3351 drop_update:
3352 return refeed;
3353
3354 drop_withdraw:
3355 return 0;
3356 }
3357
3358
3359 /*
3360 * Hostcache
3361 */
3362
3363 static inline u32
3364 hc_hash(ip_addr a, rtable *dep)
3365 {
3366 return ipa_hash(a) ^ ptr_hash(dep);
3367 }
3368
3369 static inline void
3370 hc_insert(struct hostcache *hc, struct hostentry *he)
3371 {
3372 uint k = he->hash_key >> hc->hash_shift;
3373 he->next = hc->hash_table[k];
3374 hc->hash_table[k] = he;
3375 }
3376
3377 static inline void
3378 hc_remove(struct hostcache *hc, struct hostentry *he)
3379 {
3380 struct hostentry **hep;
3381 uint k = he->hash_key >> hc->hash_shift;
3382
3383 for (hep = &hc->hash_table[k]; *hep != he; hep = &(*hep)->next);
3384 *hep = he->next;
3385 }
3386
3387 #define HC_DEF_ORDER 10
3388 #define HC_HI_MARK *4
3389 #define HC_HI_STEP 2
3390 #define HC_HI_ORDER 16 /* Must be at most 16 */
3391 #define HC_LO_MARK /5
3392 #define HC_LO_STEP 2
3393 #define HC_LO_ORDER 10
3394
3395 static void
3396 hc_alloc_table(struct hostcache *hc, pool *p, unsigned order)
3397 {
3398 uint hsize = 1 << order;
3399 hc->hash_order = order;
3400 hc->hash_shift = 32 - order;
3401 hc->hash_max = (order >= HC_HI_ORDER) ? ~0U : (hsize HC_HI_MARK);
3402 hc->hash_min = (order <= HC_LO_ORDER) ? 0U : (hsize HC_LO_MARK);
3403
3404 hc->hash_table = mb_allocz(p, hsize * sizeof(struct hostentry *));
3405 }
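/*
 * With HC_DEF_ORDER 10 this allocates 1024 buckets, hash_max = 4096
 * (HC_HI_MARK is *4) and hash_min = 0 (order <= HC_LO_ORDER), so the
 * table grows by HC_HI_STEP orders (to 4096 buckets) once it holds more
 * than 4096 entries and never shrinks below the default order.
 */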
3406
3407 static void
3408 hc_resize(struct hostcache *hc, pool *p, unsigned new_order)
3409 {
3410 struct hostentry **old_table = hc->hash_table;
3411 struct hostentry *he, *hen;
3412 uint old_size = 1 << hc->hash_order;
3413 uint i;
3414
3415 hc_alloc_table(hc, p, new_order);
3416 for (i = 0; i < old_size; i++)
3417 for (he = old_table[i]; he != NULL; he=hen)
3418 {
3419 hen = he->next;
3420 hc_insert(hc, he);
3421 }
3422 mb_free(old_table);
3423 }
3424
3425 static struct hostentry *
3426 hc_new_hostentry(struct hostcache *hc, pool *p, ip_addr a, ip_addr ll, rtable *dep, unsigned k)
3427 {
3428 struct hostentry *he = sl_alloc(hc->slab);
3429
3430 *he = (struct hostentry) {
3431 .addr = a,
3432 .link = ll,
3433 .tab = dep,
3434 .hash_key = k,
3435 };
3436
3437 add_tail(&hc->hostentries, &he->ln);
3438 hc_insert(hc, he);
3439
3440 hc->hash_items++;
3441 if (hc->hash_items > hc->hash_max)
3442 hc_resize(hc, p, hc->hash_order + HC_HI_STEP);
3443
3444 return he;
3445 }
3446
3447 static void
3448 hc_delete_hostentry(struct hostcache *hc, pool *p, struct hostentry *he)
3449 {
3450 rta_free(he->src);
3451
3452 rem_node(&he->ln);
3453 hc_remove(hc, he);
3454 sl_free(he);
3455
3456 hc->hash_items--;
3457 if (hc->hash_items < hc->hash_min)
3458 hc_resize(hc, p, hc->hash_order - HC_LO_STEP);
3459 }
3460
3461 static void
3462 rt_init_hostcache(rtable *tab)
3463 {
3464 struct hostcache *hc = mb_allocz(tab->rp, sizeof(struct hostcache));
3465 init_list(&hc->hostentries);
3466
3467 hc->hash_items = 0;
3468 hc_alloc_table(hc, tab->rp, HC_DEF_ORDER);
3469 hc->slab = sl_new(tab->rp, sizeof(struct hostentry));
3470
3471 hc->lp = lp_new(tab->rp);
3472 hc->trie = f_new_trie(hc->lp, 0);
3473
3474 tab->hostcache = hc;
3475 }
3476
3477 static void
3478 rt_free_hostcache(rtable *tab)
3479 {
3480 struct hostcache *hc = tab->hostcache;
3481
3482 node *n;
3483 WALK_LIST(n, hc->hostentries)
3484 {
3485 struct hostentry *he = SKIP_BACK(struct hostentry, ln, n);
3486 rta_free(he->src);
3487
3488 if (he->uc)
3489 log(L_ERR "Hostcache is not empty in table %s", tab->name);
3490 }
3491
3492 /* Freed automagically by the resource pool
3493 rfree(hc->slab);
3494 rfree(hc->lp);
3495 mb_free(hc->hash_table);
3496 mb_free(hc);
3497 */
3498 }
3499
3500 static void
3501 rt_notify_hostcache(rtable *tab, net *net)
3502 {
3503 if (tab->hcu_scheduled)
3504 return;
3505
3506 if (trie_match_net(tab->hostcache->trie, net->n.addr))
3507 rt_schedule_hcu(tab);
3508 }
3509
3510 static int
3511 if_local_addr(ip_addr a, struct iface *i)
3512 {
3513 struct ifa *b;
3514
3515 WALK_LIST(b, i->addrs)
3516 if (ipa_equal(a, b->ip))
3517 return 1;
3518
3519 return 0;
3520 }
3521
3522 u32
3523 rt_get_igp_metric(rte *rt)
3524 {
3525 eattr *ea = ea_find(rt->attrs->eattrs, EA_GEN_IGP_METRIC);
3526
3527 if (ea)
3528 return ea->u.data;
3529
3530 if (rt->attrs->source == RTS_DEVICE)
3531 return 0;
3532
3533 if (rt->src->proto->rte_igp_metric)
3534 return rt->src->proto->rte_igp_metric(rt);
3535
3536 return IGP_METRIC_UNKNOWN;
3537 }
3538
3539 static int
3540 rt_update_hostentry(rtable *tab, struct hostentry *he)
3541 {
3542 rta *old_src = he->src;
3543 int direct = 0;
3544 int pxlen = 0;
3545
3546 /* Reset the hostentry */
3547 he->src = NULL;
3548 he->dest = RTD_UNREACHABLE;
3549 he->nexthop_linkable = 0;
3550 he->igp_metric = 0;
3551
3552 net_addr he_addr;
3553 net_fill_ip_host(&he_addr, he->addr);
3554 net *n = net_route(tab, &he_addr);
3555 if (n)
3556 {
3557 rte *e = n->routes;
3558 rta *a = e->attrs;
3559 word pref = a->pref;
3560
3561 for (rte *ee = n->routes; ee; ee = ee->next)
3562 if ((ee->attrs->pref >= pref) && ee->attrs->hostentry)
3563 {
3564 /* Recursive route should not depend on another recursive route */
3565 log(L_WARN "Next hop address %I resolvable through recursive route for %N",
3566 he->addr, n->n.addr);
3567 goto done;
3568 }
3569
3570 pxlen = n->n.addr->pxlen;
3571
3572 if (a->dest == RTD_UNICAST)
3573 {
3574 for (struct nexthop *nh = &(a->nh); nh; nh = nh->next)
3575 if (ipa_zero(nh->gw))
3576 {
3577 if (if_local_addr(he->addr, nh->iface))
3578 {
3579 /* The host address is a local address; this is not valid */
3580 log(L_WARN "Next hop address %I is a local address of iface %s",
3581 he->addr, nh->iface->name);
3582 goto done;
3583 }
3584
3585 direct++;
3586 }
3587 }
3588
3589 he->src = rta_clone(a);
3590 he->dest = a->dest;
3591 he->nexthop_linkable = !direct;
3592 he->igp_metric = rt_get_igp_metric(e);
3593 }
3594
3595 done:
3596 /* Add a prefix range to the trie */
3597 trie_add_prefix(tab->hostcache->trie, &he_addr, pxlen, he_addr.pxlen);
3598
3599 rta_free(old_src);
3600 return old_src != he->src;
3601 }
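/*
 * The registered range [pxlen, he_addr.pxlen] covers every prefix that
 * contains he->addr and is at least as specific as the route used for
 * the resolution. E.g. a next hop 10.1.2.3 resolved through 10.0.0.0/8
 * is registered with pxlen range 8..32, so a later announcement of
 * 10.1.0.0/16 matches in rt_notify_hostcache() and triggers
 * re-resolution of the hostentry.
 */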
3602
3603 static void
3604 rt_update_hostcache(rtable *tab)
3605 {
3606 struct hostcache *hc = tab->hostcache;
3607 struct hostentry *he;
3608 node *n, *x;
3609
3610 /* Reset the trie */
3611 lp_flush(hc->lp);
3612 hc->trie = f_new_trie(hc->lp, 0);
3613
3614 WALK_LIST_DELSAFE(n, x, hc->hostentries)
3615 {
3616 he = SKIP_BACK(struct hostentry, ln, n);
3617 if (!he->uc)
3618 {
3619 hc_delete_hostentry(hc, tab->rp, he);
3620 continue;
3621 }
3622
3623 if (rt_update_hostentry(tab, he))
3624 rt_schedule_nhu(he->tab);
3625 }
3626
3627 tab->hcu_scheduled = 0;
3628 }
3629
3630 struct hostentry *
3631 rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep)
3632 {
3633 ip_addr link = ipa_zero(ll) ? a : ll;
3634 struct hostentry *he;
3635
3636 if (!tab->hostcache)
3637 rt_init_hostcache(tab);
3638
3639 u32 k = hc_hash(a, dep);
3640 struct hostcache *hc = tab->hostcache;
3641 for (he = hc->hash_table[k >> hc->hash_shift]; he != NULL; he = he->next)
3642 if (ipa_equal(he->addr, a) && ipa_equal(he->link, link) && (he->tab == dep))
3643 return he;
3644
3645 he = hc_new_hostentry(hc, tab->rp, a, link, dep, k);
3646 rt_update_hostentry(tab, he);
3647 return he;
3648 }
3649
3650
3651 /*
3652 * Documentation for functions declared inline in route.h
3653 */
3654 #if 0
3655
3656 /**
3657 * net_find - find a network entry
3658 * @tab: a routing table
3659 * @addr: address of the network
3660 *
3661 * net_find() looks up the given network in routing table @tab and
3662 * returns a pointer to its &net entry or %NULL if no such network
3663 * exists.
3664 */
3665 static inline net *net_find(rtable *tab, net_addr *addr)
3666 { DUMMY; }
3667
3668 /**
3669 * net_get - obtain a network entry
3670 * @tab: a routing table
3671 * @addr: address of the network
3672 *
3673 * net_get() looks up the given network in routing table @tab and
3674 * returns a pointer to its &net entry. If no such entry exists, it's
3675 * created.
3676 */
3677 static inline net *net_get(rtable *tab, net_addr *addr)
3678 { DUMMY; }
3679
3680 /**
3681 * rte_cow - copy a route for writing
3682 * @r: a route entry to be copied
3683 *
3684 * rte_cow() takes a &rte and prepares it for modification. The exact action
3685 * taken depends on the flags of the &rte -- if it's a temporary entry, it's
3686 * just returned unchanged, else a new temporary entry with the same contents
3687 * is created.
3688 *
3689 * The primary use of this function is inside the filter machinery -- when
3690 * a filter wants to modify &rte contents (to change the preference or to
3691 * attach another set of attributes), it must ensure that the &rte is not
3692 * shared with anyone else (and especially that it isn't stored in any routing
3693 * table).
3694 *
3695 * Result: a pointer to the new writable &rte.
3696 */
3697 static inline rte * rte_cow(rte *r)
3698 { DUMMY; }
3699
3700 #endif