/*
 * BIRD -- Routing Tables
 *
 * (c) 1998--2000 Martin Mares <mj@ucw.cz>
 *
 * Can be freely distributed and used under the terms of the GNU GPL.
 */

/**
 * DOC: Routing tables
 *
 * Routing tables are probably the most important structures BIRD uses. They
 * hold all the information about known networks, the associated routes and
 * their attributes.
 *
 * There are multiple routing tables (a primary one together with any
 * number of secondary ones if requested by the configuration). Each table
 * is basically a FIB containing entries describing the individual
 * destination networks. For each network (represented by structure &net),
 * there is a one-way linked list of route entries (&rte), the first entry
 * on the list being the best one (i.e., the one we currently use
 * for routing); the order of the other ones is undetermined.
 *
 * The &rte contains information about the route. There are net and src, which
 * together form a key identifying the route in a routing table. There is a
 * pointer to a &rta structure (see the route attribute module for a precise
 * explanation) holding the route attributes, which are primary data about the
 * route. There are several technical fields used by routing table code (route
 * id, REF_* flags). There is also the pflags field, holding protocol-specific
 * flags. They are not used by routing table code, but by protocol-specific
 * hooks. In contrast to route attributes, they are not primary data and their
 * validity is also limited to the routing table.
 *
 * There are several mechanisms that allow automatic update of routes in one
 * routing table (dst) as a result of changes in another routing table (src).
 * They handle issues of recursive next hop resolving, flowspec validation and
 * RPKI validation.
 *
 * The first such mechanism is handling of recursive next hops. A route in the
 * dst table has an indirect next hop address, which is resolved through a route
 * in the src table (which may also be the same table) to get an immediate next
 * hop. This is implemented using structure &hostcache attached to the src
 * table, which contains &hostentry structures for each tracked next hop
 * address. These structures are linked from recursive routes in dst tables,
 * possibly multiple routes sharing one hostentry (as many routes may have the
 * same indirect next hop). There is also a trie in the hostcache, which matches
 * all prefixes that may influence resolving of tracked next hops.
 *
 * When a best route changes in the src table, the hostcache is notified using
 * rt_notify_hostcache(), which immediately checks using the trie whether the
 * change is relevant and if it is, then it schedules asynchronous hostcache
 * recomputation. The recomputation is done by rt_update_hostcache() (called
 * from rt_event() of the src table), which walks through all hostentries and
 * resolves them (by rt_update_hostentry()). It also updates the trie. If a
 * change in hostentry resolution was found, then it schedules asynchronous
 * nexthop recomputation of the associated dst table. That is done by
 * rt_next_hop_update() (called from rt_event() of the dst table), which
 * iterates over all routes in the dst table and re-examines their hostentries
 * for changes. Note that in contrast to hostcache update, next hop update can
 * be interrupted by the main loop. These two full-table walks (over hostcache
 * and dst table) are necessary due to absence of direct lookups
 * (route -> affected nexthop, nexthop -> its route).
 *
 * The second mechanism is for flowspec validation, where validity of flowspec
 * routes depends on resolving their network prefixes in IP routing tables. This
 * is similar to the recursive next hop mechanism, but simpler as there are no
 * intermediate hostcache and hostentries (because flows are less likely to
 * share a common net prefix than routes sharing a common next hop). In the src
 * table, there is a list of dst tables (list flowspec_links); this list is
 * updated by flowspec channels (by rt_flowspec_link() and rt_flowspec_unlink()
 * during channel start/stop). Each dst table has its own trie of prefixes that
 * may influence validation of flowspec routes in it (flowspec_trie).
 *
 * When a best route changes in the src table, rt_flowspec_notify() immediately
 * checks all dst tables from the list using their tries to see whether the
 * change is relevant for them. If it is, then an asynchronous re-validation of
 * flowspec routes in the dst table is scheduled. That is also done by function
 * rt_next_hop_update(), like the nexthop recomputation above. It iterates over
 * all flowspec routes and re-validates them. It also recalculates the trie.
 *
 * Note that in contrast to the hostcache update, here the trie is recalculated
 * during the rt_next_hop_update(), which may be interleaved with IP route
 * updates. The trie is flushed at the beginning of recalculation, which means
 * that such updates may use a partial trie to see if they are relevant. But it
 * works anyway! Either the affected flowspec was already re-validated and added
 * to the trie, then the IP route change would match the trie and trigger a next
 * round of re-validation, or it was not yet re-validated and added to the trie,
 * but will be re-validated later in this round anyway.
 *
 * The third mechanism is used for RPKI re-validation of IP routes and it is the
 * simplest. It is just a list of subscribers in the src table, which are
 * notified whenever any change happens, but only after a settle time. Also, in
 * the RPKI case the dst is not a table, but a channel, which refeeds routes
 * through a filter.
 */
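/*
 * Illustrative sketch (editor's addition, not upstream code): walking the
 * per-network route list described above. The first entry of net->routes is
 * the best route; the remaining entries follow in undetermined order.
 *
 *   uint cnt = 0;
 *   for (rte *e = n->routes; e; e = e->next)
 *     if (rte_is_valid(e))
 *       cnt++;                      // count valid routes for network n
 */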

#undef LOCAL_DEBUG

#include "nest/bird.h"
#include "nest/route.h"
#include "nest/protocol.h"
#include "nest/iface.h"
#include "nest/mpls.h"
#include "lib/resource.h"
#include "lib/event.h"
#include "lib/timer.h"
#include "lib/string.h"
#include "conf/conf.h"
#include "filter/filter.h"
#include "filter/data.h"
#include "lib/hash.h"
#include "lib/alloca.h"
#include "lib/flowspec.h"

#ifdef CONFIG_BGP
#include "proto/bgp/bgp.h"
#endif

pool *rt_table_pool;

static slab *rte_slab;
linpool *rte_update_pool;

list routing_tables;

static void rt_free_hostcache(rtable *tab);
static void rt_notify_hostcache(rtable *tab, net *net);
static void rt_update_hostcache(rtable *tab);
static void rt_next_hop_update(rtable *tab);
static inline void rt_prune_table(rtable *tab);
static inline void rt_schedule_notify(rtable *tab);
static void rt_flowspec_notify(rtable *tab, net *net);
static void rt_kick_prune_timer(rtable *tab);


static void
net_init_with_trie(struct fib *f, void *N)
{
  rtable *tab = SKIP_BACK(rtable, fib, f);
  net *n = N;

  if (tab->trie)
    trie_add_prefix(tab->trie, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen);

  if (tab->trie_new)
    trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen);
}

static inline net *
net_route_ip4_trie(rtable *t, const net_addr_ip4 *n0)
{
  TRIE_WALK_TO_ROOT_IP4(t->trie, n0, n)
  {
    net *r;
    if (r = net_find_valid(t, (net_addr *) &n))
      return r;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}

static inline net *
net_route_vpn4_trie(rtable *t, const net_addr_vpn4 *n0)
{
  TRIE_WALK_TO_ROOT_IP4(t->trie, (const net_addr_ip4 *) n0, px)
  {
    net_addr_vpn4 n = NET_ADDR_VPN4(px.prefix, px.pxlen, n0->rd);

    net *r;
    if (r = net_find_valid(t, (net_addr *) &n))
      return r;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}

static inline net *
net_route_ip6_trie(rtable *t, const net_addr_ip6 *n0)
{
  TRIE_WALK_TO_ROOT_IP6(t->trie, n0, n)
  {
    net *r;
    if (r = net_find_valid(t, (net_addr *) &n))
      return r;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}

static inline net *
net_route_vpn6_trie(rtable *t, const net_addr_vpn6 *n0)
{
  TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px)
  {
    net_addr_vpn6 n = NET_ADDR_VPN6(px.prefix, px.pxlen, n0->rd);

    net *r;
    if (r = net_find_valid(t, (net_addr *) &n))
      return r;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}

static inline void *
net_route_ip6_sadr_trie(rtable *t, const net_addr_ip6_sadr *n0)
{
  TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px)
  {
    net_addr_ip6_sadr n = NET_ADDR_IP6_SADR(px.prefix, px.pxlen, n0->src_prefix, n0->src_pxlen);
    net *best = NULL;
    int best_pxlen = 0;

    /* We need to do dst-first matching. Since sadr addresses are hashed on dst
       prefix only, find the hash table chain and go through it to find the
       match with the longest matching src prefix. */
    for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next)
    {
      net_addr_ip6_sadr *a = (void *) fn->addr;

      if (net_equal_dst_ip6_sadr(&n, a) &&
          net_in_net_src_ip6_sadr(&n, a) &&
          (a->src_pxlen >= best_pxlen))
      {
        best = fib_node_to_user(&t->fib, fn);
        best_pxlen = a->src_pxlen;
      }
    }

    if (best)
      return best;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}
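/*
 * Editor's note on the dst-first rule above: among entries whose dst prefix
 * equals the queried dst, the entry whose src prefix covers the queried src
 * with the greatest src_pxlen wins; shorter dst prefixes are tried only after
 * the current trie level yields no match at all.
 */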

static inline net *
net_route_ip4_fib(rtable *t, const net_addr_ip4 *n0)
{
  net_addr_ip4 n;
  net_copy_ip4(&n, n0);

  net *r;
  while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
  {
    n.pxlen--;
    ip4_clrbit(&n.prefix, n.pxlen);
  }

  return r;
}

static inline net *
net_route_vpn4_fib(rtable *t, const net_addr_vpn4 *n0)
{
  net_addr_vpn4 n;
  net_copy_vpn4(&n, n0);

  net *r;
  while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
  {
    n.pxlen--;
    ip4_clrbit(&n.prefix, n.pxlen);
  }

  return r;
}

static inline net *
net_route_ip6_fib(rtable *t, const net_addr_ip6 *n0)
{
  net_addr_ip6 n;
  net_copy_ip6(&n, n0);

  net *r;
  while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
  {
    n.pxlen--;
    ip6_clrbit(&n.prefix, n.pxlen);
  }

  return r;
}

static inline net *
net_route_vpn6_fib(rtable *t, const net_addr_vpn6 *n0)
{
  net_addr_vpn6 n;
  net_copy_vpn6(&n, n0);

  net *r;
  while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
  {
    n.pxlen--;
    ip6_clrbit(&n.prefix, n.pxlen);
  }

  return r;
}

static inline void *
net_route_ip6_sadr_fib(rtable *t, const net_addr_ip6_sadr *n0)
{
  net_addr_ip6_sadr n;
  net_copy_ip6_sadr(&n, n0);

  while (1)
  {
    net *best = NULL;
    int best_pxlen = 0;

    /* We need to do dst-first matching. Since sadr addresses are hashed on dst
       prefix only, find the hash table chain and go through it to find the
       match with the longest matching src prefix. */
    for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next)
    {
      net_addr_ip6_sadr *a = (void *) fn->addr;

      if (net_equal_dst_ip6_sadr(&n, a) &&
          net_in_net_src_ip6_sadr(&n, a) &&
          (a->src_pxlen >= best_pxlen))
      {
        best = fib_node_to_user(&t->fib, fn);
        best_pxlen = a->src_pxlen;
      }
    }

    if (best)
      return best;

    if (!n.dst_pxlen)
      break;

    n.dst_pxlen--;
    ip6_clrbit(&n.dst_prefix, n.dst_pxlen);
  }

  return NULL;
}

net *
net_route(rtable *tab, const net_addr *n)
{
  ASSERT(tab->addr_type == n->type);

  switch (n->type)
  {
  case NET_IP4:
    if (tab->trie)
      return net_route_ip4_trie(tab, (net_addr_ip4 *) n);
    else
      return net_route_ip4_fib (tab, (net_addr_ip4 *) n);

  case NET_VPN4:
    if (tab->trie)
      return net_route_vpn4_trie(tab, (net_addr_vpn4 *) n);
    else
      return net_route_vpn4_fib (tab, (net_addr_vpn4 *) n);

  case NET_IP6:
    if (tab->trie)
      return net_route_ip6_trie(tab, (net_addr_ip6 *) n);
    else
      return net_route_ip6_fib (tab, (net_addr_ip6 *) n);

  case NET_VPN6:
    if (tab->trie)
      return net_route_vpn6_trie(tab, (net_addr_vpn6 *) n);
    else
      return net_route_vpn6_fib (tab, (net_addr_vpn6 *) n);

  case NET_IP6_SADR:
    if (tab->trie)
      return net_route_ip6_sadr_trie(tab, (net_addr_ip6_sadr *) n);
    else
      return net_route_ip6_sadr_fib (tab, (net_addr_ip6_sadr *) n);

  default:
    return NULL;
  }
}
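/*
 * Usage sketch (editor's illustration): net_route() performs a longest-prefix
 * match. In a table holding 0.0.0.0/0 and 10.0.0.0/8, a query for 10.1.2.0/24
 * returns the &net for 10.0.0.0/8 -- via the trie walk when tab->trie is set,
 * or by clearing host bits one at a time (see the _fib variants) otherwise.
 */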


static int
net_roa_check_ip4_trie(rtable *tab, const net_addr_ip4 *px, u32 asn)
{
  int anything = 0;

  TRIE_WALK_TO_ROOT_IP4(tab->trie, px, px0)
  {
    net_addr_roa4 roa0 = NET_ADDR_ROA4(px0.prefix, px0.pxlen, 0, 0);

    struct fib_node *fn;
    for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next)
    {
      net_addr_roa4 *roa = (void *) fn->addr;
      net *r = fib_node_to_user(&tab->fib, fn);

      if (net_equal_prefix_roa4(roa, &roa0) && rte_is_valid(r->routes))
      {
        anything = 1;
        if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
          return ROA_VALID;
      }
    }
  }
  TRIE_WALK_TO_ROOT_END;

  return anything ? ROA_INVALID : ROA_UNKNOWN;
}

static int
net_roa_check_ip4_fib(rtable *tab, const net_addr_ip4 *px, u32 asn)
{
  struct net_addr_roa4 n = NET_ADDR_ROA4(px->prefix, px->pxlen, 0, 0);
  struct fib_node *fn;
  int anything = 0;

  while (1)
  {
    for (fn = fib_get_chain(&tab->fib, (net_addr *) &n); fn; fn = fn->next)
    {
      net_addr_roa4 *roa = (void *) fn->addr;
      net *r = fib_node_to_user(&tab->fib, fn);

      if (net_equal_prefix_roa4(roa, &n) && rte_is_valid(r->routes))
      {
        anything = 1;
        if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
          return ROA_VALID;
      }
    }

    if (n.pxlen == 0)
      break;

    n.pxlen--;
    ip4_clrbit(&n.prefix, n.pxlen);
  }

  return anything ? ROA_INVALID : ROA_UNKNOWN;
}

static int
net_roa_check_ip6_trie(rtable *tab, const net_addr_ip6 *px, u32 asn)
{
  int anything = 0;

  TRIE_WALK_TO_ROOT_IP6(tab->trie, px, px0)
  {
    net_addr_roa6 roa0 = NET_ADDR_ROA6(px0.prefix, px0.pxlen, 0, 0);

    struct fib_node *fn;
    for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next)
    {
      net_addr_roa6 *roa = (void *) fn->addr;
      net *r = fib_node_to_user(&tab->fib, fn);

      if (net_equal_prefix_roa6(roa, &roa0) && rte_is_valid(r->routes))
      {
        anything = 1;
        if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
          return ROA_VALID;
      }
    }
  }
  TRIE_WALK_TO_ROOT_END;

  return anything ? ROA_INVALID : ROA_UNKNOWN;
}

static int
net_roa_check_ip6_fib(rtable *tab, const net_addr_ip6 *px, u32 asn)
{
  struct net_addr_roa6 n = NET_ADDR_ROA6(px->prefix, px->pxlen, 0, 0);
  struct fib_node *fn;
  int anything = 0;

  while (1)
  {
    for (fn = fib_get_chain(&tab->fib, (net_addr *) &n); fn; fn = fn->next)
    {
      net_addr_roa6 *roa = (void *) fn->addr;
      net *r = fib_node_to_user(&tab->fib, fn);

      if (net_equal_prefix_roa6(roa, &n) && rte_is_valid(r->routes))
      {
        anything = 1;
        if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
          return ROA_VALID;
      }
    }

    if (n.pxlen == 0)
      break;

    n.pxlen--;
    ip6_clrbit(&n.prefix, n.pxlen);
  }

  return anything ? ROA_INVALID : ROA_UNKNOWN;
}

/**
 * roa_check - check validity of route origination in a ROA table
 * @tab: ROA table
 * @n: network prefix to check
 * @asn: AS number of network prefix
 *
 * Implements RFC 6483 route validation for the given network prefix. The
 * procedure is to find all candidate ROAs - ROAs whose prefixes cover the given
 * network prefix. If there is no candidate ROA, return ROA_UNKNOWN. If there is
 * a candidate ROA with matching ASN and maxlen field greater than or equal to
 * the given prefix length, return ROA_VALID. Otherwise, return ROA_INVALID. If
 * the caller cannot determine the origin AS, 0 may be used (in that case
 * ROA_VALID cannot be returned). Table @tab must have type NET_ROA4 or
 * NET_ROA6, network @n must have type NET_IP4 or NET_IP6, respectively.
 */
int
net_roa_check(rtable *tab, const net_addr *n, u32 asn)
{
  if ((tab->addr_type == NET_ROA4) && (n->type == NET_IP4))
  {
    if (tab->trie)
      return net_roa_check_ip4_trie(tab, (const net_addr_ip4 *) n, asn);
    else
      return net_roa_check_ip4_fib (tab, (const net_addr_ip4 *) n, asn);
  }
  else if ((tab->addr_type == NET_ROA6) && (n->type == NET_IP6))
  {
    if (tab->trie)
      return net_roa_check_ip6_trie(tab, (const net_addr_ip6 *) n, asn);
    else
      return net_roa_check_ip6_fib (tab, (const net_addr_ip6 *) n, asn);
  }
  else
    return ROA_UNKNOWN; /* Should not happen */
}
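/*
 * Usage sketch (editor's illustration; variable names are hypothetical):
 *
 *   switch (net_roa_check(roa_tab, n->n.addr, origin_asn))
 *   {
 *   case ROA_VALID:   break;                   // origin covered by a ROA
 *   case ROA_INVALID: reject_route();  break;  // covered, but ASN/maxlen mismatch
 *   case ROA_UNKNOWN: apply_policy();  break;  // no covering ROA at all
 *   }
 */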

/**
 * rte_find - find a route
 * @net: network node
 * @src: route source
 *
 * The rte_find() function returns a route for destination @net
 * which is from route source @src.
 */
rte *
rte_find(net *net, struct rte_src *src)
{
  rte *e = net->routes;

  while (e && e->src != src)
    e = e->next;
  return e;
}

/**
 * rte_get_temp - get a temporary &rte
 * @a: attributes to assign to the new route (a &rta; in case it's
 * un-cached, rte_update() will create a cached copy automatically)
 * @src: route source
 *
 * Create a temporary &rte and bind it with the attributes @a.
 */
rte *
rte_get_temp(rta *a, struct rte_src *src)
{
  rte *e = sl_alloc(rte_slab);

  e->attrs = a;
  e->id = 0;
  e->flags = 0;
  e->pflags = 0;
  rt_lock_source(e->src = src);
  return e;
}

rte *
rte_do_cow(rte *r)
{
  rte *e = sl_alloc(rte_slab);

  memcpy(e, r, sizeof(rte));

  rt_lock_source(e->src);
  e->attrs = rta_clone(r->attrs);
  e->flags = 0;
  return e;
}

/**
 * rte_cow_rta - get a private writable copy of &rte with writable &rta
 * @r: a route entry to be copied
 * @lp: a linpool from which to allocate &rta
 *
 * rte_cow_rta() takes a &rte and prepares it and the associated &rta for
 * modification. There are three possibilities: First, both &rte and &rta are
 * private copies, in which case they are returned unchanged. Second, &rte is a
 * private copy, but &rta is cached, in which case &rta is duplicated using
 * rta_do_cow(). Third, &rte is shared and &rta is cached, in which case
 * both structures are duplicated by rte_do_cow() and rta_do_cow().
 *
 * Note that in the second case, the cached &rta loses one reference, while the
 * private copy created by rta_do_cow() is a shallow copy sharing indirect data
 * (eattrs, nexthops, ...) with it. To work properly, the original shared &rta
 * should have another reference during the life of the created private copy.
 *
 * Result: a pointer to the new writable &rte with writable &rta.
 */
rte *
rte_cow_rta(rte *r, linpool *lp)
{
  if (!rta_is_cached(r->attrs))
    return r;

  r = rte_cow(r);
  rta *a = rta_do_cow(r->attrs, lp);
  rta_free(r->attrs);
  r->attrs = a;
  return r;
}
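/*
 * Usage sketch (editor's illustration): before modifying attributes of a
 * route that may be shared, obtain private copies first, as rt_export_merged()
 * below does when linking merged next hops:
 *
 *   e = rte_cow_rta(e, pool);   // now both e and e->attrs are private
 *   ... modify e->attrs ...     // safe; the cached original stays intact
 */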

static int /* Actually better or at least as good as */
rte_better(rte *new, rte *old)
{
  int (*better)(rte *, rte *);

  if (!rte_is_valid(old))
    return 1;
  if (!rte_is_valid(new))
    return 0;

  if (new->attrs->pref > old->attrs->pref)
    return 1;
  if (new->attrs->pref < old->attrs->pref)
    return 0;
  if (new->src->proto->proto != old->src->proto->proto)
  {
    /*
     * If the user has configured protocol preferences so that two different
     * protocols have the same preference, try to break the tie by comparing
     * addresses. Not too useful, but it keeps the ordering of routes
     * unambiguous.
     */
    return new->src->proto->proto > old->src->proto->proto;
  }
  if (better = new->src->proto->rte_better)
    return better(new, old);
  return 0;
}

static int
rte_mergable(rte *pri, rte *sec)
{
  int (*mergable)(rte *, rte *);

  if (!rte_is_valid(pri) || !rte_is_valid(sec))
    return 0;

  if (pri->attrs->pref != sec->attrs->pref)
    return 0;

  if (pri->src->proto->proto != sec->src->proto->proto)
    return 0;

  if (mergable = pri->src->proto->rte_mergable)
    return mergable(pri, sec);

  return 0;
}

static void
rte_trace(struct channel *c, rte *e, int dir, char *msg)
{
  log(L_TRACE "%s.%s %c %s %N %luL %uG %s",
      c->proto->name, c->name ?: "?", dir, msg, e->net->n.addr, e->src->private_id, e->src->global_id,
      rta_dest_name(e->attrs->dest));
}

static inline void
rte_trace_in(uint flag, struct channel *c, rte *e, char *msg)
{
  if ((c->debug & flag) || (c->proto->debug & flag))
    rte_trace(c, e, '>', msg);
}

static inline void
rte_trace_out(uint flag, struct channel *c, rte *e, char *msg)
{
  if ((c->debug & flag) || (c->proto->debug & flag))
    rte_trace(c, e, '<', msg);
}

static rte *
export_filter_(struct channel *c, rte *rt0, rte **rt_free, linpool *pool, int silent)
{
  struct proto *p = c->proto;
  const struct filter *filter = c->out_filter;
  struct proto_stats *stats = &c->stats;
  rte *rt;
  int v;

  rt = rt0;
  *rt_free = NULL;

  v = p->preexport ? p->preexport(c, rt) : 0;
  if (v < 0)
  {
    if (silent)
      goto reject;

    stats->exp_updates_rejected++;
    if (v == RIC_REJECT)
      rte_trace_out(D_FILTERS, c, rt, "rejected by protocol");
    goto reject;
  }
  if (v > 0)
  {
    if (!silent)
      rte_trace_out(D_FILTERS, c, rt, "forced accept by protocol");
    goto accept;
  }

  v = filter && ((filter == FILTER_REJECT) ||
                 (f_run(filter, &rt, pool,
                        (silent ? FF_SILENT : 0)) > F_ACCEPT));
  if (v)
  {
    if (silent)
      goto reject;

    stats->exp_updates_filtered++;
    rte_trace_out(D_FILTERS, c, rt, "filtered out");
    goto reject;
  }

 accept:
  if (rt != rt0)
    *rt_free = rt;
  return rt;

 reject:
  /* Discard temporary rte */
  if (rt != rt0)
    rte_free(rt);
  return NULL;
}

static inline rte *
export_filter(struct channel *c, rte *rt0, rte **rt_free, int silent)
{
  return export_filter_(c, rt0, rt_free, rte_update_pool, silent);
}

static void
do_rt_notify(struct channel *c, net *net, rte *new, rte *old, int refeed)
{
  struct proto *p = c->proto;
  struct proto_stats *stats = &c->stats;

  if (refeed && new)
    c->refeed_count++;

  /* Apply export limit */
  struct channel_limit *l = &c->out_limit;
  if (l->action && !old && new)
  {
    if (stats->exp_routes >= l->limit)
      channel_notify_limit(c, l, PLD_OUT, stats->exp_routes);

    if (l->state == PLS_BLOCKED)
    {
      stats->exp_updates_rejected++;
      rte_trace_out(D_FILTERS, c, new, "rejected [limit]");
      return;
    }
  }

  /* Apply export table */
  if (c->out_table && !rte_update_out(c, net->n.addr, new, old, refeed))
    return;

  if (new)
    stats->exp_updates_accepted++;
  else
    stats->exp_withdraws_accepted++;

  if (old)
  {
    bmap_clear(&c->export_map, old->id);
    stats->exp_routes--;
  }

  if (new)
  {
    bmap_set(&c->export_map, new->id);
    stats->exp_routes++;
  }

  if (p->debug & D_ROUTES)
  {
    if (new && old)
      rte_trace_out(D_ROUTES, c, new, "replaced");
    else if (new)
      rte_trace_out(D_ROUTES, c, new, "added");
    else if (old)
      rte_trace_out(D_ROUTES, c, old, "removed");
  }

  p->rt_notify(p, c, net, new, old);
}

static void
rt_notify_basic(struct channel *c, net *net, rte *new, rte *old, int refeed)
{
  // struct proto *p = c->proto;
  rte *new_free = NULL;

  if (new)
    c->stats.exp_updates_received++;
  else
    c->stats.exp_withdraws_received++;

  if (new)
    new = export_filter(c, new, &new_free, 0);

  if (old && !bmap_test(&c->export_map, old->id))
    old = NULL;

  if (!new && !old)
    return;

  do_rt_notify(c, net, new, old, refeed);

  /* Discard temporary rte */
  if (new_free)
    rte_free(new_free);
}

static void
rt_notify_accepted(struct channel *c, net *net, rte *new_changed, rte *old_changed, int refeed)
{
  // struct proto *p = c->proto;
  rte *new_best = NULL;
  rte *old_best = NULL;
  rte *new_free = NULL;
  int new_first = 0;

  /*
   * We assume that there are no changes in net route order except (added)
   * new_changed and (removed) old_changed. Therefore, the function is not
   * compatible with deterministic_med (where nontrivial reordering can happen
   * as a result of a route change) and with recomputation of recursive routes
   * due to next hop update (where many routes can be changed in one step).
   *
   * Note that we need this assumption just for optimizations, we could just
   * run full new_best recomputation otherwise.
   *
   * There are three cases:
   * feed or old_best is old_changed -> we need to recompute new_best
   * old_best is before new_changed -> new_best is old_best, ignore
   * old_best is after new_changed -> try new_changed, otherwise old_best
   */

  if (net->routes)
    c->stats.exp_updates_received++;
  else
    c->stats.exp_withdraws_received++;

  /* Find old_best - either old_changed, or route for net->routes */
  if (old_changed && bmap_test(&c->export_map, old_changed->id))
    old_best = old_changed;
  else
  {
    for (rte *r = net->routes; rte_is_valid(r); r = r->next)
    {
      if (bmap_test(&c->export_map, r->id))
      {
        old_best = r;
        break;
      }

      /* Note if new_changed found before old_best */
      if (r == new_changed)
        new_first = 1;
    }
  }

  /* Find new_best */
  if ((new_changed == old_changed) || (old_best == old_changed))
  {
    /* Feed or old_best changed -> find first accepted by filters */
    for (rte *r = net->routes; rte_is_valid(r); r = r->next)
      if (new_best = export_filter(c, r, &new_free, 0))
        break;
  }
  else
  {
    /* Other cases -> either new_changed, or old_best (and nothing changed) */
    if (new_first && (new_changed = export_filter(c, new_changed, &new_free, 0)))
      new_best = new_changed;
    else
      return;
  }

  if (!new_best && !old_best)
    return;

  do_rt_notify(c, net, new_best, old_best, refeed);

  /* Discard temporary rte */
  if (new_free)
    rte_free(new_free);
}


static struct nexthop *
nexthop_merge_rta(struct nexthop *nhs, rta *a, linpool *pool, int max)
{
  return nexthop_merge(nhs, &(a->nh), 1, 0, max, pool);
}

rte *
rt_export_merged(struct channel *c, net *net, rte **rt_free, linpool *pool, int silent)
{
  // struct proto *p = c->proto;
  struct nexthop *nhs = NULL;
  rte *best0, *best, *rt0, *rt, *tmp;

  best0 = net->routes;
  *rt_free = NULL;

  if (!rte_is_valid(best0))
    return NULL;

  best = export_filter_(c, best0, rt_free, pool, silent);

  if (!best || !rte_is_reachable(best))
    return best;

  for (rt0 = best0->next; rt0; rt0 = rt0->next)
  {
    if (!rte_mergable(best0, rt0))
      continue;

    rt = export_filter_(c, rt0, &tmp, pool, 1);

    if (!rt)
      continue;

    if (rte_is_reachable(rt))
      nhs = nexthop_merge_rta(nhs, rt->attrs, pool, c->merge_limit);

    if (tmp)
      rte_free(tmp);
  }

  if (nhs)
  {
    nhs = nexthop_merge_rta(nhs, best->attrs, pool, c->merge_limit);

    if (nhs->next)
    {
      best = rte_cow_rta(best, pool);
      nexthop_link(best->attrs, nhs);
    }
  }

  if (best != best0)
    *rt_free = best;

  return best;
}

static void
rt_notify_merged(struct channel *c, net *net, rte *new_changed, rte *old_changed,
                 rte *new_best, rte *old_best, int refeed)
{
  // struct proto *p = c->proto;
  rte *new_free = NULL;

  /* We assume that all rte arguments are either NULL or rte_is_valid() */

  /* This check should be done by the caller */
  if (!new_best && !old_best)
    return;

  /* Check whether the change is relevant to the merged route */
  if ((new_best == old_best) &&
      (new_changed != old_changed) &&
      !rte_mergable(new_best, new_changed) &&
      !rte_mergable(old_best, old_changed))
    return;

  if (new_best)
    c->stats.exp_updates_received++;
  else
    c->stats.exp_withdraws_received++;

  /* Prepare new merged route */
  if (new_best)
    new_best = rt_export_merged(c, net, &new_free, rte_update_pool, 0);

  /* Check old merged route */
  if (old_best && !bmap_test(&c->export_map, old_best->id))
    old_best = NULL;

  if (!new_best && !old_best)
    return;

  do_rt_notify(c, net, new_best, old_best, refeed);

  /* Discard temporary rte */
  if (new_free)
    rte_free(new_free);
}


/**
 * rte_announce - announce a routing table change
 * @tab: table the route has been added to
 * @type: type of route announcement (RA_UNDEF or RA_ANY)
 * @net: network in question
 * @new: the new or changed route
 * @old: the previous route replaced by the new one
 * @new_best: the new best route for the same network
 * @old_best: the previous best route for the same network
 *
 * This function gets a routing table update and announces it to all protocols
 * that are connected to the same table by their channels.
 *
 * There are two ways of how routing table changes are announced. First, there
 * is a change of just one route in @net (which may cause a change of the best
 * route of the network). In this case @new and @old describe the changed route
 * and @new_best and @old_best describe the best routes. Other routes are not
 * affected, but in a sorted table the order of other routes might change.
 *
 * Second, there is a bulk change of multiple routes in @net, with shared best
 * route selection. In such a case separate route changes are described using
 * @type of %RA_ANY, with @new and @old specifying the changed route, while
 * @new_best and @old_best are NULL. After that, another notification is done
 * where @new_best and @old_best are filled (may be the same), but @new and @old
 * are NULL.
 *
 * The function announces the change to all associated channels. For each
 * channel, an appropriate preprocessing is done according to channel &ra_mode.
 * For example, %RA_OPTIMAL channels receive just changes of best routes.
 *
 * In general, we first call the preexport() hook of a protocol, which performs
 * basic checks on the route (each protocol has a right to veto or force accept
 * of the route before any filter is asked). Then we consult an export filter
 * of the channel and verify the old route in an export map of the channel.
 * Finally, the rt_notify() hook of the protocol gets called.
 *
 * Note that there are also calls of rt_notify() hooks due to feed, but that is
 * done outside of the scope of rte_announce().
 */
static void
rte_announce(rtable *tab, uint type, net *net, rte *new, rte *old,
             rte *new_best, rte *old_best)
{
  if (!rte_is_valid(new))
    new = NULL;

  if (!rte_is_valid(old))
    old = NULL;

  if (!rte_is_valid(new_best))
    new_best = NULL;

  if (!rte_is_valid(old_best))
    old_best = NULL;

  if (!new && !old && !new_best && !old_best)
    return;

  if (new_best != old_best)
  {
    if (new_best)
      new_best->sender->stats.pref_routes++;
    if (old_best)
      old_best->sender->stats.pref_routes--;

    if (tab->hostcache)
      rt_notify_hostcache(tab, net);

    if (!EMPTY_LIST(tab->flowspec_links))
      rt_flowspec_notify(tab, net);
  }

  rt_schedule_notify(tab);

  struct channel *c; node *n;
  WALK_LIST2(c, n, tab->channels, table_node)
  {
    if (c->export_state == ES_DOWN)
      continue;

    if (type && (type != c->ra_mode))
      continue;

    switch (c->ra_mode)
    {
    case RA_OPTIMAL:
      if (new_best != old_best)
        rt_notify_basic(c, net, new_best, old_best, 0);
      break;

    case RA_ANY:
      if (new != old)
        rt_notify_basic(c, net, new, old, 0);
      break;

    case RA_ACCEPTED:
      /*
       * The (new != old) condition is problematic here, as it would break
       * the second usage pattern (announcement after bulk change, used in
       * rt_next_hop_update_net(), which sends both new and old as NULL).
       *
       * But recursive next hops do not work with sorted tables anyway,
       * such a configuration is forbidden in BGP and not supported in
       * rt_notify_accepted().
       *
       * The condition is needed to eliminate spurious announcements where
       * both old and new routes are not valid (so they are NULL).
       */
      if (new != old)
        rt_notify_accepted(c, net, new, old, 0);
      break;

    case RA_MERGED:
      rt_notify_merged(c, net, new, old, new_best, old_best, 0);
      break;
    }
  }
}
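/*
 * The two announcement patterns from the DOC above, schematically (editor's
 * illustration):
 *
 *   single route change:
 *     rte_announce(tab, RA_UNDEF, net, new, old, new_best, old_best);
 *
 *   bulk change (e.g. from rt_next_hop_update_net()):
 *     rte_announce(tab, RA_ANY, net, new, old, NULL, NULL);    // per route
 *     rte_announce(tab, RA_UNDEF, net, NULL, NULL, new_best, old_best);
 */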

static inline int
rte_validate(rte *e)
{
  int c;
  net *n = e->net;

  if (!net_validate(n->n.addr))
  {
    log(L_WARN "Ignoring bogus prefix %N received via %s",
        n->n.addr, e->sender->proto->name);
    return 0;
  }

  /* FIXME: better handling of different nettypes */
  c = !net_is_flow(n->n.addr) ?
    net_classify(n->n.addr): (IADDR_HOST | SCOPE_UNIVERSE);
  if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
  {
    log(L_WARN "Ignoring bogus route %N received via %s",
        n->n.addr, e->sender->proto->name);
    return 0;
  }

  if (net_type_match(n->n.addr, NB_DEST) == !e->attrs->dest)
  {
    /* Exception for flowspec that failed validation */
    if (net_is_flow(n->n.addr) && (e->attrs->dest == RTD_UNREACHABLE))
      return 1;

    log(L_WARN "Ignoring route %N with invalid dest %d received via %s",
        n->n.addr, e->attrs->dest, e->sender->proto->name);
    return 0;
  }

  if ((e->attrs->dest == RTD_UNICAST) && !nexthop_is_sorted(&(e->attrs->nh)))
  {
    log(L_WARN "Ignoring unsorted multipath route %N received via %s",
        n->n.addr, e->sender->proto->name);
    return 0;
  }

  return 1;
}

/**
 * rte_free - delete a &rte
 * @e: &rte to be deleted
 *
 * rte_free() deletes the given &rte from the routing table it's linked to.
 */
void
rte_free(rte *e)
{
  rt_unlock_source(e->src);
  if (rta_is_cached(e->attrs))
    rta_free(e->attrs);
  sl_free(e);
}

static inline void
rte_free_quick(rte *e)
{
  rt_unlock_source(e->src);
  rta_free(e->attrs);
  sl_free(e);
}

int
rte_same(rte *x, rte *y)
{
  /* rte.flags / rte.pflags are not checked, as they are internal to rtable */
  return
    x->attrs == y->attrs &&
    x->src == y->src &&
    rte_is_filtered(x) == rte_is_filtered(y);
}

static inline int rte_is_ok(rte *e) { return e && !rte_is_filtered(e); }

static void
rte_recalculate(struct channel *c, net *net, rte *new, struct rte_src *src)
{
  struct proto *p = c->proto;
  struct rtable *table = c->table;
  struct proto_stats *stats = &c->stats;
  static struct tbf rl_pipe = TBF_DEFAULT_LOG_LIMITS;
  rte *before_old = NULL;
  rte *old_best = net->routes;
  rte *old = NULL;
  rte **k;

  k = &net->routes; /* Find and remove original route from the same protocol */
  while (old = *k)
  {
    if (old->src == src)
    {
      /* If there is the same route in the routing table but from
       * a different sender, then there are two paths from the
       * source protocol to this routing table through transparent
       * pipes, which is not allowed.
       *
       * We log that and ignore the route. If it is a withdraw, we
       * ignore it completely (there might be 'spurious withdraws',
       * see FIXME in do_rte_announce())
       */
      if (old->sender->proto != p)
      {
        if (new)
        {
          log_rl(&rl_pipe, L_ERR "Pipe collision detected when sending %N to table %s",
                 net->n.addr, table->name);
          rte_free_quick(new);
        }
        return;
      }

      if (new && rte_same(old, new))
      {
        /* No changes, ignore the new route and refresh the old one */

        old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);

        if (!rte_is_filtered(new))
        {
          stats->imp_updates_ignored++;
          rte_trace_in(D_ROUTES, c, new, "ignored");
        }

        rte_free_quick(new);
        return;
      }
      *k = old->next;
      table->rt_count--;
      break;
    }
    k = &old->next;
    before_old = old;
  }

  /* Save the last accessed position */
  rte **pos = k;

  if (!old)
    before_old = NULL;

  if (!old && !new)
  {
    stats->imp_withdraws_ignored++;
    return;
  }

  int new_ok = rte_is_ok(new);
  int old_ok = rte_is_ok(old);

  struct channel_limit *l = &c->rx_limit;
  if (l->action && !old && new && !c->in_table)
  {
    u32 all_routes = stats->imp_routes + stats->filt_routes;

    if (all_routes >= l->limit)
      channel_notify_limit(c, l, PLD_RX, all_routes);

    if (l->state == PLS_BLOCKED)
    {
      /* In the receive limit case the situation is simple: old is NULL, so
         we just free new and exit as if nothing happened */

      stats->imp_updates_ignored++;
      rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
      rte_free_quick(new);
      return;
    }
  }

  l = &c->in_limit;
  if (l->action && !old_ok && new_ok)
  {
    if (stats->imp_routes >= l->limit)
      channel_notify_limit(c, l, PLD_IN, stats->imp_routes);

    if (l->state == PLS_BLOCKED)
    {
      /* In the import limit case the situation is more complicated. We
         shouldn't just drop the route, we should handle it like
         it was filtered. We also have to continue the route
         processing if old or new is non-NULL, but we should exit
         if both are NULL as this case is probably assumed to be
         already handled. */

      stats->imp_updates_ignored++;
      rte_trace_in(D_FILTERS, c, new, "ignored [limit]");

      if (c->in_keep_filtered)
        new->flags |= REF_FILTERED;
      else
        { rte_free_quick(new); new = NULL; }

      /* Note that old && !new could be possible when
         c->in_keep_filtered changed in the recent past. */

      if (!old && !new)
        return;

      new_ok = 0;
      goto skip_stats1;
    }
  }

  if (new_ok)
    stats->imp_updates_accepted++;
  else if (old_ok)
    stats->imp_withdraws_accepted++;
  else
    stats->imp_withdraws_ignored++;

  if (old_ok || new_ok)
    table->last_rt_change = current_time();

 skip_stats1:

  if (new)
    rte_is_filtered(new) ? stats->filt_routes++ : stats->imp_routes++;
  if (old)
    rte_is_filtered(old) ? stats->filt_routes-- : stats->imp_routes--;

  if (table->config->sorted)
  {
    /* If routes are sorted, just insert the new route at the appropriate position */
    if (new)
    {
      if (before_old && !rte_better(new, before_old))
        k = &before_old->next;
      else
        k = &net->routes;

      for (; *k; k=&(*k)->next)
        if (rte_better(new, *k))
          break;

      new->next = *k;
      *k = new;

      table->rt_count++;
    }
  }
  else
  {
    /* If routes are not sorted, find the best route and move it to
       the first position. There are several optimized cases. */

    if (src->proto->rte_recalculate && src->proto->rte_recalculate(table, net, new, old, old_best))
      goto do_recalculate;

    if (new && rte_better(new, old_best))
    {
      /* The first case - the new route is clearly optimal,
         we link it at the first position */

      new->next = net->routes;
      net->routes = new;

      table->rt_count++;
    }
    else if (old == old_best)
    {
      /* The second case - the old best route disappeared; we add the
         new route (if we have any) to the list (we don't care about
         its position) and then we elect the new optimal route, relink
         that route at the first position and announce it. The new
         optimal route might be NULL if there are no routes left */

    do_recalculate:
      /* Add the new route to the list */
      if (new)
      {
        new->next = *pos;
        *pos = new;

        table->rt_count++;
      }

      /* Find a new optimal route (if there is any) */
      if (net->routes)
      {
        rte **bp = &net->routes;
        for (k=&(*bp)->next; *k; k=&(*k)->next)
          if (rte_better(*k, *bp))
            bp = k;

        /* And relink it */
        rte *best = *bp;
        *bp = best->next;
        best->next = net->routes;
        net->routes = best;
      }
    }
    else if (new)
    {
      /* The third case - the new route is not better than the old
         best route (therefore old_best != NULL) and the old best
         route was not removed (therefore old_best == net->routes).
         We just link the new route to the old/last position. */

      new->next = *pos;
      *pos = new;

      table->rt_count++;
    }
    /* The fourth (empty) case - a suboptimal route was removed, nothing to do */
  }

  if (new)
  {
    new->lastmod = current_time();

    if (!old)
    {
      new->id = hmap_first_zero(&table->id_map);
      hmap_set(&table->id_map, new->id);
    }
    else
      new->id = old->id;
  }

  /* Log the route change */
  if ((c->debug & D_ROUTES) || (p->debug & D_ROUTES))
  {
    if (new_ok)
      rte_trace(c, new, '>', new == net->routes ? "added [best]" : "added");
    else if (old_ok)
    {
      if (old != old_best)
        rte_trace(c, old, '>', "removed");
      else if (rte_is_ok(net->routes))
        rte_trace(c, old, '>', "removed [replaced]");
      else
        rte_trace(c, old, '>', "removed [sole]");
    }
  }

  /* Propagate the route change */
  rte_announce(table, RA_UNDEF, net, new, old, net->routes, old_best);

  if (!net->routes &&
      (table->gc_counter++ >= table->config->gc_threshold))
    rt_kick_prune_timer(table);

  if (old_ok && p->rte_remove)
    p->rte_remove(net, old);
  if (new_ok && p->rte_insert)
    p->rte_insert(net, new);

  if (old)
  {
    if (!new)
      hmap_clear(&table->id_map, old->id);

    rte_free_quick(old);
  }
}

static int rte_update_nest_cnt; /* Nesting counter to allow recursive updates */

static inline void
rte_update_lock(void)
{
  rte_update_nest_cnt++;
}

static inline void
rte_update_unlock(void)
{
  if (!--rte_update_nest_cnt)
    lp_flush(rte_update_pool);
}
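/*
 * Editor's note: the lock is a plain nesting counter, so updates may recurse
 * (e.g. a pipe propagating rte_update() into another table). All nested
 * levels share rte_update_pool; it is flushed only when the outermost
 * rte_update_unlock() brings the counter back to zero.
 */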

/**
 * rte_update - enter a new update to a routing table
 * @table: table to be updated
 * @c: channel doing the update
 * @net: network node
 * @p: protocol submitting the update
 * @src: protocol originating the update
 * @new: a &rte representing the new route or %NULL for route removal.
 *
 * This function is called by the routing protocols whenever they discover
 * a new route or wish to update/remove an existing route. The right announcement
 * sequence is to build route attributes first (either un-cached with @aflags set
 * to zero or a cached one using rta_lookup(); in this case please note that
 * you need to increase the use count of the attributes yourself by calling
 * rta_clone()), call rte_get_temp() to obtain a temporary &rte, fill in all
 * the appropriate data and finally submit the new &rte by calling rte_update().
 *
 * @src specifies the protocol that originally created the route and the meaning
 * of protocol-dependent data of @new. If @new is not %NULL, @src has to be the
 * same value as @new->attrs->proto. @p specifies the protocol that called
 * rte_update(). In most cases it is the same protocol as @src. rte_update()
 * stores @p in @new->sender.
 *
 * When rte_update() gets any route, it automatically validates it (checks
 * whether the network and next hop address are valid IP addresses and also
 * whether a normal routing protocol doesn't try to smuggle a host or link
 * scope route into the table), converts all protocol-dependent attributes stored
 * in the &rte to temporary extended attributes, consults import filters of the
 * protocol to see if the route should be accepted and/or its attributes modified,
 * and stores the temporary attributes back to the &rte.
 *
 * Now, having a "public" version of the route, we
 * automatically find any old route defined by the protocol @src
 * for network @n, replace it by the new one (or remove it if @new is %NULL),
 * recalculate the optimal route for this destination and finally broadcast
 * the change (if any) to all routing protocols by calling rte_announce().
 *
 * All memory used for attribute lists and other temporary allocations is taken
 * from a special linear pool @rte_update_pool and freed when rte_update()
 * finishes.
 */

void
rte_update2(struct channel *c, const net_addr *n, rte *new, struct rte_src *src)
{
  struct proto *p = c->proto;
  struct proto_stats *stats = &c->stats;
  const struct filter *filter = c->in_filter;
  struct mpls_fec *fec = NULL;
  net *nn;

  ASSERT(c->channel_state == CS_UP);

  rte_update_lock();
  if (new)
  {
    /* Create a temporary table node */
    nn = alloca(sizeof(net) + n->length);
    memset(nn, 0, sizeof(net) + n->length);
    net_copy(nn->n.addr, n);

    new->net = nn;
    new->sender = c;

    stats->imp_updates_received++;
    if (!rte_validate(new))
    {
      rte_trace_in(D_FILTERS, c, new, "invalid");
      stats->imp_updates_invalid++;
      goto drop;
    }

    if (filter == FILTER_REJECT)
    {
      stats->imp_updates_filtered++;
      rte_trace_in(D_FILTERS, c, new, "filtered out");

      if (! c->in_keep_filtered)
        goto drop;

      /* new is a private copy, we can modify it */
      new->flags |= REF_FILTERED;
    }
    else if (filter)
    {
      int fr = f_run(filter, &new, rte_update_pool, 0);
      if (fr > F_ACCEPT)
      {
        stats->imp_updates_filtered++;
        rte_trace_in(D_FILTERS, c, new, "filtered out");

        if (! c->in_keep_filtered)
          goto drop;

        new->flags |= REF_FILTERED;
      }
    }

    if (p->mpls_map)
    {
      if (mpls_handle_rte(p->mpls_map, n, new, rte_update_pool, &fec) < 0)
      {
        rte_trace_in(D_FILTERS, c, new, "invalid");
        stats->imp_updates_invalid++;
        goto drop;
      }
    }

    if (!rta_is_cached(new->attrs)) /* Need to copy attributes */
      new->attrs = rta_lookup(new->attrs);
    new->flags |= REF_COW;

    /* Use the actual struct network, not the dummy one */
    nn = net_get(c->table, n);
    new->net = nn;
  }
  else
  {
    stats->imp_withdraws_received++;

    if (!(nn = net_find(c->table, n)) || !src)
    {
      stats->imp_withdraws_ignored++;
      rte_update_unlock();
      return;
    }
  }

 recalc:
  /* And recalculate the best route */
  rte_recalculate(c, nn, new, src);

  if (p->mpls_map)
    mpls_handle_rte_cleanup(p->mpls_map, &fec);

  rte_update_unlock();
  return;

 drop:
  rte_free(new);
  new = NULL;
  if (nn = net_find(c->table, n))
    goto recalc;

  rte_update_unlock();
}
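/*
 * Sketch of the sequence described in the DOC above (editor's illustration;
 * attribute setup abbreviated, field values are placeholders):
 *
 *   rta a0 = { .source = RTS_STATIC, .scope = SCOPE_UNIVERSE, ... };
 *   rte *e = rte_get_temp(&a0, src);  // un-cached rta gets cached inside
 *   rte_update2(c, n, e, src);        // insert/replace the route
 *   rte_update2(c, n, NULL, src);     // ... and NULL withdraws it again
 */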

/* Independent call to rte_announce(), used from next hop
   recalculation, outside of rte_update(). new must be non-NULL */
static inline void
rte_announce_i(rtable *tab, uint type, net *net, rte *new, rte *old,
               rte *new_best, rte *old_best)
{
  rte_update_lock();
  rte_announce(tab, type, net, new, old, new_best, old_best);
  rte_update_unlock();
}

static inline void
rte_discard(rte *old) /* Non-filtered route deletion, used during garbage collection */
{
  rte_update_lock();
  rte_recalculate(old->sender, old->net, NULL, old->src);
  rte_update_unlock();
}

/* Modify existing route by protocol hook, used for long-lived graceful restart */
static inline void
rte_modify(rte *old)
{
  rte_update_lock();

  rte *new = old->sender->proto->rte_modify(old, rte_update_pool);
  if (new != old)
  {
    if (new)
    {
      if (!rta_is_cached(new->attrs))
        new->attrs = rta_lookup(new->attrs);
      new->flags = (old->flags & ~REF_MODIFY) | REF_COW;
    }

    rte_recalculate(old->sender, old->net, new, old->src);
  }

  rte_update_unlock();
}

/* Check rtable for the best route to a given net, whether it would be exported to p */
int
rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter)
{
  struct proto *p = c->proto;
  net *n = net_find(t, a);
  rte *rt = n ? n->routes : NULL;

  if (!rte_is_valid(rt))
    return 0;

  rte_update_lock();

  /* Rest is a stripped-down export_filter() */
  int v = p->preexport ? p->preexport(c, rt) : 0;
  if (v == RIC_PROCESS)
    v = (f_run(filter, &rt, rte_update_pool, FF_SILENT) <= F_ACCEPT);

  /* Discard temporary rte */
  if (rt != n->routes)
    rte_free(rt);

  rte_update_unlock();

  return v > 0;
}


/**
 * rt_refresh_begin - start a refresh cycle
 * @t: related routing table
 * @c: related channel
 *
 * This function starts a refresh cycle for the given routing table and channel.
 * The refresh cycle is a sequence where the protocol sends all its valid
 * routes to the routing table (by rte_update()). After that, all protocol
 * routes (more precisely routes with @c as @sender) not sent during the
 * refresh cycle but still in the table from the past are pruned. This is
 * implemented by marking all related routes as stale by the REF_STALE flag in
 * rt_refresh_begin(), then marking all related stale routes with the
 * REF_DISCARD flag in rt_refresh_end() and then removing such routes in the
 * prune loop.
 */
void
rt_refresh_begin(rtable *t, struct channel *c)
{
  if (c->debug & D_EVENTS)
    log(L_TRACE "%s.%s: Route refresh begin", c->proto->name, c->name);

  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for (e = n->routes; e; e = e->next)
      if (e->sender == c)
        e->flags |= REF_STALE;
  }
  FIB_WALK_END;
}

/**
 * rt_refresh_end - end a refresh cycle
 * @t: related routing table
 * @c: related channel
 *
 * This function ends a refresh cycle for the given routing table and channel.
 * See rt_refresh_begin() for a description of refresh cycles.
 */
void
rt_refresh_end(rtable *t, struct channel *c)
{
  if (c->debug & D_EVENTS)
    log(L_TRACE "%s.%s: Route refresh end", c->proto->name, c->name);

  int prune = 0;

  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for (e = n->routes; e; e = e->next)
      if ((e->sender == c) && (e->flags & REF_STALE))
      {
        e->flags |= REF_DISCARD;
        prune = 1;
      }
  }
  FIB_WALK_END;

  if (prune)
    rt_schedule_prune(t);
}
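/*
 * The refresh cycle described above, from a protocol's point of view
 * (editor's illustration):
 *
 *   rt_refresh_begin(tab, c);      // all of c's routes get REF_STALE
 *   ... rte_update2() for every still-valid route ...
 *                                  // re-sent routes lose REF_STALE again
 *   rt_refresh_end(tab, c);        // leftovers get REF_DISCARD, prune runs
 */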

void
rt_modify_stale(rtable *t, struct channel *c)
{
  int prune = 0;

  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for (e = n->routes; e; e = e->next)
      if ((e->sender == c) && (e->flags & REF_STALE) && !(e->flags & REF_FILTERED))
      {
        e->flags |= REF_MODIFY;
        prune = 1;
      }
  }
  FIB_WALK_END;

  if (prune)
    rt_schedule_prune(t);
}

/**
 * rte_dump - dump a route
 * @e: &rte to be dumped
 *
 * This function dumps the contents of a &rte to debug output.
 */
void
rte_dump(rte *e)
{
  net *n = e->net;
  debug("%-1N ", n->n.addr);
  debug("PF=%02x ", e->pflags);
  rta_dump(e->attrs);
  debug("\n");
}

/**
 * rt_dump - dump a routing table
 * @t: routing table to be dumped
 *
 * This function dumps the contents of a given routing table to debug output.
 */
void
rt_dump(rtable *t)
{
  debug("Dump of routing table <%s>\n", t->name);
#ifdef DEBUGGING
  fib_check(&t->fib);
#endif
  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for (e = n->routes; e; e = e->next)
      rte_dump(e);
  }
  FIB_WALK_END;
  debug("\n");
}

/**
 * rt_dump_all - dump all routing tables
 *
 * This function dumps the contents of all routing tables to debug output.
 */
void
rt_dump_all(void)
{
  rtable *t;
  node *n;

  WALK_LIST2(t, n, routing_tables, n)
    rt_dump(t);
}

static inline void
rt_schedule_hcu(rtable *tab)
{
  if (tab->hcu_scheduled)
    return;

  tab->hcu_scheduled = 1;
  ev_schedule(tab->rt_event);
}

static inline void
rt_schedule_nhu(rtable *tab)
{
  if (tab->nhu_state == NHU_CLEAN)
    ev_schedule(tab->rt_event);

  /* state change:
   * NHU_CLEAN -> NHU_SCHEDULED
   * NHU_RUNNING -> NHU_DIRTY
   */
  tab->nhu_state |= NHU_SCHEDULED;
}

void
rt_schedule_prune(rtable *tab)
{
  if (tab->prune_state == 0)
    ev_schedule(tab->rt_event);

  /* state change 0->1, 2->3 */
  tab->prune_state |= 1;
}


static void
rt_event(void *ptr)
{
  rtable *tab = ptr;

  rt_lock_table(tab);

  if (tab->hcu_scheduled)
    rt_update_hostcache(tab);

  if (tab->nhu_state)
    rt_next_hop_update(tab);

  if (tab->prune_state)
    rt_prune_table(tab);

  rt_unlock_table(tab);
}


static void
rt_prune_timer(timer *t)
{
  rtable *tab = t->data;

  if (tab->gc_counter >= tab->config->gc_threshold)
    rt_schedule_prune(tab);
}

static void
rt_kick_prune_timer(rtable *tab)
{
  /* Return if prune is already scheduled */
  if (tm_active(tab->prune_timer) || (tab->prune_state & 1))
    return;

  /* Randomize GC period to +/- 50% */
  btime gc_period = tab->config->gc_period;
  gc_period = (gc_period / 2) + (random_u32() % (uint) gc_period);
  tm_start(tab->prune_timer, gc_period);
}
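/*
 * Editor's note on the randomization above: with a gc_period of 10 s, the
 * timer fires after (10/2) + (0..10) s, i.e. uniformly in [5 s, 15 s),
 * which spreads the prune work of multiple tables over time.
 */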
1946
1947
1948 static inline btime
1949 rt_settled_time(rtable *tab)
1950 {
1951 ASSUME(tab->base_settle_time != 0);
1952
1953 return MIN(tab->last_rt_change + tab->config->min_settle_time,
1954 tab->base_settle_time + tab->config->max_settle_time);
1955 }
1956
1957 static void
1958 rt_settle_timer(timer *t)
1959 {
1960 rtable *tab = t->data;
1961
1962 if (!tab->base_settle_time)
1963 return;
1964
1965 btime settled_time = rt_settled_time(tab);
1966 if (current_time() < settled_time)
1967 {
1968 tm_set(tab->settle_timer, settled_time);
1969 return;
1970 }
1971
1972 /* Settled */
1973 tab->base_settle_time = 0;
1974
1975 struct rt_subscription *s;
1976 WALK_LIST(s, tab->subscribers)
1977 s->hook(s);
1978 }
1979
1980 static void
1981 rt_kick_settle_timer(rtable *tab)
1982 {
1983 tab->base_settle_time = current_time();
1984
1985 if (!tab->settle_timer)
1986 tab->settle_timer = tm_new_init(tab->rp, rt_settle_timer, tab, 0, 0);
1987
1988 if (!tm_active(tab->settle_timer))
1989 tm_set(tab->settle_timer, rt_settled_time(tab));
1990 }
1991
1992 static inline void
1993 rt_schedule_notify(rtable *tab)
1994 {
1995 if (EMPTY_LIST(tab->subscribers))
1996 return;
1997
1998 if (tab->base_settle_time)
1999 return;
2000
2001 rt_kick_settle_timer(tab);
2002 }
2003
2004 void
2005 rt_subscribe(rtable *tab, struct rt_subscription *s)
2006 {
2007 s->tab = tab;
2008 rt_lock_table(tab);
2009 add_tail(&tab->subscribers, &s->n);
2010 }
2011
2012 void
2013 rt_unsubscribe(struct rt_subscription *s)
2014 {
2015 rem_node(&s->n);
2016 rt_unlock_table(s->tab);
2017 }
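
/*
 * A minimal subscriber sketch, assuming the struct rt_subscription layout
 * from nest/route.h (with .hook and .data fields); my_settled() and the
 * pool p are hypothetical:
 *
 *   static void my_settled(struct rt_subscription *s)
 *   {
 *     log(L_INFO "Table %s settled", s->tab->name);
 *   }
 *
 *   struct rt_subscription *s = mb_allocz(p, sizeof(struct rt_subscription));
 *   s->hook = my_settled;
 *   rt_subscribe(tab, s);
 *   ...
 *   rt_unsubscribe(s);
 */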
2018
2019 static struct rt_flowspec_link *
2020 rt_flowspec_find_link(rtable *src, rtable *dst)
2021 {
2022 struct rt_flowspec_link *ln;
2023 WALK_LIST(ln, src->flowspec_links)
2024 if ((ln->src == src) && (ln->dst == dst))
2025 return ln;
2026
2027 return NULL;
2028 }
2029
2030 void
2031 rt_flowspec_link(rtable *src, rtable *dst)
2032 {
2033 ASSERT(rt_is_ip(src));
2034 ASSERT(rt_is_flow(dst));
2035
2036 struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst);
2037
2038 if (!ln)
2039 {
2040 rt_lock_table(src);
2041 rt_lock_table(dst);
2042
2043 ln = mb_allocz(src->rp, sizeof(struct rt_flowspec_link));
2044 ln->src = src;
2045 ln->dst = dst;
2046 add_tail(&src->flowspec_links, &ln->n);
2047 }
2048
2049 ln->uc++;
2050 }
2051
2052 void
2053 rt_flowspec_unlink(rtable *src, rtable *dst)
2054 {
2055 struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst);
2056
2057 ASSERT(ln && (ln->uc > 0));
2058
2059 ln->uc--;
2060
2061 if (!ln->uc)
2062 {
2063 rem_node(&ln->n);
2064 mb_free(ln);
2065
2066 rt_unlock_table(src);
2067 rt_unlock_table(dst);
2068 }
2069 }
2070
2071 static void
2072 rt_flowspec_notify(rtable *src, net *net)
2073 {
2074 /* Only IP tables may be sources of flowspec links */
2075 ASSERT(rt_is_ip(src));
2076
2077 struct rt_flowspec_link *ln;
2078 WALK_LIST(ln, src->flowspec_links)
2079 {
2080 rtable *dst = ln->dst;
2081 ASSERT(rt_is_flow(dst));
2082
2083 /* No need to inspect it further if recalculation is already active */
2084 if ((dst->nhu_state == NHU_SCHEDULED) || (dst->nhu_state == NHU_DIRTY))
2085 continue;
2086
2087 if (trie_match_net(dst->flowspec_trie, net->n.addr))
2088 rt_schedule_nhu(dst);
2089 }
2090 }
2091
2092 static void
2093 rt_flowspec_reset_trie(rtable *tab)
2094 {
2095 linpool *lp = tab->flowspec_trie->lp;
2096 int ipv4 = tab->flowspec_trie->ipv4;
2097
2098 lp_flush(lp);
2099 tab->flowspec_trie = f_new_trie(lp, 0);
2100 tab->flowspec_trie->ipv4 = ipv4;
2101 }
2102
2103 static void
2104 rt_free(resource *_r)
2105 {
2106 rtable *r = (rtable *) _r;
2107
2108 DBG("Deleting routing table %s\n", r->name);
2109 ASSERT_DIE(r->use_count == 0);
2110
2111 if (r->internal)
2112 return;
2113
2114 r->config->table = NULL;
2115 rem_node(&r->n);
2116
2117 if (r->hostcache)
2118 rt_free_hostcache(r);
2119
2120 /* Freed automagically by the resource pool
2121 fib_free(&r->fib);
2122 hmap_free(&r->id_map);
2123 rfree(r->rt_event);
2124 rfree(r->settle_timer);
2125 mb_free(r);
2126 */
2127 }
2128
2129 static void
2130 rt_res_dump(resource *_r)
2131 {
2132 rtable *r = (rtable *) _r;
2133 debug("name \"%s\", addr_type=%s, rt_count=%u, use_count=%d\n",
2134 r->name, net_label[r->addr_type], r->rt_count, r->use_count);
2135 }
2136
2137 static struct resclass rt_class = {
2138 .name = "Routing table",
2139 .size = sizeof(struct rtable),
2140 .free = rt_free,
2141 .dump = rt_res_dump,
2142 .lookup = NULL,
2143 .memsize = NULL,
2144 };
2145
2146 rtable *
2147 rt_setup(pool *pp, struct rtable_config *cf)
2148 {
2149 pool *p = rp_newf(pp, "Routing table %s", cf->name);
2150
2151 rtable *t = ralloc(p, &rt_class);
2152 t->rp = p;
2153
2154 t->name = cf->name;
2155 t->config = cf;
2156 t->addr_type = cf->addr_type;
2157
2158 fib_init(&t->fib, p, t->addr_type, sizeof(net), OFFSETOF(net, n), 0, NULL);
2159
2160 if (cf->trie_used)
2161 {
2162 t->trie = f_new_trie(lp_new_default(p), 0);
2163 t->trie->ipv4 = net_val_match(t->addr_type, NB_IP4 | NB_VPN4 | NB_ROA4);
2164
2165 t->fib.init = net_init_with_trie;
2166 }
2167
2168 init_list(&t->channels);
2169 init_list(&t->flowspec_links);
2170 init_list(&t->subscribers);
2171
2172 hmap_init(&t->id_map, p, 1024);
2173 hmap_set(&t->id_map, 0);
2174
2175 if (!(t->internal = cf->internal))
2176 {
2177 t->rt_event = ev_new_init(p, rt_event, t);
2178 t->prune_timer = tm_new_init(p, rt_prune_timer, t, 0, 0);
2179 t->last_rt_change = t->gc_time = current_time();
2180
2181 if (rt_is_flow(t))
2182 {
2183 t->flowspec_trie = f_new_trie(lp_new_default(p), 0);
2184 t->flowspec_trie->ipv4 = (t->addr_type == NET_FLOW4);
2185 }
2186 }
2187
2188 return t;
2189 }
2190
2191 /**
2192 * rt_init - initialize routing tables
2193 *
2194 * This function is called during BIRD startup. It initializes the
2195 * routing table module.
2196 */
2197 void
2198 rt_init(void)
2199 {
2200 rta_init();
2201 rt_table_pool = rp_new(&root_pool, "Routing tables");
2202 rte_update_pool = lp_new_default(rt_table_pool);
2203 rte_slab = sl_new(rt_table_pool, sizeof(rte));
2204 init_list(&routing_tables);
2205 }
2206
2207
2208 /**
2209 * rt_prune_table - prune a routing table
2210 *
2211 * The prune loop scans routing tables and removes routes belonging to flushing
2212 * protocols, discarded routes and also stale network entries. It is called from
2213 * rt_event(). The event is rescheduled if the current iteration do not finish
2214 * the table. The pruning is directed by the prune state (@prune_state),
2215 * specifying whether the prune cycle is scheduled or running, and there
2216 * is also a persistent pruning iterator (@prune_fit).
2217 *
2218 * The prune loop is used also for channel flushing. For this purpose, the
2219 * channels to flush are marked before the iteration and notified after the
2220 * iteration.
2221 */
2222 static void
2223 rt_prune_table(rtable *tab)
2224 {
2225 struct fib_iterator *fit = &tab->prune_fit;
2226 int limit = 2000;
2227
2228 struct channel *c;
2229 node *n, *x;
2230
2231 DBG("Pruning route table %s\n", tab->name);
2232 #ifdef DEBUGGING
2233 fib_check(&tab->fib);
2234 #endif
2235
2236 if (tab->prune_state == 0)
2237 return;
2238
2239 if (tab->prune_state == 1)
2240 {
2241 /* Mark channels to flush */
2242 WALK_LIST2(c, n, tab->channels, table_node)
2243 if (c->channel_state == CS_FLUSHING)
2244 c->flush_active = 1;
2245
2246 FIB_ITERATE_INIT(fit, &tab->fib);
2247 tab->prune_state = 2;
2248
2249 tab->gc_counter = 0;
2250 tab->gc_time = current_time();
2251
2252 if (tab->prune_trie)
2253 {
2254 /* Init prefix trie pruning */
2255 tab->trie_new = f_new_trie(lp_new_default(tab->rp), 0);
2256 tab->trie_new->ipv4 = tab->trie->ipv4;
2257 }
2258 }
2259
2260 again:
2261 FIB_ITERATE_START(&tab->fib, fit, net, n)
2262 {
2263 rte *e;
2264
2265 rescan:
2266 if (limit <= 0)
2267 {
2268 FIB_ITERATE_PUT(fit);
2269 ev_schedule(tab->rt_event);
2270 return;
2271 }
2272
2273 for (e=n->routes; e; e=e->next)
2274 {
2275 if (e->sender->flush_active || (e->flags & REF_DISCARD))
2276 {
2277 rte_discard(e);
2278 limit--;
2279
2280 goto rescan;
2281 }
2282
2283 if (e->flags & REF_MODIFY)
2284 {
2285 rte_modify(e);
2286 limit--;
2287
2288 goto rescan;
2289 }
2290 }
2291
2292 if (!n->routes) /* Orphaned FIB entry */
2293 {
2294 FIB_ITERATE_PUT(fit);
2295 fib_delete(&tab->fib, n);
2296 goto again;
2297 }
2298
2299 if (tab->trie_new)
2300 {
2301 trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen);
2302 limit--;
2303 }
2304 }
2305 FIB_ITERATE_END;
2306
2307 #ifdef DEBUGGING
2308 fib_check(&tab->fib);
2309 #endif
2310
2311 /* state change 2->0, 3->1 */
2312 tab->prune_state &= 1;
2313
2314 if (tab->trie_new)
2315 {
2316 /* Finish prefix trie pruning */
2317
2318 if (!tab->trie_lock_count)
2319 {
2320 rfree(tab->trie->lp);
2321 }
2322 else
2323 {
2324 ASSERT(!tab->trie_old);
2325 tab->trie_old = tab->trie;
2326 tab->trie_old_lock_count = tab->trie_lock_count;
2327 tab->trie_lock_count = 0;
2328 }
2329
2330 tab->trie = tab->trie_new;
2331 tab->trie_new = NULL;
2332 tab->prune_trie = 0;
2333 }
2334 else
2335 {
2336 /* Schedule prefix trie pruning */
2337 if (tab->trie && !tab->trie_old && (tab->trie->prefix_count > (2 * tab->fib.entries)))
2338 {
2339 /* state change 0->1, 2->3 */
2340 tab->prune_state |= 1;
2341 tab->prune_trie = 1;
2342 }
2343 }
2344
2345 if (tab->prune_state > 0)
2346 ev_schedule(tab->rt_event);
2347
2348 /* FIXME: This should be handled in a better way */
2349 rt_prune_sources();
2350
2351 /* Close flushed channels */
2352 WALK_LIST2_DELSAFE(c, n, x, tab->channels, table_node)
2353 if (c->flush_active)
2354 {
2355 c->flush_active = 0;
2356 channel_set_state(c, CS_DOWN);
2357 }
2358
2359 return;
2360 }
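
/*
 * Like nhu_state above, prune_state is a small two-bit machine: 0 = clean,
 * 1 = scheduled, 2 = running, 3 = running with a re-run requested.
 * rt_schedule_prune() sets bit 0 (0->1, 2->3), starting a pass moves 1->2,
 * and `tab->prune_state &= 1` at the end of a pass (2->0, 3->1) makes a
 * request that arrived mid-pass trigger another full pass.
 */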
2361
2362 /**
2363 * rt_lock_trie - lock a prefix trie of a routing table
2364 * @tab: routing table with prefix trie to be locked
2365 *
2366 * The prune loop may rebuild the prefix trie and invalidate f_trie_walk_state
2367 * structures. Therefore, asynchronous walks should lock the prefix trie using
2368 * this function. That allows the prune loop to rebuild the trie, but postpones
2369 * its freeing until all walks are done (unlocked by rt_unlock_trie()).
2370 *
2371 * Returns the current trie, which will be locked; the value should be passed
2372 * back to rt_unlock_trie() for unlocking.
2373 *
2374 */
2375 struct f_trie *
2376 rt_lock_trie(rtable *tab)
2377 {
2378 ASSERT(tab->trie);
2379
2380 tab->trie_lock_count++;
2381 return tab->trie;
2382 }
2383
2384 /**
2385 * rt_unlock_trie - unlock a prefix trie of a routing table
2386 * @tab: routing table with prefix trie to be unlocked
2387 * @trie: value returned by matching rt_lock_trie()
2388 *
2389 * Called for a trie locked by rt_lock_trie() after the walk over the trie
2390 * is finished. It may free the trie and schedule the next trie pruning.
2391 */
2392 void
2393 rt_unlock_trie(rtable *tab, struct f_trie *trie)
2394 {
2395 ASSERT(trie);
2396
2397 if (trie == tab->trie)
2398 {
2399 /* Unlock the current prefix trie */
2400 ASSERT(tab->trie_lock_count);
2401 tab->trie_lock_count--;
2402 }
2403 else if (trie == tab->trie_old)
2404 {
2405 /* Unlock the old prefix trie */
2406 ASSERT(tab->trie_old_lock_count);
2407 tab->trie_old_lock_count--;
2408
2409 /* Free old prefix trie that is no longer needed */
2410 if (!tab->trie_old_lock_count)
2411 {
2412 rfree(tab->trie_old->lp);
2413 tab->trie_old = NULL;
2414
2415 /* Kick prefix trie pruning that was postponed */
2416 if (tab->trie && (tab->trie->prefix_count > (2 * tab->fib.entries)))
2417 {
2418 tab->prune_trie = 1;
2419 rt_schedule_prune(tab);
2420 }
2421 }
2422 }
2423 else
2424 log(L_BUG "Invalid arg to rt_unlock_trie()");
2425 }
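
/*
 * A minimal sketch of the intended pattern, modelled on the TRIE_WALK() use
 * in rt_flowspec_check() below; `root` stands for a caller-supplied net_addr
 * (or NULL to walk the whole trie):
 *
 *   struct f_trie *trie = rt_lock_trie(tab);
 *   TRIE_WALK(trie, subnet, root)
 *   {
 *     ... examine subnet, possibly yielding to the main loop ...
 *   }
 *   TRIE_WALK_END;
 *   rt_unlock_trie(tab, trie);
 */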
2426
2427
2428 void
2429 rt_preconfig(struct config *c)
2430 {
2431 init_list(&c->tables);
2432
2433 rt_new_table(cf_get_symbol(c, "master4"), NET_IP4);
2434 rt_new_table(cf_get_symbol(c, "master6"), NET_IP6);
2435 }
2436
2437 void
2438 rt_postconfig(struct config *c)
2439 {
2440 uint num_tables = list_length(&c->tables);
2441 btime def_gc_period = 400 MS * num_tables;
2442 def_gc_period = MAX(def_gc_period, 10 S);
2443 def_gc_period = MIN(def_gc_period, 600 S);
2444
2445 struct rtable_config *rc;
2446 WALK_LIST(rc, c->tables)
2447 if (rc->gc_period == (uint) -1)
2448 rc->gc_period = (uint) def_gc_period;
2449 }
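
/*
 * The default GC period therefore scales with the configuration size while
 * staying within [10 s, 600 s]: 5 tables give 400 MS * 5 = 2 s, clamped up
 * to 10 s; 100 tables give 40 s; anything above 1500 tables is capped at
 * the 600 s maximum.
 */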
2450
2451
2452 /*
2453 * Some functions for handling internal next hop updates
2454 * triggered by rt_schedule_nhu().
2455 */
2456
2457 void
2458 rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls)
2459 {
2460 a->hostentry = he;
2461 a->dest = he->dest;
2462 a->igp_metric = he->igp_metric;
2463
2464 if (a->dest != RTD_UNICAST)
2465 {
2466 /* No nexthop */
2467 no_nexthop:
2468 a->nh = (struct nexthop) {};
2469 if (mls)
2470 { /* Store the label stack for later changes */
2471 a->nh.labels_orig = a->nh.labels = mls->len;
2472 memcpy(a->nh.label, mls->stack, mls->len * sizeof(u32));
2473 }
2474 return;
2475 }
2476
2477 if (((!mls) || (!mls->len)) && he->nexthop_linkable)
2478 { /* Just link the nexthop chain, no label append happens. */
2479 memcpy(&(a->nh), &(he->src->nh), nexthop_size(&(he->src->nh)));
2480 return;
2481 }
2482
2483 struct nexthop *nhp = NULL, *nhr = NULL;
2484 int skip_nexthop = 0;
2485
2486 for (struct nexthop *nh = &(he->src->nh); nh; nh = nh->next)
2487 {
2488 if (skip_nexthop)
2489 skip_nexthop--;
2490 else
2491 {
2492 nhr = nhp;
2493 nhp = (nhp ? (nhp->next = lp_alloc(rte_update_pool, NEXTHOP_MAX_SIZE)) : &(a->nh));
2494 }
2495
2496 memset(nhp, 0, NEXTHOP_MAX_SIZE);
2497 nhp->iface = nh->iface;
2498 nhp->weight = nh->weight;
2499
2500 if (mls)
2501 {
2502 nhp->labels = nh->labels + mls->len;
2503 nhp->labels_orig = mls->len;
2504 if (nhp->labels <= MPLS_MAX_LABEL_STACK)
2505 {
2506 memcpy(nhp->label, nh->label, nh->labels * sizeof(u32)); /* First the hostentry labels */
2507 memcpy(&(nhp->label[nh->labels]), mls->stack, mls->len * sizeof(u32)); /* Then the bottom labels */
2508 }
2509 else
2510 {
2511 log(L_WARN "Sum of label stack sizes %d + %d = %d exceedes allowed maximum (%d)",
2512 nh->labels, mls->len, nhp->labels, MPLS_MAX_LABEL_STACK);
2513 skip_nexthop++;
2514 continue;
2515 }
2516 }
2517 else if (nh->labels)
2518 {
2519 nhp->labels = nh->labels;
2520 nhp->labels_orig = 0;
2521 memcpy(nhp->label, nh->label, nh->labels * sizeof(u32));
2522 }
2523
2524 if (ipa_nonzero(nh->gw))
2525 {
2526 nhp->gw = nh->gw; /* Router nexthop */
2527 nhp->flags |= (nh->flags & RNF_ONLINK);
2528 }
2529 else if (!(nh->iface->flags & IF_MULTIACCESS) || (nh->iface->flags & IF_LOOPBACK))
2530 nhp->gw = IPA_NONE; /* PtP link - no need for nexthop */
2531 else if (ipa_nonzero(he->link))
2532 nhp->gw = he->link; /* Device nexthop with link-local address known */
2533 else
2534 nhp->gw = he->addr; /* Device nexthop with link-local address unknown */
2535 }
2536
2537 if (skip_nexthop)
2538 if (nhr)
2539 nhr->next = NULL;
2540 else
2541 {
2542 a->dest = RTD_UNREACHABLE;
2543 log(L_WARN "No valid nexthop remaining, setting route unreachable");
2544 goto no_nexthop;
2545 }
2546 }
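
/*
 * A label-merging example for the unicast case above (values illustrative):
 * a hostentry nexthop carrying transport label [16001] combined with
 * mls = { .len = 2, .stack = { 20, 30 } } yields nhp->label = [16001, 20, 30],
 * nhp->labels = 3 and nhp->labels_orig = 2, so rt_next_hop_update_rte() can
 * later strip the two bottom labels again and re-apply a fresh hostentry.
 */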
2547
2548 static inline int
2549 rta_next_hop_outdated(rta *a)
2550 {
2551 struct hostentry *he = a->hostentry;
2552
2553 if (!he)
2554 return 0;
2555
2556 if (!he->src)
2557 return a->dest != RTD_UNREACHABLE;
2558
2559 return (a->dest != he->dest) || (a->igp_metric != he->igp_metric) ||
2560 (!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh));
2561 }
2562
2563 static inline rte *
2564 rt_next_hop_update_rte(rtable *tab UNUSED, rte *old)
2565 {
2566 if (!rta_next_hop_outdated(old->attrs))
2567 return NULL;
2568
2569 rta *a = alloca(RTA_MAX_SIZE);
2570 memcpy(a, old->attrs, rta_size(old->attrs));
2571
2572 mpls_label_stack mls = { .len = a->nh.labels_orig };
2573 memcpy(mls.stack, &a->nh.label[a->nh.labels - mls.len], mls.len * sizeof(u32));
2574
2575 rta_apply_hostentry(a, old->attrs->hostentry, &mls);
2576 a->cached = 0;
2577
2578 rte *e = sl_alloc(rte_slab);
2579 memcpy(e, old, sizeof(rte));
2580 e->attrs = rta_lookup(a);
2581 rt_lock_source(e->src);
2582
2583 return e;
2584 }
2585
2586
2587 #ifdef CONFIG_BGP
2588
2589 static inline int
2590 net_flow_has_dst_prefix(const net_addr *n)
2591 {
2592 ASSUME(net_is_flow(n));
2593
2594 if (n->pxlen)
2595 return 1;
2596
2597 if (n->type == NET_FLOW4)
2598 {
2599 const net_addr_flow4 *n4 = (void *) n;
2600 return (n4->length > sizeof(net_addr_flow4)) && (n4->data[0] == FLOW_TYPE_DST_PREFIX);
2601 }
2602 else
2603 {
2604 const net_addr_flow6 *n6 = (void *) n;
2605 return (n6->length > sizeof(net_addr_flow6)) && (n6->data[0] == FLOW_TYPE_DST_PREFIX);
2606 }
2607 }
2608
2609 static inline int
2610 rta_as_path_is_empty(rta *a)
2611 {
2612 eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
2613 return !e || (as_path_getlen(e->u.ptr) == 0);
2614 }
2615
2616 static inline u32
2617 rta_get_first_asn(rta *a)
2618 {
2619 eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
2620 u32 asn;
2621
2622 return (e && as_path_get_first_regular(e->u.ptr, &asn)) ? asn : 0;
2623 }
2624
2625 int
2626 rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, rta *a, int interior)
2627 {
2628 ASSERT(rt_is_ip(tab_ip));
2629 ASSERT(rt_is_flow(tab_flow));
2630 ASSERT(tab_ip->trie);
2631
2632 /* RFC 8955 6. a) Flowspec has defined dst prefix */
2633 if (!net_flow_has_dst_prefix(n))
2634 return 0;
2635
2636 /* RFC 9117 4.1. Accept if AS_PATH is empty (route originated in the local domain) */
2637 if (interior && rta_as_path_is_empty(a))
2638 return 1;
2639
2640
2641 /* RFC 8955 6. b) Flowspec and its best-match route have the same originator */
2642
2643 /* Find flowspec dst prefix */
2644 net_addr dst;
2645 if (n->type == NET_FLOW4)
2646 net_fill_ip4(&dst, net4_prefix(n), net4_pxlen(n));
2647 else
2648 net_fill_ip6(&dst, net6_prefix(n), net6_pxlen(n));
2649
2650 /* Find best-match BGP unicast route for flowspec dst prefix */
2651 net *nb = net_route(tab_ip, &dst);
2652 rte *rb = nb ? nb->routes : NULL;
2653
2654 /* Register prefix to trie for tracking further changes */
2655 int max_pxlen = (n->type == NET_FLOW4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH;
2656 trie_add_prefix(tab_flow->flowspec_trie, &dst, (nb ? nb->n.addr->pxlen : 0), max_pxlen);
2657
2658 /* No best-match BGP route -> no flowspec */
2659 if (!rb || (rb->attrs->source != RTS_BGP))
2660 return 0;
2661
2662 /* Find ORIGINATOR_ID values */
2663 u32 orig_a = ea_get_int(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);
2664 u32 orig_b = ea_get_int(rb->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);
2665
2666 /* Originator is either ORIGINATOR_ID (if present), or BGP neighbor address (if not) */
2667 if ((orig_a != orig_b) || (!orig_a && !orig_b && !ipa_equal(a->from, rb->attrs->from)))
2668 return 0;
2669
2670
2671 /* Find ASN of the best-match route, for use in next checks */
2672 u32 asn_b = rta_get_first_asn(rb->attrs);
2673 if (!asn_b)
2674 return 0;
2675
2676 /* RFC 9117 4.2. For EBGP, flowspec and its best-match route are from the same AS */
2677 if (!interior && (rta_get_first_asn(a) != asn_b))
2678 return 0;
2679
2680 /* RFC 8955 6. c) More-specific routes are from the same AS as the best-match route */
2681 TRIE_WALK(tab_ip->trie, subnet, &dst)
2682 {
2683 net *nc = net_find_valid(tab_ip, &subnet);
2684 if (!nc)
2685 continue;
2686
2687 rte *rc = nc->routes;
2688 if (rc->attrs->source != RTS_BGP)
2689 return 0;
2690
2691 if (rta_get_first_asn(rc->attrs) != asn_b)
2692 return 0;
2693 }
2694 TRIE_WALK_END;
2695
2696 return 1;
2697 }
2698
2699 #endif /* CONFIG_BGP */
2700
2701 static rte *
2702 rt_flowspec_update_rte(rtable *tab, rte *r)
2703 {
2704 #ifdef CONFIG_BGP
2705 if ((r->attrs->source != RTS_BGP) || (r->sender->proto != r->src->proto))
2706 return NULL;
2707
2708 struct bgp_channel *bc = (struct bgp_channel *) r->sender;
2709 if (!bc->base_table)
2710 return NULL;
2711
2712 const net_addr *n = r->net->n.addr;
2713 struct bgp_proto *p = (void *) r->src->proto;
2714 int valid = rt_flowspec_check(bc->base_table, tab, n, r->attrs, p->is_interior);
2715 int dest = valid ? RTD_NONE : RTD_UNREACHABLE;
2716
2717 if (dest == r->attrs->dest)
2718 return NULL;
2719
2720 rta *a = alloca(RTA_MAX_SIZE);
2721 memcpy(a, r->attrs, rta_size(r->attrs));
2722 a->dest = dest;
2723 a->cached = 0;
2724
2725 rte *new = sl_alloc(rte_slab);
2726 memcpy(new, r, sizeof(rte));
2727 new->attrs = rta_lookup(a);
2728 rt_lock_source(new->src);
2729
2730 return new;
2731 #else
2732 return NULL;
2733 #endif
2734 }
2735
2736
2737 static inline int
2738 rt_next_hop_update_net(rtable *tab, net *n)
2739 {
2740 rte **k, *e, *new, *old_best, **new_best;
2741 int count = 0;
2742 int free_old_best = 0;
2743
2744 old_best = n->routes;
2745 if (!old_best)
2746 return 0;
2747
2748 for (k = &n->routes; e = *k; k = &e->next)
2749 {
2750 if (!net_is_flow(n->n.addr))
2751 new = rt_next_hop_update_rte(tab, e);
2752 else
2753 new = rt_flowspec_update_rte(tab, e);
2754
2755 if (new)
2756 {
2757 *k = new;
2758
2759 rte_trace_in(D_ROUTES, new->sender, new, "updated");
2760 rte_announce_i(tab, RA_ANY, n, new, e, NULL, NULL);
2761
2762 /* Call a pre-comparison hook */
2763 /* Not really an efficient way to compute this */
2764 if (e->src->proto->rte_recalculate)
2765 e->src->proto->rte_recalculate(tab, n, new, e, NULL);
2766
2767 if (e != old_best)
2768 rte_free_quick(e);
2769 else /* Freeing of the old best rte is postponed */
2770 free_old_best = 1;
2771
2772 e = new;
2773 count++;
2774 }
2775 }
2776
2777 if (!count)
2778 return 0;
2779
2780 /* Find the new best route */
2781 new_best = NULL;
2782 for (k = &n->routes; e = *k; k = &e->next)
2783 {
2784 if (!new_best || rte_better(e, *new_best))
2785 new_best = k;
2786 }
2787
2788 /* Relink the new best route to the first position */
2789 new = *new_best;
2790 if (new != n->routes)
2791 {
2792 *new_best = new->next;
2793 new->next = n->routes;
2794 n->routes = new;
2795 }
2796
2797 /* Announce the new best route */
2798 if (new != old_best)
2799 rte_trace_in(D_ROUTES, new->sender, new, "updated [best]");
2800
2801 /* Propagate changes */
2802 rte_announce_i(tab, RA_UNDEF, n, NULL, NULL, n->routes, old_best);
2803
2804 if (free_old_best)
2805 rte_free_quick(old_best);
2806
2807 return count;
2808 }
2809
2810 static void
2811 rt_next_hop_update(rtable *tab)
2812 {
2813 struct fib_iterator *fit = &tab->nhu_fit;
2814 int max_feed = 32;
2815
2816 if (tab->nhu_state == NHU_CLEAN)
2817 return;
2818
2819 if (tab->nhu_state == NHU_SCHEDULED)
2820 {
2821 FIB_ITERATE_INIT(fit, &tab->fib);
2822 tab->nhu_state = NHU_RUNNING;
2823
2824 if (tab->flowspec_trie)
2825 rt_flowspec_reset_trie(tab);
2826 }
2827
2828 FIB_ITERATE_START(&tab->fib, fit, net, n)
2829 {
2830 if (max_feed <= 0)
2831 {
2832 FIB_ITERATE_PUT(fit);
2833 ev_schedule(tab->rt_event);
2834 return;
2835 }
2836 max_feed -= rt_next_hop_update_net(tab, n);
2837 }
2838 FIB_ITERATE_END;
2839
2840 /* State change:
2841 * NHU_DIRTY -> NHU_SCHEDULED
2842 * NHU_RUNNING -> NHU_CLEAN
2843 */
2844 tab->nhu_state &= 1;
2845
2846 if (tab->nhu_state != NHU_CLEAN)
2847 ev_schedule(tab->rt_event);
2848 }
2849
2850
2851 struct rtable_config *
2852 rt_new_table(struct symbol *s, uint addr_type)
2853 {
2854 /* Hack that allows us to 'redefine' the master table */
2855 if ((s->class == SYM_TABLE) &&
2856 (s->table == new_config->def_tables[addr_type]) &&
2857 ((addr_type == NET_IP4) || (addr_type == NET_IP6)))
2858 return s->table;
2859
2860 struct rtable_config *c = cfg_allocz(sizeof(struct rtable_config));
2861
2862 cf_define_symbol(new_config, s, SYM_TABLE, table, c);
2863 c->name = s->name;
2864 c->addr_type = addr_type;
2865 c->gc_threshold = 1000;
2866 c->gc_period = (uint) -1; /* set in rt_postconfig() */
2867 c->min_settle_time = 1 S;
2868 c->max_settle_time = 20 S;
2869
2870 add_tail(&new_config->tables, &c->n);
2871
2872 /* First table of each type is kept as default */
2873 if (! new_config->def_tables[addr_type])
2874 new_config->def_tables[addr_type] = c;
2875
2876 return c;
2877 }
2878
2879 /**
2880 * rt_lock_table - lock a routing table
2881 * @r: routing table to be locked
2882 *
2883 * Lock a routing table because it's in use by a protocol,
2884 * preventing it from being freed when it gets undefined in a new
2885 * configuration.
2886 */
2887 void
2888 rt_lock_table(rtable *r)
2889 {
2890 r->use_count++;
2891 }
2892
2893 /**
2894 * rt_unlock_table - unlock a routing table
2895 * @r: routing table to be unlocked
2896 *
2897 * Unlock a routing table formerly locked by rt_lock_table(),
2898 * that is, decrease its use count and delete it if it's scheduled
2899 * for deletion by configuration changes.
2900 */
2901 void
2902 rt_unlock_table(rtable *r)
2903 {
2904 if (!--r->use_count && r->deleted)
2905 {
2906 struct config *conf = r->deleted;
2907
2908 /* Delete the routing table by freeing its pool */
2909 rt_shutdown(r);
2910 config_del_obstacle(conf);
2911 }
2912 }
2913
2914 static int
2915 rt_reconfigure(rtable *tab, struct rtable_config *new, struct rtable_config *old)
2916 {
2917 if ((new->addr_type != old->addr_type) ||
2918 (new->sorted != old->sorted) ||
2919 (new->trie_used != old->trie_used))
2920 return 0;
2921
2922 DBG("\t%s: same\n", new->name);
2923 new->table = tab;
2924 tab->name = new->name;
2925 tab->config = new;
2926
2927 return 1;
2928 }
2929
2930 static struct rtable_config *
2931 rt_find_table_config(struct config *cf, char *name)
2932 {
2933 struct symbol *sym = cf_find_symbol(cf, name);
2934 return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL;
2935 }
2936
2937 /**
2938 * rt_commit - commit new routing table configuration
2939 * @new: new configuration
2940 * @old: original configuration or %NULL if it's boot time config
2941 *
2942 * Scan differences between @old and @new configuration and modify
2943 * the routing tables according to these changes. If @new defines a
2944 * previously unknown table, create it; if it omits a table existing
2945 * in @old, schedule it for deletion (it gets deleted when all protocols
2946 * disconnect from it by calling rt_unlock_table()); if it exists
2947 * in both configurations, leave it unchanged.
2948 */
2949 void
2950 rt_commit(struct config *new, struct config *old)
2951 {
2952 struct rtable_config *o, *r;
2953
2954 DBG("rt_commit:\n");
2955 if (old)
2956 {
2957 WALK_LIST(o, old->tables)
2958 {
2959 rtable *tab = o->table;
2960 if (tab->deleted)
2961 continue;
2962
2963 r = rt_find_table_config(new, o->name);
2964 if (r && !new->shutdown && rt_reconfigure(tab, r, o))
2965 continue;
2966
2967 DBG("\t%s: deleted\n", o->name);
2968 tab->deleted = old;
2969 config_add_obstacle(old);
2970 rt_lock_table(tab);
2971 rt_unlock_table(tab);
2972 }
2973 }
2974
2975 WALK_LIST(r, new->tables)
2976 if (!r->table)
2977 {
2978 r->table = rt_setup(rt_table_pool, r);
2979 DBG("\t%s: created\n", r->name);
2980 add_tail(&routing_tables, &r->table->n);
2981 }
2982 DBG("\tdone\n");
2983 }
2984
2985 static inline void
2986 do_feed_channel(struct channel *c, net *n, rte *e)
2987 {
2988 rte_update_lock();
2989 if (c->ra_mode == RA_ACCEPTED)
2990 rt_notify_accepted(c, n, NULL, NULL, c->refeeding);
2991 else if (c->ra_mode == RA_MERGED)
2992 rt_notify_merged(c, n, NULL, NULL, e, e, c->refeeding);
2993 else /* RA_BASIC */
2994 rt_notify_basic(c, n, e, e, c->refeeding);
2995 rte_update_unlock();
2996 }
2997
2998 /**
2999 * rt_feed_channel - advertise all routes to a channel
3000 * @c: channel to be fed
3001 *
3002 * This function performs one pass of advertisement of routes to a channel that
3003 * is in the ES_FEEDING state. It is called by the protocol code as long as it
3004 * has something to do. (We avoid transferring all the routes in a single pass in
3005 * order not to monopolize CPU time.)
3006 */
3007 int
3008 rt_feed_channel(struct channel *c)
3009 {
3010 struct fib_iterator *fit = &c->feed_fit;
3011 int max_feed = 256;
3012
3013 ASSERT(c->export_state == ES_FEEDING);
3014
3015 if (!c->feed_active)
3016 {
3017 FIB_ITERATE_INIT(fit, &c->table->fib);
3018 c->feed_active = 1;
3019 }
3020
3021 FIB_ITERATE_START(&c->table->fib, fit, net, n)
3022 {
3023 rte *e = n->routes;
3024 if (max_feed <= 0)
3025 {
3026 FIB_ITERATE_PUT(fit);
3027 return 0;
3028 }
3029
3030 if ((c->ra_mode == RA_OPTIMAL) ||
3031 (c->ra_mode == RA_ACCEPTED) ||
3032 (c->ra_mode == RA_MERGED))
3033 if (rte_is_valid(e))
3034 {
3035 /* In the meantime, the protocol may have gone down */
3036 if (c->export_state != ES_FEEDING)
3037 goto done;
3038
3039 do_feed_channel(c, n, e);
3040 max_feed--;
3041 }
3042
3043 if (c->ra_mode == RA_ANY)
3044 for(e = n->routes; e; e = e->next)
3045 {
3046 /* In the meantime, the protocol may have gone down */
3047 if (c->export_state != ES_FEEDING)
3048 goto done;
3049
3050 if (!rte_is_valid(e))
3051 continue;
3052
3053 do_feed_channel(c, n, e);
3054 max_feed--;
3055 }
3056 }
3057 FIB_ITERATE_END;
3058
3059 done:
3060 c->feed_active = 0;
3061 return 1;
3062 }
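
/*
 * A sketch of the caller's side (cf. channel_feed_loop() in nest/proto.c);
 * the exact event used to resume the feed is the caller's business:
 *
 *   if (!rt_feed_channel(c))
 *   {
 *     ev_schedule(c->feed_event);   // table not finished, continue later
 *     return;
 *   }
 *   // whole table fed, the channel may leave ES_FEEDING
 */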
3063
3064 /**
3065 * rt_feed_channel_abort - abort protocol feeding
3066 * @c: channel
3067 *
3068 * This function is called by the protocol code when the protocol stops or
3069 * ceases to exist during the feeding.
3070 */
3071 void
3072 rt_feed_channel_abort(struct channel *c)
3073 {
3074 if (c->feed_active)
3075 {
3076 /* Unlink the iterator */
3077 fit_get(&c->table->fib, &c->feed_fit);
3078 c->feed_active = 0;
3079 }
3080 }
3081
3082
3083 /*
3084 * Import table
3085 */
3086
3087 int
3088 rte_update_in(struct channel *c, const net_addr *n, rte *new, struct rte_src *src)
3089 {
3090 struct rtable *tab = c->in_table;
3091 rte *old, **pos;
3092 net *net;
3093
3094 if (new)
3095 {
3096 net = net_get(tab, n);
3097
3098 if (!rta_is_cached(new->attrs))
3099 new->attrs = rta_lookup(new->attrs);
3100 }
3101 else
3102 {
3103 net = net_find(tab, n);
3104
3105 if (!net)
3106 goto drop_withdraw;
3107 }
3108
3109 /* Find the old rte */
3110 for (pos = &net->routes; old = *pos; pos = &old->next)
3111 if (old->src == src)
3112 {
3113 if (new && rte_same(old, new))
3114 {
3115 /* Refresh the old rte, continue with update to main rtable */
3116 if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
3117 {
3118 old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
3119
3120 return 1;
3121 }
3122
3123 goto drop_update;
3124 }
3125
3126 /* Move iterator if needed */
3127 if (old == c->reload_next_rte)
3128 c->reload_next_rte = old->next;
3129
3130 /* Remove the old rte */
3131 *pos = old->next;
3132 tab->rt_count--;
3133 break;
3134 }
3135
3136 if (!old && !new)
3137 goto drop_withdraw;
3138
3139 struct channel_limit *l = &c->rx_limit;
3140 if (l->action && !old && new)
3141 {
3142 if (tab->rt_count >= l->limit)
3143 channel_notify_limit(c, l, PLD_RX, tab->rt_count);
3144
3145 if (l->state == PLS_BLOCKED)
3146 {
3147 /* Required by rte_trace_in() */
3148 new->net = net;
3149
3150 rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
3151 goto drop_update;
3152 }
3153 }
3154
3155 if (new)
3156 {
3157 /* Insert the new rte */
3158 rte *e = rte_do_cow(new);
3159 e->flags |= REF_COW;
3160 e->net = net;
3161 e->sender = c;
3162 e->lastmod = current_time();
3163 e->next = *pos;
3164 *pos = new = e;
3165 tab->rt_count++;
3166
3167 if (!old)
3168 {
3169 new->id = hmap_first_zero(&tab->id_map);
3170 hmap_set(&tab->id_map, new->id);
3171 }
3172 else
3173 new->id = old->id;
3174 }
3175
3176 rte_announce(tab, RA_ANY, net, new, old, NULL, NULL);
3177
3178 if (old)
3179 {
3180 if (!new)
3181 hmap_clear(&tab->id_map, old->id);
3182
3183 rte_free_quick(old);
3184 }
3185
3186 if (!net->routes)
3187 fib_delete(&tab->fib, net);
3188
3189 return 1;
3190
3191 drop_update:
3192 c->stats.imp_updates_received++;
3193 c->stats.imp_updates_ignored++;
3194 rte_free(new);
3195
3196 if (!net->routes)
3197 fib_delete(&tab->fib, net);
3198
3199 return 0;
3200
3201 drop_withdraw:
3202 c->stats.imp_withdraws_received++;
3203 c->stats.imp_withdraws_ignored++;
3204 return 0;
3205 }
3206
3207 int
3208 rt_reload_channel(struct channel *c)
3209 {
3210 struct rtable *tab = c->in_table;
3211 struct fib_iterator *fit = &c->reload_fit;
3212 int max_feed = 64;
3213
3214 ASSERT(c->channel_state == CS_UP);
3215
3216 if (!c->reload_active)
3217 {
3218 FIB_ITERATE_INIT(fit, &tab->fib);
3219 c->reload_active = 1;
3220 }
3221
3222 do {
3223 for (rte *e = c->reload_next_rte; e; e = e->next)
3224 {
3225 if (max_feed-- <= 0)
3226 {
3227 c->reload_next_rte = e;
3228 debug("%s channel reload burst split (max_feed=%d)", c->proto->name, max_feed);
3229 return 0;
3230 }
3231
3232 rte_update2(c, e->net->n.addr, rte_do_cow(e), e->src);
3233 }
3234
3235 c->reload_next_rte = NULL;
3236
3237 FIB_ITERATE_START(&tab->fib, fit, net, n)
3238 {
3239 if (c->reload_next_rte = n->routes)
3240 {
3241 FIB_ITERATE_PUT_NEXT(fit, &tab->fib);
3242 break;
3243 }
3244 }
3245 FIB_ITERATE_END;
3246 }
3247 while (c->reload_next_rte);
3248
3249 c->reload_active = 0;
3250 return 1;
3251 }
3252
3253 void
3254 rt_reload_channel_abort(struct channel *c)
3255 {
3256 if (c->reload_active)
3257 {
3258 /* Unlink the iterator */
3259 fit_get(&c->in_table->fib, &c->reload_fit);
3260 c->reload_next_rte = NULL;
3261 c->reload_active = 0;
3262 }
3263 }
3264
3265 void
3266 rt_prune_sync(rtable *t, int all)
3267 {
3268 struct fib_iterator fit;
3269
3270 FIB_ITERATE_INIT(&fit, &t->fib);
3271
3272 again:
3273 FIB_ITERATE_START(&t->fib, &fit, net, n)
3274 {
3275 rte *e, **ee = &n->routes;
3276
3277 while (e = *ee)
3278 {
3279 if (all || (e->flags & (REF_STALE | REF_DISCARD)))
3280 {
3281 *ee = e->next;
3282 rte_free_quick(e);
3283 t->rt_count--;
3284 }
3285 else
3286 ee = &e->next;
3287 }
3288
3289 if (all || !n->routes)
3290 {
3291 FIB_ITERATE_PUT(&fit);
3292 fib_delete(&t->fib, n);
3293 goto again;
3294 }
3295 }
3296 FIB_ITERATE_END;
3297 }
3298
3299
3300 /*
3301 * Export table
3302 */
3303
3304 int
3305 rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, int refeed)
3306 {
3307 struct rtable *tab = c->out_table;
3308 struct rte_src *src;
3309 rte *old, **pos;
3310 net *net;
3311
3312 if (new)
3313 {
3314 net = net_get(tab, n);
3315 src = new->src;
3316
3317 if (!rta_is_cached(new->attrs))
3318 new->attrs = rta_lookup(new->attrs);
3319 }
3320 else
3321 {
3322 net = net_find(tab, n);
3323 src = old0->src;
3324
3325 if (!net)
3326 goto drop_withdraw;
3327 }
3328
3329 /* Find the old rte */
3330 for (pos = &net->routes; old = *pos; pos = &old->next)
3331 if ((c->ra_mode != RA_ANY) || (old->src == src))
3332 {
3333 if (new && rte_same(old, new))
3334 {
3335 /* REF_STALE / REF_DISCARD not used in export table */
3336 /*
3337 if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
3338 {
3339 old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
3340 return 1;
3341 }
3342 */
3343
3344 goto drop_update;
3345 }
3346
3347 /* Remove the old rte */
3348 *pos = old->next;
3349 rte_free_quick(old);
3350 tab->rt_count--;
3351
3352 break;
3353 }
3354
3355 if (!new)
3356 {
3357 if (!old)
3358 goto drop_withdraw;
3359
3360 if (!net->routes)
3361 fib_delete(&tab->fib, net);
3362
3363 return 1;
3364 }
3365
3366 /* Insert the new rte */
3367 rte *e = rte_do_cow(new);
3368 e->flags |= REF_COW;
3369 e->net = net;
3370 e->sender = c;
3371 e->lastmod = current_time();
3372 e->next = *pos;
3373 *pos = e;
3374 tab->rt_count++;
3375 return 1;
3376
3377 drop_update:
3378 return refeed;
3379
3380 drop_withdraw:
3381 return 0;
3382 }
3383
3384
3385 /*
3386 * Hostcache
3387 */
3388
3389 static inline u32
3390 hc_hash(ip_addr a, rtable *dep)
3391 {
3392 return ipa_hash(a) ^ ptr_hash(dep);
3393 }
3394
3395 static inline void
3396 hc_insert(struct hostcache *hc, struct hostentry *he)
3397 {
3398 uint k = he->hash_key >> hc->hash_shift;
3399 he->next = hc->hash_table[k];
3400 hc->hash_table[k] = he;
3401 }
3402
3403 static inline void
3404 hc_remove(struct hostcache *hc, struct hostentry *he)
3405 {
3406 struct hostentry **hep;
3407 uint k = he->hash_key >> hc->hash_shift;
3408
3409 for (hep = &hc->hash_table[k]; *hep != he; hep = &(*hep)->next);
3410 *hep = he->next;
3411 }
3412
3413 #define HC_DEF_ORDER 10
3414 #define HC_HI_MARK *4
3415 #define HC_HI_STEP 2
3416 #define HC_HI_ORDER 16 /* Must be at most 16 */
3417 #define HC_LO_MARK /5
3418 #define HC_LO_STEP 2
3419 #define HC_LO_ORDER 10
3420
3421 static void
3422 hc_alloc_table(struct hostcache *hc, pool *p, unsigned order)
3423 {
3424 uint hsize = 1 << order;
3425 hc->hash_order = order;
3426 hc->hash_shift = 32 - order;
3427 hc->hash_max = (order >= HC_HI_ORDER) ? ~0U : (hsize HC_HI_MARK);
3428 hc->hash_min = (order <= HC_LO_ORDER) ? 0U : (hsize HC_LO_MARK);
3429
3430 hc->hash_table = mb_allocz(p, hsize * sizeof(struct hostentry *));
3431 }
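
/*
 * For example, order 12 gives 4096 buckets, a grow threshold of
 * hash_max = 4096 * 4 = 16384 entries (HC_HI_MARK), a shrink threshold of
 * hash_min = 4096 / 5 = 819 (HC_LO_MARK), and hc_insert() picks the bucket
 * from the top 12 bits of the hash key (hash_key >> 20).
 */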
3432
3433 static void
3434 hc_resize(struct hostcache *hc, pool *p, unsigned new_order)
3435 {
3436 struct hostentry **old_table = hc->hash_table;
3437 struct hostentry *he, *hen;
3438 uint old_size = 1 << hc->hash_order;
3439 uint i;
3440
3441 hc_alloc_table(hc, p, new_order);
3442 for (i = 0; i < old_size; i++)
3443 for (he = old_table[i]; he != NULL; he=hen)
3444 {
3445 hen = he->next;
3446 hc_insert(hc, he);
3447 }
3448 mb_free(old_table);
3449 }
3450
3451 static struct hostentry *
3452 hc_new_hostentry(struct hostcache *hc, pool *p, ip_addr a, ip_addr ll, rtable *dep, unsigned k)
3453 {
3454 struct hostentry *he = sl_alloc(hc->slab);
3455
3456 *he = (struct hostentry) {
3457 .addr = a,
3458 .link = ll,
3459 .tab = dep,
3460 .hash_key = k,
3461 };
3462
3463 add_tail(&hc->hostentries, &he->ln);
3464 hc_insert(hc, he);
3465
3466 hc->hash_items++;
3467 if (hc->hash_items > hc->hash_max)
3468 hc_resize(hc, p, hc->hash_order + HC_HI_STEP);
3469
3470 return he;
3471 }
3472
3473 static void
3474 hc_delete_hostentry(struct hostcache *hc, pool *p, struct hostentry *he)
3475 {
3476 rta_free(he->src);
3477
3478 rem_node(&he->ln);
3479 hc_remove(hc, he);
3480 sl_free(he);
3481
3482 hc->hash_items--;
3483 if (hc->hash_items < hc->hash_min)
3484 hc_resize(hc, p, hc->hash_order - HC_LO_STEP);
3485 }
3486
3487 static void
3488 rt_init_hostcache(rtable *tab)
3489 {
3490 struct hostcache *hc = mb_allocz(tab->rp, sizeof(struct hostcache));
3491 init_list(&hc->hostentries);
3492
3493 hc->hash_items = 0;
3494 hc_alloc_table(hc, tab->rp, HC_DEF_ORDER);
3495 hc->slab = sl_new(tab->rp, sizeof(struct hostentry));
3496
3497 hc->lp = lp_new(tab->rp);
3498 hc->trie = f_new_trie(hc->lp, 0);
3499
3500 tab->hostcache = hc;
3501 }
3502
3503 static void
3504 rt_free_hostcache(rtable *tab)
3505 {
3506 struct hostcache *hc = tab->hostcache;
3507
3508 node *n;
3509 WALK_LIST(n, hc->hostentries)
3510 {
3511 struct hostentry *he = SKIP_BACK(struct hostentry, ln, n);
3512 rta_free(he->src);
3513
3514 if (he->uc)
3515 log(L_ERR "Hostcache is not empty in table %s", tab->name);
3516 }
3517
3518 /* Freed automagically by the resource pool
3519 rfree(hc->slab);
3520 rfree(hc->lp);
3521 mb_free(hc->hash_table);
3522 mb_free(hc);
3523 */
3524 }
3525
3526 static void
3527 rt_notify_hostcache(rtable *tab, net *net)
3528 {
3529 if (tab->hcu_scheduled)
3530 return;
3531
3532 if (trie_match_net(tab->hostcache->trie, net->n.addr))
3533 rt_schedule_hcu(tab);
3534 }
3535
3536 static int
3537 if_local_addr(ip_addr a, struct iface *i)
3538 {
3539 struct ifa *b;
3540
3541 WALK_LIST(b, i->addrs)
3542 if (ipa_equal(a, b->ip))
3543 return 1;
3544
3545 return 0;
3546 }
3547
3548 u32
3549 rt_get_igp_metric(rte *rt)
3550 {
3551 eattr *ea = ea_find(rt->attrs->eattrs, EA_GEN_IGP_METRIC);
3552
3553 if (ea)
3554 return ea->u.data;
3555
3556 if (rt->attrs->source == RTS_DEVICE)
3557 return 0;
3558
3559 if (rt->src->proto->rte_igp_metric)
3560 return rt->src->proto->rte_igp_metric(rt);
3561
3562 return IGP_METRIC_UNKNOWN;
3563 }
3564
3565 static int
3566 rt_update_hostentry(rtable *tab, struct hostentry *he)
3567 {
3568 rta *old_src = he->src;
3569 int direct = 0;
3570 int pxlen = 0;
3571
3572 /* Reset the hostentry */
3573 he->src = NULL;
3574 he->dest = RTD_UNREACHABLE;
3575 he->nexthop_linkable = 0;
3576 he->igp_metric = 0;
3577
3578 net_addr he_addr;
3579 net_fill_ip_host(&he_addr, he->addr);
3580 net *n = net_route(tab, &he_addr);
3581 if (n)
3582 {
3583 rte *e = n->routes;
3584 rta *a = e->attrs;
3585 word pref = a->pref;
3586
3587 for (rte *ee = n->routes; ee; ee = ee->next)
3588 if ((ee->attrs->pref >= pref) && ee->attrs->hostentry)
3589 {
3590 /* Recursive route should not depend on another recursive route */
3591 log(L_WARN "Next hop address %I resolvable through recursive route for %N",
3592 he->addr, n->n.addr);
3593 goto done;
3594 }
3595
3596 pxlen = n->n.addr->pxlen;
3597
3598 if (a->dest == RTD_UNICAST)
3599 {
3600 for (struct nexthop *nh = &(a->nh); nh; nh = nh->next)
3601 if (ipa_zero(nh->gw))
3602 {
3603 if (if_local_addr(he->addr, nh->iface))
3604 {
3605 /* The host address is a local address, which is not valid */
3606 log(L_WARN "Next hop address %I is a local address of iface %s",
3607 he->addr, nh->iface->name);
3608 goto done;
3609 }
3610
3611 direct++;
3612 }
3613 }
3614
3615 he->src = rta_clone(a);
3616 he->dest = a->dest;
3617 he->nexthop_linkable = !direct;
3618 he->igp_metric = rt_get_igp_metric(e);
3619 }
3620
3621 done:
3622 /* Add a prefix range to the trie */
3623 trie_add_prefix(tab->hostcache->trie, &he_addr, pxlen, he_addr.pxlen);
3624
3625 rta_free(old_src);
3626 return old_src != he->src;
3627 }
3628
3629 static void
3630 rt_update_hostcache(rtable *tab)
3631 {
3632 struct hostcache *hc = tab->hostcache;
3633 struct hostentry *he;
3634 node *n, *x;
3635
3636 /* Reset the trie */
3637 lp_flush(hc->lp);
3638 hc->trie = f_new_trie(hc->lp, 0);
3639
3640 WALK_LIST_DELSAFE(n, x, hc->hostentries)
3641 {
3642 he = SKIP_BACK(struct hostentry, ln, n);
3643 if (!he->uc)
3644 {
3645 hc_delete_hostentry(hc, tab->rp, he);
3646 continue;
3647 }
3648
3649 if (rt_update_hostentry(tab, he))
3650 rt_schedule_nhu(he->tab);
3651 }
3652
3653 tab->hcu_scheduled = 0;
3654 }
3655
3656 struct hostentry *
3657 rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep)
3658 {
3659 ip_addr link = ipa_zero(ll) ? a : ll;
3660 struct hostentry *he;
3661
3662 if (!tab->hostcache)
3663 rt_init_hostcache(tab);
3664
3665 u32 k = hc_hash(a, dep);
3666 struct hostcache *hc = tab->hostcache;
3667 for (he = hc->hash_table[k >> hc->hash_shift]; he != NULL; he = he->next)
3668 if (ipa_equal(he->addr, a) && ipa_equal(he->link, link) && (he->tab == dep))
3669 return he;
3670
3671 he = hc_new_hostentry(hc, tab->rp, a, link, dep, k);
3672 rt_update_hostentry(tab, he);
3673 return he;
3674 }
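
/*
 * A minimal sketch of attaching a recursive next hop to a route; this is
 * what the rta_set_recursive_next_hop() inline in nest/route.h wraps, with
 * `igp_table`, `dep_table`, `gw`, `ll` and `mls` supplied by the caller:
 *
 *   struct hostentry *he = rt_get_hostentry(igp_table, gw, ll, dep_table);
 *   rta_apply_hostentry(a, he, mls);
 */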
3675
3676
3677 /*
3678 * Documentation for functions declared inline in route.h
3679 */
3680 #if 0
3681
3682 /**
3683 * net_find - find a network entry
3684 * @tab: a routing table
3685 * @addr: address of the network
3686 *
3687 * net_find() looks up the given network in routing table @tab and
3688 * returns a pointer to its &net entry or %NULL if no such network
3689 * exists.
3690 */
3691 static inline net *net_find(rtable *tab, net_addr *addr)
3692 { DUMMY; }
3693
3694 /**
3695 * net_get - obtain a network entry
3696 * @tab: a routing table
3697 * @addr: address of the network
3698 *
3699 * net_get() looks up the given network in routing table @tab and
3700 * returns a pointer to its &net entry. If no such entry exists, it's
3701 * created.
3702 */
3703 static inline net *net_get(rtable *tab, net_addr *addr)
3704 { DUMMY; }
3705
3706 /**
3707 * rte_cow - copy a route for writing
3708 * @r: a route entry to be copied
3709 *
3710 * rte_cow() takes a &rte and prepares it for modification. The exact action
3711 * taken depends on the flags of the &rte -- if it's a temporary entry, it's
3712 * just returned unchanged, else a new temporary entry with the same contents
3713 * is created.
3714 *
3715 * The primary use of this function is inside the filter machinery -- when
3716 * a filter wants to modify &rte contents (to change the preference or to
3717 * attach another set of attributes), it must ensure that the &rte is not
3718 * shared with anyone else (and especially that it isn't stored in any routing
3719 * table).
3720 *
3721 * Result: a pointer to the new writable &rte.
3722 */
3723 static inline rte * rte_cow(rte *r)
3724 { DUMMY; }
3725
3726 #endif