/*
 * BIRD -- Routing Tables
 *
 * (c) 1998--2000 Martin Mares <mj@ucw.cz>
 *
 * Can be freely distributed and used under the terms of the GNU GPL.
 */

/**
 * DOC: Routing tables
 *
 * Routing tables are probably the most important structures BIRD uses. They
 * hold all the information about known networks, the associated routes and
 * their attributes.
 *
 * There are multiple routing tables (a primary one together with any
 * number of secondary ones if requested by the configuration). Each table
 * is basically a FIB containing entries describing the individual
 * destination networks. For each network (represented by structure &net),
 * there is a one-way linked list of route entries (&rte), the first entry
 * on the list being the best one (i.e., the one we currently use
 * for routing); the order of the other ones is undetermined.
 *
 * The &rte contains information about the route. There are net and src, which
 * together form a key identifying the route in a routing table. There is a
 * pointer to a &rta structure (see the route attribute module for a precise
 * explanation) holding the route attributes, which are primary data about the
 * route. There are several technical fields used by routing table code (route
 * id, REF_* flags). There is also the pflags field, holding protocol-specific
 * flags. They are not used by routing table code, but by protocol-specific
 * hooks. In contrast to route attributes, they are not primary data and their
 * validity is also limited to the routing table.
 *
 * There are several mechanisms that allow automatic update of routes in one
 * routing table (dst) as a result of changes in another routing table (src).
 * They handle issues of recursive next hop resolving, flowspec validation and
 * RPKI validation.
 *
 * The first such mechanism is handling of recursive next hops. A route in the
 * dst table has an indirect next hop address, which is resolved through a route
 * in the src table (which may also be the same table) to get an immediate next
 * hop. This is implemented using structure &hostcache attached to the src
 * table, which contains &hostentry structures for each tracked next hop
 * address. These structures are linked from recursive routes in dst tables,
 * possibly multiple routes sharing one hostentry (as many routes may have the
 * same indirect next hop). There is also a trie in the hostcache, which matches
 * all prefixes that may influence resolving of tracked next hops.
 *
 * When a best route changes in the src table, the hostcache is notified using
 * rt_notify_hostcache(), which immediately checks using the trie whether the
 * change is relevant, and if it is, schedules an asynchronous hostcache
 * recomputation. The recomputation is done by rt_update_hostcache() (called
 * from rt_event() of the src table), which walks through all hostentries and
 * resolves them (by rt_update_hostentry()). It also updates the trie. If a
 * change in hostentry resolution was found, an asynchronous nexthop
 * recomputation of the associated dst table is scheduled. That is done by
 * rt_next_hop_update() (called from rt_event() of the dst table), which
 * iterates over all routes in the dst table and re-examines their hostentries
 * for changes. Note that in contrast to the hostcache update, the next hop
 * update can be interrupted by the main loop. These two full-table walks (over
 * the hostcache and the dst table) are necessary due to the absence of direct
 * lookups (route -> affected nexthop, nexthop -> its route).
 *
 * The second mechanism is for flowspec validation, where the validity of
 * flowspec routes depends on resolving their network prefixes in IP routing
 * tables. This is similar to the recursive next hop mechanism, but simpler as
 * there are no intermediate hostcache and hostentries (because flows are less
 * likely to share a common net prefix than routes sharing a common next hop).
 * In the src table, there is a list of dst tables (list flowspec_links); this
 * list is updated by flowspec channels (by rt_flowspec_link() and
 * rt_flowspec_unlink() during channel start/stop). Each dst table has its own
 * trie of prefixes that may influence validation of flowspec routes in it
 * (flowspec_trie).
 *
 * When a best route changes in the src table, rt_flowspec_notify() immediately
 * checks all dst tables from the list using their tries to see whether the
 * change is relevant for them. If it is, then an asynchronous re-validation of
 * flowspec routes in the dst table is scheduled. That is also done by function
 * rt_next_hop_update(), like the nexthop recomputation above. It iterates over
 * all flowspec routes and re-validates them. It also recalculates the trie.
 *
 * Note that in contrast to the hostcache update, here the trie is recalculated
 * during rt_next_hop_update(), which may be interleaved with IP route updates.
 * The trie is flushed at the beginning of recalculation, which means that such
 * updates may use a partial trie to see if they are relevant. But it works
 * anyway! Either the affected flowspec was already re-validated and added to
 * the trie, in which case the IP route change would match the trie and trigger
 * a next round of re-validation, or it was not yet re-validated and added to
 * the trie, but then it will be re-validated later in this round anyway.
 *
 * The third mechanism is used for RPKI re-validation of IP routes and it is the
 * simplest. It is just a list of subscribers in the src table, which are
 * notified when any change happens, but only after a settle time. Also, in the
 * RPKI case the dst is not a table, but a channel, which refeeds routes through
 * a filter.
 */
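
/*
 * A rough sketch of the recursive next hop flow described above, using only
 * functions defined later in this file (editorial illustration, not part of
 * the original source):
 *
 *   best route change in src table
 *     -> rte_announce() -> rt_notify_hostcache()     (trie match?)
 *       -> rt_schedule_hcu() -> rt_event() -> rt_update_hostcache()
 *         -> rt_update_hostentry() changed?
 *           -> rt_schedule_nhu(dst) -> rt_event() -> rt_next_hop_update()
 */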

#undef LOCAL_DEBUG

#include "nest/bird.h"
#include "nest/route.h"
#include "nest/protocol.h"
#include "nest/iface.h"
#include "nest/mpls.h"
#include "lib/resource.h"
#include "lib/event.h"
#include "lib/timer.h"
#include "lib/string.h"
#include "conf/conf.h"
#include "filter/filter.h"
#include "filter/data.h"
#include "lib/hash.h"
#include "lib/alloca.h"
#include "lib/flowspec.h"

#ifdef CONFIG_BGP
#include "proto/bgp/bgp.h"
#endif

pool *rt_table_pool;

static slab *rte_slab;
linpool *rte_update_pool;

list routing_tables;

static void rt_free_hostcache(rtable *tab);
static void rt_notify_hostcache(rtable *tab, net *net);
static void rt_update_hostcache(rtable *tab);
static void rt_next_hop_update(rtable *tab);
static inline void rt_prune_table(rtable *tab);
static inline void rt_schedule_notify(rtable *tab);
static void rt_flowspec_notify(rtable *tab, net *net);
static void rt_kick_prune_timer(rtable *tab);

static void
net_init_with_trie(struct fib *f, void *N)
{
  rtable *tab = SKIP_BACK(rtable, fib, f);
  net *n = N;

  if (tab->trie)
    trie_add_prefix(tab->trie, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen);

  if (tab->trie_new)
    trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen);
}

static inline void *
net_route_ip6_sadr_trie(rtable *t, const net_addr_ip6_sadr *n0)
{
  TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px)
  {
    net_addr_ip6_sadr n = NET_ADDR_IP6_SADR(px.prefix, px.pxlen, n0->src_prefix, n0->src_pxlen);
    net *best = NULL;
    int best_pxlen = 0;

    /* We need to do dst first matching. Since sadr addresses are hashed on dst
       prefix only, find the hash table chain and go through it to find the
       match with the longest matching src prefix. */
    for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next)
    {
      net_addr_ip6_sadr *a = (void *) fn->addr;

      if (net_equal_dst_ip6_sadr(&n, a) &&
          net_in_net_src_ip6_sadr(&n, a) &&
          (a->src_pxlen >= best_pxlen))
      {
        best = fib_node_to_user(&t->fib, fn);
        best_pxlen = a->src_pxlen;
      }
    }

    if (best)
      return best;
  }
  TRIE_WALK_TO_ROOT_END;

  return NULL;
}


static inline void *
net_route_ip6_sadr_fib(rtable *t, const net_addr_ip6_sadr *n0)
{
  net_addr_ip6_sadr n;
  net_copy_ip6_sadr(&n, n0);

  while (1)
  {
    net *best = NULL;
    int best_pxlen = 0;

    /* We need to do dst first matching. Since sadr addresses are hashed on dst
       prefix only, find the hash table chain and go through it to find the
       match with the longest matching src prefix. */
    for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next)
    {
      net_addr_ip6_sadr *a = (void *) fn->addr;

      if (net_equal_dst_ip6_sadr(&n, a) &&
          net_in_net_src_ip6_sadr(&n, a) &&
          (a->src_pxlen >= best_pxlen))
      {
        best = fib_node_to_user(&t->fib, fn);
        best_pxlen = a->src_pxlen;
      }
    }

    if (best)
      return best;

    if (!n.dst_pxlen)
      break;

    n.dst_pxlen--;
    ip6_clrbit(&n.dst_prefix, n.dst_pxlen);
  }

  return NULL;
}

net *
net_route(rtable *tab, const net_addr *n)
{
  ASSERT(tab->addr_type == n->type);
  net_addr_union *nu = SKIP_BACK(net_addr_union, n, n);

#define TW(ipv, what) \
  TRIE_WALK_TO_ROOT_IP##ipv(tab->trie, &(nu->ip##ipv), var) \
  { what(ipv, var); } \
  TRIE_WALK_TO_ROOT_END; return NULL;

#define FW(ipv, what) do { \
  net_addr_union nuc; net_copy(&nuc.n, n); \
  while (1) { \
    what(ipv, nuc.ip##ipv); if (!nuc.n.pxlen) return NULL; \
    nuc.n.pxlen--; ip##ipv##_clrbit(&nuc.ip##ipv.prefix, nuc.ip##ipv.pxlen); \
  } \
} while(0); return NULL;

#define FVR_IP(ipv, var) \
  net *r; if (r = net_find_valid(tab, (net_addr *) &var)) return r;

#define FVR_VPN(ipv, var) \
  net_addr_vpn##ipv _var0 = NET_ADDR_VPN##ipv(var.prefix, var.pxlen, nu->vpn##ipv.rd); FVR_IP(ipv, _var0);

  if (tab->trie)
    switch (n->type) {
    case NET_IP4:  TW(4, FVR_IP);
    case NET_VPN4: TW(4, FVR_VPN);
    case NET_IP6:  TW(6, FVR_IP);
    case NET_VPN6: TW(6, FVR_VPN);

    case NET_IP6_SADR:
      return net_route_ip6_sadr_trie(tab, (net_addr_ip6_sadr *) n);
    default:
      return NULL;
    }
  else
    switch (n->type) {
    case NET_IP4:  FW(4, FVR_IP);
    case NET_VPN4: FW(4, FVR_VPN);
    case NET_IP6:  FW(6, FVR_IP);
    case NET_VPN6: FW(6, FVR_VPN);

    case NET_IP6_SADR:
      return net_route_ip6_sadr_fib(tab, (net_addr_ip6_sadr *) n);
    default:
      return NULL;
    }

#undef TW
#undef FW
#undef FVR_IP
#undef FVR_VPN
}


/**
 * roa_check - check validity of route origination in a ROA table
 * @tab: ROA table
 * @n: network prefix to check
 * @asn: AS number of network prefix
 *
 * Implements RFC 6483 route validation for the given network prefix. The
 * procedure is to find all candidate ROAs - ROAs whose prefixes cover the given
 * network prefix. If there is no candidate ROA, return ROA_UNKNOWN. If there is
 * a candidate ROA with matching ASN and a maxlen field greater than or equal to
 * the given prefix length, return ROA_VALID. Otherwise, return ROA_INVALID. If
 * the caller cannot determine the origin AS, 0 may be used (in that case
 * ROA_VALID cannot happen). Table @tab must have type NET_ROA4 or NET_ROA6,
 * network @n must have type NET_IP4 or NET_IP6, respectively.
 */
int
net_roa_check(rtable *tab, const net_addr *n, u32 asn)
{
  net_addr_union *nu = SKIP_BACK(net_addr_union, n, n);
  int anything = 0;
  struct fib_node *fn;

#define TW(ipv) do { \
  TRIE_WALK_TO_ROOT_IP##ipv(tab->trie, &(nu->ip##ipv), var) { \
    net_addr_roa##ipv roa0 = NET_ADDR_ROA##ipv(var.prefix, var.pxlen, 0, 0); \
    ROA_PARTIAL_CHECK(ipv); \
  } TRIE_WALK_TO_ROOT_END; \
  return anything ? ROA_INVALID : ROA_UNKNOWN; \
} while (0)

#define FW(ipv) do { \
  net_addr_roa##ipv roa0 = NET_ADDR_ROA##ipv(nu->ip##ipv.prefix, nu->ip##ipv.pxlen, 0, 0); \
  while (1) { \
    ROA_PARTIAL_CHECK(ipv); \
    if (roa0.pxlen == 0) break; \
    roa0.pxlen--; ip##ipv##_clrbit(&roa0.prefix, roa0.pxlen); \
  } \
} while (0)

#define ROA_PARTIAL_CHECK(ipv) do { \
  for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next) \
  { \
    net_addr_roa##ipv *roa = (void *) fn->addr; \
    net *r = fib_node_to_user(&tab->fib, fn); \
    if (net_equal_prefix_roa##ipv(roa, &roa0) && rte_is_valid(r->routes)) \
    { \
      anything = 1; \
      if (asn && (roa->asn == asn) && (roa->max_pxlen >= nu->ip##ipv.pxlen)) \
        return ROA_VALID; \
    } \
  } \
} while (0)

  if ((tab->addr_type == NET_ROA4) && (n->type == NET_IP4))
  {
    if (tab->trie) TW(4);
    else FW(4);
  }
  else if ((tab->addr_type == NET_ROA6) && (n->type == NET_IP6))
  {
    if (tab->trie) TW(6);
    else FW(6);
  }

  return anything ? ROA_INVALID : ROA_UNKNOWN;
#undef ROA_PARTIAL_CHECK
#undef TW
#undef FW
}
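
/*
 * Usage sketch (editorial addition): deciding the RPKI validation state of a
 * received route. The table, prefix and origin AS here are hypothetical.
 *
 *   net_addr *n = ...;         a NET_IP4 or NET_IP6 prefix
 *   u32 origin_asn = ...;      0 if the origin AS cannot be determined
 *
 *   switch (net_roa_check(roa_table, n, origin_asn))
 *   {
 *   case ROA_VALID:   ... accept ...
 *   case ROA_INVALID: ... reject or depreference ...
 *   case ROA_UNKNOWN: ... policy decision ...
 *   }
 */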

/**
 * aspa_check - check validity of AS Path in an ASPA table
 * @tab: ASPA table
 * @path: AS Path to check
 *
 * Implements draft-ietf-sidrops-aspa-verification-16.
 */
enum aspa_result aspa_check(rtable *tab, const adata *path, bool force_upstream)
{
  struct lp_state lps;
  lp_save(tmp_linpool, &lps);

  /* No support for confed paths */
  if (as_path_contains_confed(path))
    return ASPA_INVALID;

  /* Check path length */
  uint len = as_path_getlen(path);
  if (len == 0)
    return ASPA_INVALID;

  /* Normalize the AS Path: drop stuffings */
  u32 *asns = alloca(sizeof(u32) * len);
  uint ppos = 0;
  uint nsz = 0;
  while (as_path_walk(path, &ppos, &asns[nsz]))
    if ((nsz == 0) || (asns[nsz] != asns[nsz-1]))
      nsz++;

  /* Find the provider blocks for every AS on the path
   * and check allowed directions */
  uint max_up = 0, min_up = 0, max_down = 0, min_down = 0;

  for (uint ap = 0; ap < nsz; ap++)
  {
    net_addr_union nau = { .aspa = NET_ADDR_ASPA(asns[ap]), };
    net *n = net_find(tab, &nau.n);

    bool found = false, down = false, up = false;

    for (rte *e = (n ? n->routes : NULL); e; e = e->next)
    {
      if (!rte_is_valid(e))
        continue;

      eattr *ea = ea_find(e->attrs->eattrs, EA_ASPA_PROVIDERS);
      if (!ea)
        continue;

      /* Actually found some ASPA */
      found = true;

      for (uint i = 0; i * sizeof(u32) < ea->u.ptr->length; i++)
      {
        if ((ap > 0) && ((u32 *) ea->u.ptr->data)[i] == asns[ap-1])
          up = true;
        if ((ap + 1 < nsz) && ((u32 *) ea->u.ptr->data)[i] == asns[ap+1])
          down = true;

        if (down && up)
          /* Both peers found */
          goto end_of_aspa;
      }
    }
end_of_aspa:;

    /* Fast path for the upstream check */
    if (force_upstream)
    {
      if (!found)
        /* Move min-upstream */
        min_up = ap;
      else if (ap && !up)
        /* Exists but doesn't allow this upstream */
        return ASPA_INVALID;
    }

    /* Fast path for no ASPA here */
    else if (!found)
    {
      /* Extend max-downstream (min-downstream is stopped by unknown) */
      max_down = ap+1;

      /* Move min-upstream (can't include unknown) */
      min_up = ap;
    }

    /* ASPA exists and downstream may be extended */
    else if (down)
    {
      /* Extending max-downstream always */
      max_down = ap+1;

      /* Extending min-downstream unless unknown seen */
      if (min_down == ap)
        min_down = ap+1;

      /* Downstream only */
      if (!up)
        min_up = max_up = ap;
    }

    /* No extension for downstream, force upstream only from now */
    else
    {
      force_upstream = 1;

      /* Not even upstream, move the ending here */
      if (!up)
        min_up = max_up = ap;
    }
  }

  /* Is the path surely valid? */
  if (min_up <= min_down)
    return ASPA_VALID;

  /* Is the path maybe valid? */
  if (max_up <= max_down)
    return ASPA_UNKNOWN;

  /* Now there is surely a valley there. */
  return ASPA_INVALID;
}
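
/*
 * Usage sketch (editorial addition; the values are hypothetical). The path
 * would normally come from a received BGP route's AS_PATH attribute, and
 * force_upstream is set when only the upstream direction is acceptable --
 * an editorial reading of the draft, see the code above:
 *
 *   const adata *path = ...;
 *   switch (aspa_check(aspa_table, path, upstream_only))
 *   {
 *   case ASPA_VALID:   ...
 *   case ASPA_UNKNOWN: ...
 *   case ASPA_INVALID: ... route leak or forgery suspected ...
 *   }
 */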

/**
 * rte_find - find a route
 * @net: network node
 * @src: route source
 *
 * The rte_find() function returns a route for destination @net
 * which is from route source @src.
 */
rte *
rte_find(net *net, struct rte_src *src)
{
  rte *e = net->routes;

  while (e && e->src != src)
    e = e->next;
  return e;
}

/**
 * rte_get_temp - get a temporary &rte
 * @a: attributes to assign to the new route (a &rta; in case it's
 * un-cached, rte_update() will create a cached copy automatically)
 * @src: route source
 *
 * Create a temporary &rte and bind it with the attributes @a.
 */
rte *
rte_get_temp(rta *a, struct rte_src *src)
{
  rte *e = sl_alloc(rte_slab);

  e->attrs = a;
  e->id = 0;
  e->flags = 0;
  e->pflags = 0;
  rt_lock_source(e->src = src);
  return e;
}

rte *
rte_do_cow(rte *r)
{
  rte *e = sl_alloc(rte_slab);

  memcpy(e, r, sizeof(rte));

  rt_lock_source(e->src);
  e->attrs = rta_clone(r->attrs);
  e->flags = 0;
  return e;
}

/**
 * rte_cow_rta - get a private writable copy of &rte with writable &rta
 * @r: a route entry to be copied
 * @lp: a linpool from which to allocate &rta
 *
 * rte_cow_rta() takes a &rte and prepares it and the associated &rta for
 * modification. There are three possibilities: First, both &rte and &rta are
 * private copies, in which case they are returned unchanged. Second, &rte is
 * a private copy, but &rta is cached, in which case &rta is duplicated using
 * rta_do_cow(). Third, &rte is shared and &rta is cached, in which case both
 * structures are duplicated by rte_do_cow() and rta_do_cow().
 *
 * Note that in the second case, the cached &rta loses one reference, while the
 * private copy created by rta_do_cow() is a shallow copy sharing indirect data
 * (eattrs, nexthops, ...) with it. To work properly, the original shared &rta
 * should have another reference during the life of the created private copy.
 *
 * Result: a pointer to the new writable &rte with writable &rta.
 */
rte *
rte_cow_rta(rte *r, linpool *lp)
{
  if (!rta_is_cached(r->attrs))
    return r;

  r = rte_cow(r);
  rta *a = rta_do_cow(r->attrs, lp);
  rta_free(r->attrs);
  r->attrs = a;
  return r;
}
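
/*
 * Usage sketch (editorial addition): a hook that wants to modify route
 * attributes first obtains a writable pair, then edits freely:
 *
 *   e = rte_cow_rta(e, rte_update_pool);
 *   e->attrs->pref = 50;        now safe, e and e->attrs are private
 */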

static int				/* Actually better or at least as good as */
rte_better(rte *new, rte *old)
{
  int (*better)(rte *, rte *);

  if (!rte_is_valid(old))
    return 1;
  if (!rte_is_valid(new))
    return 0;

  if (new->attrs->pref > old->attrs->pref)
    return 1;
  if (new->attrs->pref < old->attrs->pref)
    return 0;
  if (new->src->proto->proto != old->src->proto->proto)
  {
    /*
     * If the user has configured protocol preferences, so that two different protocols
     * have the same preference, try to break the tie by comparing addresses. Not too
     * useful, but keeps the ordering of routes unambiguous.
     */
    return new->src->proto->proto > old->src->proto->proto;
  }
  if (better = new->src->proto->rte_better)
    return better(new, old);
  return 0;
}

static int
rte_mergable(rte *pri, rte *sec)
{
  int (*mergable)(rte *, rte *);

  if (!rte_is_valid(pri) || !rte_is_valid(sec))
    return 0;

  if (pri->attrs->pref != sec->attrs->pref)
    return 0;

  if (pri->src->proto->proto != sec->src->proto->proto)
    return 0;

  if (mergable = pri->src->proto->rte_mergable)
    return mergable(pri, sec);

  return 0;
}

static void
rte_trace(struct channel *c, rte *e, int dir, char *msg)
{
  log(L_TRACE "%s.%s %c %s %N %luL %uG %s",
      c->proto->name, c->name ?: "?", dir, msg, e->net->n.addr, e->src->private_id, e->src->global_id,
      rta_dest_name(e->attrs->dest));
}

static inline void
rte_trace_in(uint flag, struct channel *c, rte *e, char *msg)
{
  if ((c->debug & flag) || (c->proto->debug & flag))
    rte_trace(c, e, '>', msg);
}

static inline void
rte_trace_out(uint flag, struct channel *c, rte *e, char *msg)
{
  if ((c->debug & flag) || (c->proto->debug & flag))
    rte_trace(c, e, '<', msg);
}
static rte *
export_filter_(struct channel *c, rte *rt0, rte **rt_free, linpool *pool, int silent)
{
  struct proto *p = c->proto;
  const struct filter *filter = c->out_filter;
  struct proto_stats *stats = &c->stats;
  rte *rt;
  int v;

  rt = rt0;
  *rt_free = NULL;

  v = p->preexport ? p->preexport(c, rt) : 0;
  if (v < 0)
  {
    if (silent)
      goto reject;

    stats->exp_updates_rejected++;
    if (v == RIC_REJECT)
      rte_trace_out(D_FILTERS, c, rt, "rejected by protocol");
    goto reject;
  }
  if (v > 0)
  {
    if (!silent)
      rte_trace_out(D_FILTERS, c, rt, "forced accept by protocol");
    goto accept;
  }

  v = filter && ((filter == FILTER_REJECT) ||
                 (f_run(filter, &rt, pool,
                        (silent ? FF_SILENT : 0)) > F_ACCEPT));
  if (v)
  {
    if (silent)
      goto reject;

    stats->exp_updates_filtered++;
    rte_trace_out(D_FILTERS, c, rt, "filtered out");
    goto reject;
  }

accept:
  if (rt != rt0)
    *rt_free = rt;
  return rt;

reject:
  /* Discard temporary rte */
  if (rt != rt0)
    rte_free(rt);
  return NULL;
}

static inline rte *
export_filter(struct channel *c, rte *rt0, rte **rt_free, int silent)
{
  return export_filter_(c, rt0, rt_free, rte_update_pool, silent);
}

static void
do_rt_notify(struct channel *c, net *net, rte *new, rte *old, int refeed)
{
  struct proto *p = c->proto;
  struct proto_stats *stats = &c->stats;

  if (refeed && new)
    c->refeed_count++;

  /* Apply export limit */
  struct channel_limit *l = &c->out_limit;
  if (l->action && !old && new)
  {
    if (stats->exp_routes >= l->limit)
      channel_notify_limit(c, l, PLD_OUT, stats->exp_routes);

    if (l->state == PLS_BLOCKED)
    {
      stats->exp_updates_rejected++;
      rte_trace_out(D_FILTERS, c, new, "rejected [limit]");
      return;
    }
  }

  /* Apply export table */
  if (c->out_table && !rte_update_out(c, net->n.addr, new, old, refeed))
    return;

  if (new)
    stats->exp_updates_accepted++;
  else
    stats->exp_withdraws_accepted++;

  if (old)
  {
    bmap_clear(&c->export_map, old->id);
    stats->exp_routes--;
  }

  if (new)
  {
    bmap_set(&c->export_map, new->id);
    stats->exp_routes++;
  }

  if (p->debug & D_ROUTES)
  {
    if (new && old)
      rte_trace_out(D_ROUTES, c, new, "replaced");
    else if (new)
      rte_trace_out(D_ROUTES, c, new, "added");
    else if (old)
      rte_trace_out(D_ROUTES, c, old, "removed");
  }

  p->rt_notify(p, c, net, new, old);
}

static void
rt_notify_basic(struct channel *c, net *net, rte *new, rte *old, int refeed)
{
  // struct proto *p = c->proto;
  rte *new_free = NULL;

  if (new)
    c->stats.exp_updates_received++;
  else
    c->stats.exp_withdraws_received++;

  if (new)
    new = export_filter(c, new, &new_free, 0);

  if (old && !bmap_test(&c->export_map, old->id))
    old = NULL;

  if (!new && !old)
    return;

  do_rt_notify(c, net, new, old, refeed);

  /* Discard temporary rte */
  if (new_free)
    rte_free(new_free);
}

static void
rt_notify_accepted(struct channel *c, net *net, rte *new_changed, rte *old_changed, int refeed)
{
  // struct proto *p = c->proto;
  rte *new_best = NULL;
  rte *old_best = NULL;
  rte *new_free = NULL;
  int new_first = 0;

  /*
   * We assume that there are no changes in net route order except (added)
   * new_changed and (removed) old_changed. Therefore, the function is not
   * compatible with deterministic_med (where nontrivial reordering can happen
   * as a result of a route change) and with recomputation of recursive routes
   * due to next hop update (where many routes can be changed in one step).
   *
   * Note that we need this assumption just for optimizations, we could just
   * run full new_best recomputation otherwise.
   *
   * There are three cases:
   * feed or old_best is old_changed -> we need to recompute new_best
   * old_best is before new_changed -> new_best is old_best, ignore
   * old_best is after new_changed -> try new_changed, otherwise old_best
   */

  if (net->routes)
    c->stats.exp_updates_received++;
  else
    c->stats.exp_withdraws_received++;

  /* Find old_best - either old_changed, or route for net->routes */
  if (old_changed && bmap_test(&c->export_map, old_changed->id))
    old_best = old_changed;
  else
  {
    for (rte *r = net->routes; rte_is_valid(r); r = r->next)
    {
      if (bmap_test(&c->export_map, r->id))
      {
        old_best = r;
        break;
      }

      /* Note if new_changed found before old_best */
      if (r == new_changed)
        new_first = 1;
    }
  }

  /* Find new_best */
  if ((new_changed == old_changed) || (old_best == old_changed))
  {
    /* Feed or old_best changed -> find first accepted by filters */
    for (rte *r = net->routes; rte_is_valid(r); r = r->next)
      if (new_best = export_filter(c, r, &new_free, 0))
        break;
  }
  else
  {
    /* Other cases -> either new_changed, or old_best (and nothing changed) */
    if (new_first && (new_changed = export_filter(c, new_changed, &new_free, 0)))
      new_best = new_changed;
    else
      return;
  }

  if (!new_best && !old_best)
    return;

  do_rt_notify(c, net, new_best, old_best, refeed);

  /* Discard temporary rte */
  if (new_free)
    rte_free(new_free);
}


static struct nexthop *
nexthop_merge_rta(struct nexthop *nhs, rta *a, linpool *pool, int max)
{
  return nexthop_merge(nhs, &(a->nh), 1, 0, max, pool);
}

rte *
rt_export_merged(struct channel *c, net *net, rte **rt_free, linpool *pool, int silent)
{
  // struct proto *p = c->proto;
  struct nexthop *nhs = NULL;
  rte *best0, *best, *rt0, *rt, *tmp;

  best0 = net->routes;
  *rt_free = NULL;

  if (!rte_is_valid(best0))
    return NULL;

  best = export_filter_(c, best0, rt_free, pool, silent);

  if (!best || !rte_is_reachable(best))
    return best;

  for (rt0 = best0->next; rt0; rt0 = rt0->next)
  {
    if (!rte_mergable(best0, rt0))
      continue;

    rt = export_filter_(c, rt0, &tmp, pool, 1);

    if (!rt)
      continue;

    if (rte_is_reachable(rt))
      nhs = nexthop_merge_rta(nhs, rt->attrs, pool, c->merge_limit);

    if (tmp)
      rte_free(tmp);
  }

  if (nhs)
  {
    nhs = nexthop_merge_rta(nhs, best->attrs, pool, c->merge_limit);

    if (nhs->next)
    {
      best = rte_cow_rta(best, pool);
      nexthop_link(best->attrs, nhs);
    }
  }

  if (best != best0)
    *rt_free = best;

  return best;
}

static void
rt_notify_merged(struct channel *c, net *net, rte *new_changed, rte *old_changed,
                 rte *new_best, rte *old_best, int refeed)
{
  // struct proto *p = c->proto;
  rte *new_free = NULL;

  /* We assume that all rte arguments are either NULL or rte_is_valid() */

  /* This check should be done by the caller */
  if (!new_best && !old_best)
    return;

  /* Check whether the change is relevant to the merged route */
  if ((new_best == old_best) &&
      (new_changed != old_changed) &&
      !rte_mergable(new_best, new_changed) &&
      !rte_mergable(old_best, old_changed))
    return;

  if (new_best)
    c->stats.exp_updates_received++;
  else
    c->stats.exp_withdraws_received++;

  /* Prepare new merged route */
  if (new_best)
    new_best = rt_export_merged(c, net, &new_free, rte_update_pool, 0);

  /* Check old merged route */
  if (old_best && !bmap_test(&c->export_map, old_best->id))
    old_best = NULL;

  if (!new_best && !old_best)
    return;

  do_rt_notify(c, net, new_best, old_best, refeed);

  /* Discard temporary rte */
  if (new_free)
    rte_free(new_free);
}


/**
 * rte_announce - announce a routing table change
 * @tab: table the route has been added to
 * @type: type of route announcement (RA_UNDEF or RA_ANY)
 * @net: network in question
 * @new: the new or changed route
 * @old: the previous route replaced by the new one
 * @new_best: the new best route for the same network
 * @old_best: the previous best route for the same network
 *
 * This function gets a routing table update and announces it to all protocols
 * that are connected to the same table by their channels.
 *
 * There are two ways in which routing table changes are announced. First, there
 * is a change of just one route in @net (which may cause a change of the best
 * route of the network). In this case @new and @old describe the changed route
 * and @new_best and @old_best describe the best routes. Other routes are not
 * affected, but in a sorted table the order of other routes might change.
 *
 * Second, there is a bulk change of multiple routes in @net, with shared best
 * route selection. In such a case separate route changes are described using
 * @type of %RA_ANY, with @new and @old specifying the changed route, while
 * @new_best and @old_best are NULL. After that, another notification is done
 * where @new_best and @old_best are filled (may be the same), but @new and @old
 * are NULL.
 *
 * The function announces the change to all associated channels. For each
 * channel, an appropriate preprocessing is done according to channel &ra_mode.
 * For example, %RA_OPTIMAL channels receive just changes of best routes.
 *
 * In general, we first call the preexport() hook of a protocol, which performs
 * basic checks on the route (each protocol has a right to veto or force accept
 * of the route before any filter is asked). Then we consult an export filter
 * of the channel and verify the old route in an export map of the channel.
 * Finally, the rt_notify() hook of the protocol gets called.
 *
 * Note that there are also calls of rt_notify() hooks due to feed, but that is
 * done outside of scope of rte_announce().
 */
static void
rte_announce(rtable *tab, uint type, net *net, rte *new, rte *old,
             rte *new_best, rte *old_best)
{
  if (!rte_is_valid(new))
    new = NULL;

  if (!rte_is_valid(old))
    old = NULL;

  if (!rte_is_valid(new_best))
    new_best = NULL;

  if (!rte_is_valid(old_best))
    old_best = NULL;

  if (!new && !old && !new_best && !old_best)
    return;

  if (new_best != old_best)
  {
    if (new_best)
      new_best->sender->stats.pref_routes++;
    if (old_best)
      old_best->sender->stats.pref_routes--;

    if (tab->hostcache)
      rt_notify_hostcache(tab, net);

    if (!EMPTY_LIST(tab->flowspec_links))
      rt_flowspec_notify(tab, net);
  }

  rt_schedule_notify(tab);

  struct channel *c; node *n;
  WALK_LIST2(c, n, tab->channels, table_node)
  {
    if (c->export_state == ES_DOWN)
      continue;

    if (type && (type != c->ra_mode))
      continue;

    switch (c->ra_mode)
    {
    case RA_OPTIMAL:
      if (new_best != old_best)
        rt_notify_basic(c, net, new_best, old_best, 0);
      break;

    case RA_ANY:
      if (new != old)
        rt_notify_basic(c, net, new, old, 0);
      break;

    case RA_ACCEPTED:
      /*
       * The (new != old) condition is problematic here, as it would break
       * the second usage pattern (announcement after bulk change, used in
       * rt_next_hop_update_net(), which sends both new and old as NULL).
       *
       * But recursive next hops do not work with sorted tables anyway,
       * such configuration is forbidden in BGP and not supported in
       * rt_notify_accepted().
       *
       * The condition is needed to eliminate spurious announcements where
       * both old and new routes are not valid (so they are NULL).
       */
      if (new != old)
        rt_notify_accepted(c, net, new, old, 0);
      break;

    case RA_MERGED:
      rt_notify_merged(c, net, new, old, new_best, old_best, 0);
      break;
    }
  }
}

static inline int
rte_validate(rte *e)
{
  int c;
  net *n = e->net;

  if (!net_validate(n->n.addr))
  {
    log(L_WARN "Ignoring bogus prefix %N received via %s",
        n->n.addr, e->sender->proto->name);
    return 0;
  }

  /* FIXME: better handling different nettypes */
  c = !net_is_flow(n->n.addr) ?
    net_classify(n->n.addr) : (IADDR_HOST | SCOPE_UNIVERSE);
  if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
  {
    log(L_WARN "Ignoring bogus route %N received via %s",
        n->n.addr, e->sender->proto->name);
    return 0;
  }

  if (net_type_match(n->n.addr, NB_DEST) == !e->attrs->dest)
  {
    /* Exception for flowspec that failed validation */
    if (net_is_flow(n->n.addr) && (e->attrs->dest == RTD_UNREACHABLE))
      return 1;

    log(L_WARN "Ignoring route %N with invalid dest %d received via %s",
        n->n.addr, e->attrs->dest, e->sender->proto->name);
    return 0;
  }

  if ((e->attrs->dest == RTD_UNICAST) && !nexthop_is_sorted(&(e->attrs->nh)))
  {
    log(L_WARN "Ignoring unsorted multipath route %N received via %s",
        n->n.addr, e->sender->proto->name);
    return 0;
  }

  return 1;
}

/**
 * rte_free - delete a &rte
 * @e: &rte to be deleted
 *
 * rte_free() deletes the given &rte from the routing table it's linked to.
 */
void
rte_free(rte *e)
{
  rt_unlock_source(e->src);
  if (rta_is_cached(e->attrs))
    rta_free(e->attrs);
  sl_free(e);
}

static inline void
rte_free_quick(rte *e)
{
  rt_unlock_source(e->src);
  rta_free(e->attrs);
  sl_free(e);
}

int
rte_same(rte *x, rte *y)
{
  /* rte.flags / rte.pflags are not checked, as they are internal to rtable */
  return
    x->attrs == y->attrs &&
    x->src == y->src &&
    rte_is_filtered(x) == rte_is_filtered(y);
}

static inline int rte_is_ok(rte *e) { return e && !rte_is_filtered(e); }

static void
rte_recalculate(struct channel *c, net *net, rte *new, struct rte_src *src)
{
  struct proto *p = c->proto;
  struct rtable *table = c->table;
  struct proto_stats *stats = &c->stats;
  static struct tbf rl_pipe = TBF_DEFAULT_LOG_LIMITS;
  rte *before_old = NULL;
  rte *old_best = net->routes;
  rte *old = NULL;
  rte **k;

  k = &net->routes;	/* Find and remove original route from the same protocol */
  while (old = *k)
  {
    if (old->src == src)
    {
      /* If there is the same route in the routing table but from
       * a different sender, then there are two paths from the
       * source protocol to this routing table through transparent
       * pipes, which is not allowed.
       *
       * We log that and ignore the route. If it is a withdraw, we
       * ignore it completely (there might be 'spurious withdraws',
       * see FIXME in do_rte_announce())
       */
      if (old->sender->proto != p)
      {
        if (new)
        {
          log_rl(&rl_pipe, L_ERR "Pipe collision detected when sending %N to table %s",
                 net->n.addr, table->name);
          rte_free_quick(new);
        }
        return;
      }

      if (new && rte_same(old, new))
      {
        /* No changes, ignore the new route and refresh the old one */

        old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);

        if (!rte_is_filtered(new))
        {
          stats->imp_updates_ignored++;
          rte_trace_in(D_ROUTES, c, new, "ignored");
        }

        rte_free_quick(new);
        return;
      }
      *k = old->next;
      table->rt_count--;
      break;
    }
    k = &old->next;
    before_old = old;
  }

  /* Save the last accessed position */
  rte **pos = k;

  if (!old)
    before_old = NULL;

  if (!old && !new)
  {
    stats->imp_withdraws_ignored++;
    return;
  }

  int new_ok = rte_is_ok(new);
  int old_ok = rte_is_ok(old);

  struct channel_limit *l = &c->rx_limit;
  if (l->action && !old && new && !c->in_table)
  {
    u32 all_routes = stats->imp_routes + stats->filt_routes;

    if (all_routes >= l->limit)
      channel_notify_limit(c, l, PLD_RX, all_routes);

    if (l->state == PLS_BLOCKED)
    {
      /* In receive limit the situation is simple, old is NULL so
         we just free new and exit like nothing happened */

      stats->imp_updates_ignored++;
      rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
      rte_free_quick(new);
      return;
    }
  }

  l = &c->in_limit;
  if (l->action && !old_ok && new_ok)
  {
    if (stats->imp_routes >= l->limit)
      channel_notify_limit(c, l, PLD_IN, stats->imp_routes);

    if (l->state == PLS_BLOCKED)
    {
      /* In import limit the situation is more complicated. We
         shouldn't just drop the route, we should handle it like
         it was filtered. We also have to continue the route
         processing if old or new is non-NULL, but we should exit
         if both are NULL as this case is probably assumed to be
         already handled. */

      stats->imp_updates_ignored++;
      rte_trace_in(D_FILTERS, c, new, "ignored [limit]");

      if (c->in_keep_filtered)
        new->flags |= REF_FILTERED;
      else
      { rte_free_quick(new); new = NULL; }

      /* Note that old && !new could be possible when
         c->in_keep_filtered changed in the recent past. */

      if (!old && !new)
        return;

      new_ok = 0;
      goto skip_stats1;
    }
  }

  if (new_ok)
    stats->imp_updates_accepted++;
  else if (old_ok)
    stats->imp_withdraws_accepted++;
  else
    stats->imp_withdraws_ignored++;

  if (old_ok || new_ok)
    table->last_rt_change = current_time();

skip_stats1:

  if (new)
    rte_is_filtered(new) ? stats->filt_routes++ : stats->imp_routes++;
  if (old)
    rte_is_filtered(old) ? stats->filt_routes-- : stats->imp_routes--;

  if (table->config->sorted)
  {
    /* If routes are sorted, just insert new route to appropriate position */
    if (new)
    {
      if (before_old && !rte_better(new, before_old))
        k = &before_old->next;
      else
        k = &net->routes;

      for (; *k; k=&(*k)->next)
        if (rte_better(new, *k))
          break;

      new->next = *k;
      *k = new;

      table->rt_count++;
    }
  }
  else
  {
    /* If routes are not sorted, find the best route and move it on
       the first position. There are several optimized cases. */

    if (src->proto->rte_recalculate && src->proto->rte_recalculate(table, net, new, old, old_best))
      goto do_recalculate;

    if (new && rte_better(new, old_best))
    {
      /* The first case - the new route is clearly optimal,
         we link it at the first position */

      new->next = net->routes;
      net->routes = new;

      table->rt_count++;
    }
    else if (old == old_best)
    {
      /* The second case - the old best route disappeared, we add the
         new route (if we have any) to the list (we don't care about
         position) and then we elect the new optimal route and relink
         that route at the first position and announce it. The new
         optimal route might be NULL if there are no more routes */

    do_recalculate:
      /* Add the new route to the list */
      if (new)
      {
        new->next = *pos;
        *pos = new;

        table->rt_count++;
      }

      /* Find a new optimal route (if there is any) */
      if (net->routes)
      {
        rte **bp = &net->routes;
        for (k=&(*bp)->next; *k; k=&(*k)->next)
          if (rte_better(*k, *bp))
            bp = k;

        /* And relink it */
        rte *best = *bp;
        *bp = best->next;
        best->next = net->routes;
        net->routes = best;
      }
    }
    else if (new)
    {
      /* The third case - the new route is not better than the old
         best route (therefore old_best != NULL) and the old best
         route was not removed (therefore old_best == net->routes).
         We just link the new route to the old/last position. */

      new->next = *pos;
      *pos = new;

      table->rt_count++;
    }
    /* The fourth (empty) case - suboptimal route was removed, nothing to do */
  }

  if (new)
  {
    new->lastmod = current_time();

    if (!old)
    {
      new->id = hmap_first_zero(&table->id_map);
      hmap_set(&table->id_map, new->id);
    }
    else
      new->id = old->id;
  }

  /* Log the route change */
  if ((c->debug & D_ROUTES) || (p->debug & D_ROUTES))
  {
    if (new_ok)
      rte_trace(c, new, '>', new == net->routes ? "added [best]" : "added");
    else if (old_ok)
    {
      if (old != old_best)
        rte_trace(c, old, '>', "removed");
      else if (rte_is_ok(net->routes))
        rte_trace(c, old, '>', "removed [replaced]");
      else
        rte_trace(c, old, '>', "removed [sole]");
    }
  }

  /* Propagate the route change */
  rte_announce(table, RA_UNDEF, net, new, old, net->routes, old_best);

  if (!net->routes &&
      (table->gc_counter++ >= table->config->gc_threshold))
    rt_kick_prune_timer(table);

  if (old_ok && p->rte_remove)
    p->rte_remove(net, old);
  if (new_ok && p->rte_insert)
    p->rte_insert(net, new);

  if (old)
  {
    if (!new)
      hmap_clear(&table->id_map, old->id);

    rte_free_quick(old);
  }
}

static int rte_update_nest_cnt;		/* Nesting counter to allow recursive updates */

static inline void
rte_update_lock(void)
{
  rte_update_nest_cnt++;
}

static inline void
rte_update_unlock(void)
{
  if (!--rte_update_nest_cnt)
    lp_flush(rte_update_pool);
}

/**
 * rte_update - enter a new update to a routing table
 * @table: table to be updated
 * @c: channel doing the update
 * @net: network node
 * @p: protocol submitting the update
 * @src: protocol originating the update
 * @new: a &rte representing the new route or %NULL for route removal.
 *
 * This function is called by the routing protocols whenever they discover
 * a new route or wish to update/remove an existing route. The right announcement
 * sequence is to build route attributes first (either un-cached with @aflags set
 * to zero or a cached one using rta_lookup(); in this case please note that
 * you need to increase the use count of the attributes yourself by calling
 * rta_clone()), call rte_get_temp() to obtain a temporary &rte, fill in all
 * the appropriate data and finally submit the new &rte by calling rte_update().
 *
 * @src specifies the protocol that originally created the route and the meaning
 * of protocol-dependent data of @new. If @new is not %NULL, @src has to be the
 * same value as @new->attrs->proto. @p specifies the protocol that called
 * rte_update(). In most cases it is the same protocol as @src. rte_update()
 * stores @p in @new->sender.
 *
 * When rte_update() gets any route, it automatically validates it (checks
 * whether the network and next hop address are valid IP addresses and also
 * whether a normal routing protocol doesn't try to smuggle a host or link
 * scope route to the table), converts all protocol dependent attributes stored
 * in the &rte to temporary extended attributes, consults import filters of the
 * protocol to see if the route should be accepted and/or its attributes
 * modified, and stores the temporary attributes back to the &rte.
 *
 * Now, having a "public" version of the route, we
 * automatically find any old route defined by the protocol @src
 * for network @n, replace it by the new one (or remove it if @new is %NULL),
 * recalculate the optimal route for this destination and finally broadcast
 * the change (if any) to all routing protocols by calling rte_announce().
 *
 * All memory used for attribute lists and other temporary allocations is taken
 * from a special linear pool @rte_update_pool and freed when rte_update()
 * finishes.
 */
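
/*
 * A minimal import sketch following the sequence above (editorial addition;
 * attribute fields abbreviated, the channel c, source src and prefix n come
 * from the caller):
 *
 *   rta a0 = { .source = RTS_STATIC, .scope = SCOPE_UNIVERSE, ... };
 *   rta *a = rta_lookup(&a0);          get a cached copy
 *   rte *e = rte_get_temp(a, src);     temporary route entry
 *   rte_update2(c, n, e, src);         insert/replace; passing NULL
 *                                      instead of e would be a withdraw
 */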

void
rte_update2(struct channel *c, const net_addr *n, rte *new, struct rte_src *src)
{
  struct proto *p = c->proto;
  struct proto_stats *stats = &c->stats;
  const struct filter *filter = c->in_filter;
  struct mpls_fec *fec = NULL;
  net *nn;

  ASSERT(c->channel_state == CS_UP);

  rte_update_lock();
  if (new)
  {
    /* Create a temporary table node */
    nn = alloca(sizeof(net) + n->length);
    memset(nn, 0, sizeof(net) + n->length);
    net_copy(nn->n.addr, n);

    new->net = nn;
    new->sender = c;

    stats->imp_updates_received++;
    if (!rte_validate(new))
    {
      rte_trace_in(D_FILTERS, c, new, "invalid");
      stats->imp_updates_invalid++;
      goto drop;
    }

    if (filter == FILTER_REJECT)
    {
      stats->imp_updates_filtered++;
      rte_trace_in(D_FILTERS, c, new, "filtered out");

      if (! c->in_keep_filtered)
        goto drop;

      /* new is a private copy, we can modify it */
      new->flags |= REF_FILTERED;
    }
    else if (filter)
    {
      int fr = f_run(filter, &new, rte_update_pool, 0);
      if (fr > F_ACCEPT)
      {
        stats->imp_updates_filtered++;
        rte_trace_in(D_FILTERS, c, new, "filtered out");

        if (! c->in_keep_filtered)
          goto drop;

        new->flags |= REF_FILTERED;
      }
    }

    if (p->mpls_map)
    {
      if (mpls_handle_rte(p->mpls_map, n, new, rte_update_pool, &fec) < 0)
      {
        rte_trace_in(D_FILTERS, c, new, "invalid");
        stats->imp_updates_invalid++;
        goto drop;
      }
    }

    if (!rta_is_cached(new->attrs)) /* Need to copy attributes */
      new->attrs = rta_lookup(new->attrs);
    new->flags |= REF_COW;

    /* Use the actual struct network, not the dummy one */
    nn = net_get(c->table, n);
    new->net = nn;
  }
  else
  {
    stats->imp_withdraws_received++;

    if (!(nn = net_find(c->table, n)) || !src)
    {
      stats->imp_withdraws_ignored++;
      rte_update_unlock();
      return;
    }
  }

 recalc:
  /* And recalculate the best route */
  rte_recalculate(c, nn, new, src);

  if (p->mpls_map)
    mpls_handle_rte_cleanup(p->mpls_map, &fec);

  rte_update_unlock();
  return;

 drop:
  rte_free(new);
  new = NULL;
  if (nn = net_find(c->table, n))
    goto recalc;

  rte_update_unlock();
}

/* Independent call to rte_announce(), used from next hop
   recalculation, outside of rte_update(). new must be non-NULL */
static inline void
rte_announce_i(rtable *tab, uint type, net *net, rte *new, rte *old,
               rte *new_best, rte *old_best)
{
  rte_update_lock();
  rte_announce(tab, type, net, new, old, new_best, old_best);
  rte_update_unlock();
}

static inline void
rte_discard(rte *old)	/* Non-filtered route deletion, used during garbage collection */
{
  rte_update_lock();
  rte_recalculate(old->sender, old->net, NULL, old->src);
  rte_update_unlock();
}

/* Modify existing route by protocol hook, used for long-lived graceful restart */
static inline void
rte_modify(rte *old)
{
  rte_update_lock();

  rte *new = old->sender->proto->rte_modify(old, rte_update_pool);
  if (new != old)
  {
    if (new)
    {
      if (!rta_is_cached(new->attrs))
        new->attrs = rta_lookup(new->attrs);
      new->flags = (old->flags & ~REF_MODIFY) | REF_COW;
    }

    rte_recalculate(old->sender, old->net, new, old->src);
  }

  rte_update_unlock();
}

/* Check rtable for best route to given net whether it would be exported to p */
int
rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter)
{
  struct proto *p = c->proto;
  net *n = net_find(t, a);
  rte *rt = n ? n->routes : NULL;

  if (!rte_is_valid(rt))
    return 0;

  rte_update_lock();

  /* Rest is stripped down export_filter() */
  int v = p->preexport ? p->preexport(c, rt) : 0;
  if (v == RIC_PROCESS)
    v = (f_run(filter, &rt, rte_update_pool, FF_SILENT) <= F_ACCEPT);

  /* Discard temporary rte */
  if (rt != n->routes)
    rte_free(rt);

  rte_update_unlock();

  return v > 0;
}


/**
 * rt_refresh_begin - start a refresh cycle
 * @t: related routing table
 * @c: related channel
 *
 * This function starts a refresh cycle for the given routing table and
 * channel. The refresh cycle is a sequence where the protocol sends all its
 * valid routes to the routing table (by rte_update()). After that, all
 * protocol routes (more precisely routes with @c as @sender) not sent during
 * the refresh cycle but still in the table from the past are pruned. This is
 * implemented by marking all related routes as stale by the REF_STALE flag in
 * rt_refresh_begin(), then marking all related stale routes with the
 * REF_DISCARD flag in rt_refresh_end() and then removing such routes in the
 * prune loop.
 */
void
rt_refresh_begin(rtable *t, struct channel *c)
{
  if (c->debug & D_EVENTS)
    log(L_TRACE "%s.%s: Route refresh begin", c->proto->name, c->name);

  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for (e = n->routes; e; e = e->next)
      if (e->sender == c)
        e->flags |= REF_STALE;
  }
  FIB_WALK_END;
}

/**
 * rt_refresh_end - end a refresh cycle
 * @t: related routing table
 * @c: related channel
 *
 * This function ends a refresh cycle for the given routing table and channel.
 * See rt_refresh_begin() for a description of refresh cycles.
 */
void
rt_refresh_end(rtable *t, struct channel *c)
{
  if (c->debug & D_EVENTS)
    log(L_TRACE "%s.%s: Route refresh end", c->proto->name, c->name);

  int prune = 0;

  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for (e = n->routes; e; e = e->next)
      if ((e->sender == c) && (e->flags & REF_STALE))
      {
        e->flags |= REF_DISCARD;
        prune = 1;
      }
  }
  FIB_WALK_END;

  if (prune)
    rt_schedule_prune(t);
}
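
/*
 * Sketch of a protocol-side refresh (editorial addition), e.g. after a BGP
 * ROUTE-REFRESH is processed:
 *
 *   rt_refresh_begin(c->table, c);     mark our routes REF_STALE
 *   ... rte_update2() for every still-valid route ...
 *   rt_refresh_end(c->table, c);       discard whatever stayed stale
 */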

void
rt_modify_stale(rtable *t, struct channel *c)
{
  int prune = 0;

  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for (e = n->routes; e; e = e->next)
      if ((e->sender == c) && (e->flags & REF_STALE) && !(e->flags & REF_FILTERED))
      {
        e->flags |= REF_MODIFY;
        prune = 1;
      }
  }
  FIB_WALK_END;

  if (prune)
    rt_schedule_prune(t);
}

/**
 * rte_dump - dump a route
 * @e: &rte to be dumped
 *
 * This function dumps the contents of a &rte to debug output.
 */
void
rte_dump(struct dump_request *dreq, rte *e)
{
  net *n = e->net;
  RDUMP("%-1N ", n->n.addr);
  RDUMP("PF=%02x ", e->pflags);
  rta_dump(dreq, e->attrs);
  RDUMP("\n");
}

/**
 * rt_dump - dump a routing table
 * @t: routing table to be dumped
 *
 * This function dumps the contents of a given routing table to debug output.
 */
void
rt_dump(struct dump_request *dreq, rtable *t)
{
  RDUMP("Dump of routing table <%s>\n", t->name);
#ifdef DEBUGGING
  fib_check(&t->fib);
#endif
  FIB_WALK(&t->fib, net, n)
  {
    rte *e;
    for (e = n->routes; e; e = e->next)
      rte_dump(dreq, e);
  }
  FIB_WALK_END;
  RDUMP("\n");
}

/**
 * rt_dump_all - dump all routing tables
 *
 * This function dumps the contents of all routing tables to debug output.
 */
void
rt_dump_all(struct dump_request *dreq)
{
  rtable *t;
  node *n;

  WALK_LIST2(t, n, routing_tables, n)
    rt_dump(dreq, t);
}

static inline void
rt_schedule_hcu(rtable *tab)
{
  if (tab->hcu_scheduled)
    return;

  tab->hcu_scheduled = 1;
  ev_schedule(tab->rt_event);
}

static inline void
rt_schedule_nhu(rtable *tab)
{
  if (tab->nhu_state == NHU_CLEAN)
    ev_schedule(tab->rt_event);

  /* state change:
   *   NHU_CLEAN   -> NHU_SCHEDULED
   *   NHU_RUNNING -> NHU_DIRTY
   */
  tab->nhu_state |= NHU_SCHEDULED;
}

void
rt_schedule_prune(rtable *tab)
{
  if (tab->prune_state == 0)
    ev_schedule(tab->rt_event);

  /* state change 0->1, 2->3 */
  tab->prune_state |= 1;
}


static void
rt_event(void *ptr)
{
  rtable *tab = ptr;

  rt_lock_table(tab);

  if (tab->hcu_scheduled)
    rt_update_hostcache(tab);

  if (tab->nhu_state)
    rt_next_hop_update(tab);

  if (tab->prune_state)
    rt_prune_table(tab);

  rt_unlock_table(tab);
}


static void
rt_prune_timer(timer *t)
{
  rtable *tab = t->data;

  if (tab->gc_counter >= tab->config->gc_threshold)
    rt_schedule_prune(tab);
}

static void
rt_kick_prune_timer(rtable *tab)
{
  /* Return if prune is already scheduled */
  if (tm_active(tab->prune_timer) || (tab->prune_state & 1))
    return;

  /* Randomize GC period to +/- 50% */
  btime gc_period = tab->config->gc_period;
  gc_period = (gc_period / 2) + (random_u32() % (uint) gc_period);
  tm_start(tab->prune_timer, gc_period);
}


static inline btime
rt_settled_time(rtable *tab)
{
  ASSUME(tab->base_settle_time != 0);

  return MIN(tab->last_rt_change + tab->config->min_settle_time,
             tab->base_settle_time + tab->config->max_settle_time);
}

static void
rt_settle_timer(timer *t)
{
  rtable *tab = t->data;

  if (!tab->base_settle_time)
    return;

  btime settled_time = rt_settled_time(tab);
  if (current_time() < settled_time)
  {
    tm_set(tab->settle_timer, settled_time);
    return;
  }

  /* Settled */
  tab->base_settle_time = 0;

  struct rt_subscription *s;
  WALK_LIST(s, tab->subscribers)
    s->hook(s);
}

static void
rt_kick_settle_timer(rtable *tab)
{
  tab->base_settle_time = current_time();

  if (!tab->settle_timer)
    tab->settle_timer = tm_new_init(tab->rp, rt_settle_timer, tab, 0, 0);

  if (!tm_active(tab->settle_timer))
    tm_set(tab->settle_timer, rt_settled_time(tab));
}

static inline void
rt_schedule_notify(rtable *tab)
{
  if (EMPTY_LIST(tab->subscribers))
    return;

  if (tab->base_settle_time)
    return;

  rt_kick_settle_timer(tab);
}

void
rt_subscribe(rtable *tab, struct rt_subscription *s)
{
  s->tab = tab;
  rt_lock_table(tab);
  add_tail(&tab->subscribers, &s->n);
}

void
rt_unsubscribe(struct rt_subscription *s)
{
  rem_node(&s->n);
  rt_unlock_table(s->tab);
}
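
/*
 * Example subscriber (editorial sketch; the hook name is hypothetical).
 * This is the third mechanism from the DOC comment above: the hook fires
 * once the table has settled after a change.
 *
 *   static void my_settle_hook(struct rt_subscription *s)
 *   { ... schedule re-validation / refeed ... }
 *
 *   s->hook = my_settle_hook;
 *   rt_subscribe(tab, s);       starts delivery, locks the table
 *   ...
 *   rt_unsubscribe(s);          stops delivery, unlocks the table
 */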
1948
1949 static struct rt_flowspec_link *
1950 rt_flowspec_find_link(rtable *src, rtable *dst)
1951 {
1952 struct rt_flowspec_link *ln;
1953 WALK_LIST(ln, src->flowspec_links)
1954 if ((ln->src == src) && (ln->dst == dst))
1955 return ln;
1956
1957 return NULL;
1958 }
1959
1960 void
1961 rt_flowspec_link(rtable *src, rtable *dst)
1962 {
1963 ASSERT(rt_is_ip(src));
1964 ASSERT(rt_is_flow(dst));
1965
1966 struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst);
1967
1968 if (!ln)
1969 {
1970 rt_lock_table(src);
1971 rt_lock_table(dst);
1972
1973 ln = mb_allocz(src->rp, sizeof(struct rt_flowspec_link));
1974 ln->src = src;
1975 ln->dst = dst;
1976 add_tail(&src->flowspec_links, &ln->n);
1977 }
1978
1979 ln->uc++;
1980 }
1981
1982 void
1983 rt_flowspec_unlink(rtable *src, rtable *dst)
1984 {
1985 struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst);
1986
1987 ASSERT(ln && (ln->uc > 0));
1988
1989 ln->uc--;
1990
1991 if (!ln->uc)
1992 {
1993 rem_node(&ln->n);
1994 mb_free(ln);
1995
1996 rt_unlock_table(src);
1997 rt_unlock_table(dst);
1998 }
1999 }
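/* A hedged sketch of the link lifecycle (table names illustrative): links
 * are reference-counted, so repeated linking of the same pair is cheap and
 * both tables stay locked until the last unlink. */
#if 0
rt_flowspec_link(ip_tab, flow_tab);    /* uc == 1, both tables locked */
rt_flowspec_link(ip_tab, flow_tab);    /* same pair again: uc == 2 */
rt_flowspec_unlink(ip_tab, flow_tab);  /* uc == 1, link kept */
rt_flowspec_unlink(ip_tab, flow_tab);  /* uc == 0, link freed, tables unlocked */
#endif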
2000
2001 static void
2002 rt_flowspec_notify(rtable *src, net *net)
2003 {
2004 /* Only IP tables are src links */
2005 ASSERT(rt_is_ip(src));
2006
2007 struct rt_flowspec_link *ln;
2008 WALK_LIST(ln, src->flowspec_links)
2009 {
2010 rtable *dst = ln->dst;
2011 ASSERT(rt_is_flow(dst));
2012
2013 /* No need to inspect it further if recalculation is already active */
2014 if ((dst->nhu_state == NHU_SCHEDULED) || (dst->nhu_state == NHU_DIRTY))
2015 continue;
2016
2017 if (trie_match_net(dst->flowspec_trie, net->n.addr))
2018 rt_schedule_nhu(dst);
2019 }
2020 }
2021
2022 static void
2023 rt_flowspec_reset_trie(rtable *tab)
2024 {
2025 linpool *lp = tab->flowspec_trie->lp;
2026 int ipv4 = tab->flowspec_trie->ipv4;
2027
2028 lp_flush(lp);
2029 tab->flowspec_trie = f_new_trie(lp, 0);
2030 tab->flowspec_trie->ipv4 = ipv4;
2031 }
2032
2033 static void
2034 rt_free(resource *_r)
2035 {
2036 rtable *r = (rtable *) _r;
2037
2038 DBG("Deleting routing table %s\n", r->name);
2039 ASSERT_DIE(r->use_count == 0);
2040
2041 if (r->internal)
2042 return;
2043
2044 r->config->table = NULL;
2045 rem_node(&r->n);
2046
2047 if (r->hostcache)
2048 rt_free_hostcache(r);
2049
2050 /* Freed automagically by the resource pool
2051 fib_free(&r->fib);
2052 hmap_free(&r->id_map);
2053 rfree(r->rt_event);
2054 rfree(r->settle_timer);
2055 mb_free(r);
2056 */
2057 }
2058
2059 static void
2060 rt_res_dump(struct dump_request *dreq, resource *_r)
2061 {
2062 rtable *r = (rtable *) _r;
2063 RDUMP("name \"%s\", addr_type=%s, rt_count=%u, use_count=%d\n",
2064 r->name, net_label[r->addr_type], r->rt_count, r->use_count);
2065 }
2066
2067 static struct resclass rt_class = {
2068 .name = "Routing table",
2069 .size = sizeof(struct rtable),
2070 .free = rt_free,
2071 .dump = rt_res_dump,
2072 .lookup = NULL,
2073 .memsize = NULL,
2074 };
2075
2076 rtable *
2077 rt_setup(pool *pp, struct rtable_config *cf)
2078 {
2079 pool *p = rp_newf(pp, "Routing table %s", cf->name);
2080
2081 rtable *t = ralloc(p, &rt_class);
2082 t->rp = p;
2083
2084 t->name = cf->name;
2085 t->config = cf;
2086 t->addr_type = cf->addr_type;
2087 t->debug = cf->debug;
2088
2089 fib_init(&t->fib, p, t->addr_type, sizeof(net), OFFSETOF(net, n), 0, NULL);
2090
2091 if (cf->trie_used)
2092 {
2093 t->trie = f_new_trie(lp_new_default(p), 0);
2094 t->trie->ipv4 = net_val_match(t->addr_type, NB_IP4 | NB_VPN4 | NB_ROA4);
2095
2096 t->fib.init = net_init_with_trie;
2097 }
2098
2099 init_list(&t->channels);
2100 init_list(&t->flowspec_links);
2101 init_list(&t->subscribers);
2102
2103 hmap_init(&t->id_map, p, 1024);
2104 hmap_set(&t->id_map, 0);
2105
2106 if (!(t->internal = cf->internal))
2107 {
2108 t->rt_event = ev_new_init(p, rt_event, t);
2109 t->prune_timer = tm_new_init(p, rt_prune_timer, t, 0, 0);
2110 t->last_rt_change = t->gc_time = current_time();
2111
2112 if (rt_is_flow(t))
2113 {
2114 t->flowspec_trie = f_new_trie(lp_new_default(p), 0);
2115 t->flowspec_trie->ipv4 = (t->addr_type == NET_FLOW4);
2116 }
2117 }
2118
2119 return t;
2120 }
2121
2122 /**
2123 * rt_init - initialize routing tables
2124 *
2125 * This function is called during BIRD startup. It initializes the
2126 * routing table module.
2127 */
2128 void
2129 rt_init(void)
2130 {
2131 rta_init();
2132 rt_table_pool = rp_new(&root_pool, "Routing tables");
2133 rte_update_pool = lp_new_default(rt_table_pool);
2134 rte_slab = sl_new(rt_table_pool, sizeof(rte));
2135 init_list(&routing_tables);
2136 }
2137
2138
2139 /**
2140 * rt_prune_table - prune a routing table
2141 *
2142 * The prune loop scans routing tables and removes routes belonging to flushing
2143 * protocols, discarded routes and also stale network entries. It is called from
2144 * rt_event(). The event is rescheduled if the current iteration does not finish
2145 * the table. The pruning is directed by the prune state (@prune_state),
2146 * specifying whether the prune cycle is scheduled or running, and there
2147 * is also a persistent pruning iterator (@prune_fit).
2148 *
2149 * The prune loop is also used for channel flushing. For this purpose, the
2150 * channels to flush are marked before the iteration and notified after the
2151 * iteration.
2152 */
2153 static void
2154 rt_prune_table(rtable *tab)
2155 {
2156 struct fib_iterator *fit = &tab->prune_fit;
2157 int limit = 2000;
2158
2159 struct channel *c;
2160 node *n, *x;
2161
2162 DBG("Pruning route table %s\n", tab->name);
2163 #ifdef DEBUGGING
2164 fib_check(&tab->fib);
2165 #endif
2166
2167 if (tab->prune_state == 0)
2168 return;
2169
2170 if (tab->prune_state == 1)
2171 {
2172 /* Mark channels to flush */
2173 WALK_LIST2(c, n, tab->channels, table_node)
2174 if (c->channel_state == CS_FLUSHING)
2175 c->flush_active = 1;
2176
2177 FIB_ITERATE_INIT(fit, &tab->fib);
2178 tab->prune_state = 2;
2179
2180 tab->gc_counter = 0;
2181 tab->gc_time = current_time();
2182
2183 if (tab->prune_trie)
2184 {
2185 /* Init prefix trie pruning */
2186 tab->trie_new = f_new_trie(lp_new_default(tab->rp), 0);
2187 tab->trie_new->ipv4 = tab->trie->ipv4;
2188 }
2189 }
2190
2191 again:
2192 FIB_ITERATE_START(&tab->fib, fit, net, n)
2193 {
2194 rte *e;
2195
2196 rescan:
2197 if (limit <= 0)
2198 {
2199 FIB_ITERATE_PUT(fit);
2200 ev_schedule(tab->rt_event);
2201 return;
2202 }
2203
2204 for (e=n->routes; e; e=e->next)
2205 {
2206 if (e->sender->flush_active || (e->flags & REF_DISCARD))
2207 {
2208 rte_discard(e);
2209 limit--;
2210
2211 goto rescan;
2212 }
2213
2214 if (e->flags & REF_MODIFY)
2215 {
2216 rte_modify(e);
2217 limit--;
2218
2219 goto rescan;
2220 }
2221 }
2222
2223 if (!n->routes) /* Orphaned FIB entry */
2224 {
2225 FIB_ITERATE_PUT(fit);
2226 fib_delete(&tab->fib, n);
2227 goto again;
2228 }
2229
2230 if (tab->trie_new)
2231 {
2232 trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen);
2233 limit--;
2234 }
2235 }
2236 FIB_ITERATE_END;
2237
2238 #ifdef DEBUGGING
2239 fib_check(&tab->fib);
2240 #endif
2241
2242 /* state change 2->0, 3->1 */
2243 tab->prune_state &= 1;
2244
2245 if (tab->trie_new)
2246 {
2247 /* Finish prefix trie pruning */
2248
2249 if (!tab->trie_lock_count)
2250 {
2251 rfree(tab->trie->lp);
2252 }
2253 else
2254 {
2255 ASSERT(!tab->trie_old);
2256 tab->trie_old = tab->trie;
2257 tab->trie_old_lock_count = tab->trie_lock_count;
2258 tab->trie_lock_count = 0;
2259 }
2260
2261 tab->trie = tab->trie_new;
2262 tab->trie_new = NULL;
2263 tab->prune_trie = 0;
2264 }
2265 else
2266 {
2267 /* Schedule prefix trie pruning */
2268 if (tab->trie && !tab->trie_old && (tab->trie->prefix_count > (2 * tab->fib.entries)))
2269 {
2270 /* state change 0->1, 2->3 */
2271 tab->prune_state |= 1;
2272 tab->prune_trie = 1;
2273 }
2274 }
2275
2276 if (tab->prune_state > 0)
2277 ev_schedule(tab->rt_event);
2278
2279 /* FIXME: This should be handled in a better way */
2280 rt_prune_sources();
2281
2282 /* Close flushed channels */
2283 WALK_LIST2_DELSAFE(c, n, x, tab->channels, table_node)
2284 if (c->flush_active)
2285 {
2286 c->flush_active = 0;
2287 channel_set_state(c, CS_DOWN);
2288 }
2289
2290 return;
2291 }
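/* Editorial summary of the prune_state encoding implied by the comments
 * above: bit 0 means a prune cycle is pending, bit 1 means one is running.
 * Scheduling sets bit 0 ("0->1, 2->3"), a finished iteration clears bit 1
 * ("2->0, 3->1"), and a leftover bit 0 immediately reschedules the event. */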
2292
2293 /**
2294 * rt_lock_trie - lock a prefix trie of a routing table
2295 * @tab: routing table with prefix trie to be locked
2296 *
2297 * The prune loop may rebuild the prefix trie and invalidate f_trie_walk_state
2298 * structures. Therefore, asynchronous walks should lock the prefix trie using
2299 * this function. That allows the prune loop to rebuild the trie, but postpones
2300 * its freeing until all walks are done (unlocked by rt_unlock_trie()).
2301 *
2302 * Returns the current trie that will be locked; the value should be passed
2303 * back to rt_unlock_trie() for unlocking.
2304 *
2305 */
2306 struct f_trie *
2307 rt_lock_trie(rtable *tab)
2308 {
2309 ASSERT(tab->trie);
2310
2311 tab->trie_lock_count++;
2312 return tab->trie;
2313 }
2314
2315 /**
2316 * rt_unlock_trie - unlock a prefix trie of a routing table
2317 * @tab: routing table with prefix trie to be unlocked
2318 * @trie: value returned by matching rt_lock_trie()
2319 *
2320 * Called for a trie locked by rt_lock_trie() after the walk over the trie is done.
2321 * It may free the trie and schedule next trie pruning.
2322 */
2323 void
2324 rt_unlock_trie(rtable *tab, struct f_trie *trie)
2325 {
2326 ASSERT(trie);
2327
2328 if (trie == tab->trie)
2329 {
2330 /* Unlock the current prefix trie */
2331 ASSERT(tab->trie_lock_count);
2332 tab->trie_lock_count--;
2333 }
2334 else if (trie == tab->trie_old)
2335 {
2336 /* Unlock the old prefix trie */
2337 ASSERT(tab->trie_old_lock_count);
2338 tab->trie_old_lock_count--;
2339
2340 /* Free old prefix trie that is no longer needed */
2341 if (!tab->trie_old_lock_count)
2342 {
2343 rfree(tab->trie_old->lp);
2344 tab->trie_old = NULL;
2345
2346 /* Kick prefix trie pruning that was postponed */
2347 if (tab->trie && (tab->trie->prefix_count > (2 * tab->fib.entries)))
2348 {
2349 tab->prune_trie = 1;
2350 rt_schedule_prune(tab);
2351 }
2352 }
2353 }
2354 else
2355 log(L_BUG "Invalid arg to rt_unlock_trie()");
2356 }
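/* A minimal sketch of the locking pattern described above, assuming some
 * net_addr prefix to walk from (illustrative; interruption of the walk and
 * its state handling are elided): */
#if 0
struct f_trie *trie = rt_lock_trie(tab);

TRIE_WALK(trie, subnet, &prefix)
{
  /* Inspect subnet; the prune loop may replace tab->trie meanwhile,
     but this trie stays valid until rt_unlock_trie() */
}
TRIE_WALK_END;

rt_unlock_trie(tab, trie);  /* May free an old trie and re-kick pruning */
#endif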
2357
2358
2359 void
2360 rt_preconfig(struct config *c)
2361 {
2362 init_list(&c->tables);
2363
2364 rt_new_table(cf_get_symbol(c, "master4"), NET_IP4);
2365 rt_new_table(cf_get_symbol(c, "master6"), NET_IP6);
2366 }
2367
2368 void
2369 rt_postconfig(struct config *c)
2370 {
2371 uint num_tables = list_length(&c->tables);
2372 btime def_gc_period = 400 MS * num_tables;
2373 def_gc_period = MAX(def_gc_period, 10 S);
2374 def_gc_period = MIN(def_gc_period, 600 S);
2375
2376 struct rtable_config *rc;
2377 WALK_LIST(rc, c->tables)
2378 if (rc->gc_period == (uint) -1)
2379 rc->gc_period = (uint) def_gc_period;
2380 }
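/* Editorial example: with 50 configured tables the default works out as
 * MIN(MAX(400 MS * 50, 10 S), 600 S) = 20 S; small setups are clamped up to
 * 10 S, very large ones down to 600 S. */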
2381
2382
2383 /*
2384 * Some functions for handling internal next hop updates
2385 * triggered by rt_schedule_nhu().
2386 */
2387
2388 void
2389 rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls)
2390 {
2391 a->hostentry = he;
2392 a->dest = he->dest;
2393 a->igp_metric = he->igp_metric;
2394
2395 if (a->dest != RTD_UNICAST)
2396 {
2397 /* No nexthop */
2398 no_nexthop:
2399 a->nh = (struct nexthop) {};
2400 if (mls)
2401 { /* Store the label stack for later changes */
2402 a->nh.labels_orig = a->nh.labels = mls->len;
2403 memcpy(a->nh.label, mls->stack, mls->len * sizeof(u32));
2404 }
2405 return;
2406 }
2407
2408 if (((!mls) || (!mls->len)) && he->nexthop_linkable)
2409 { /* Just link the nexthop chain, no label append happens. */
2410 memcpy(&(a->nh), &(he->src->nh), nexthop_size(&(he->src->nh)));
2411 return;
2412 }
2413
2414 struct nexthop *nhp = NULL, *nhr = NULL;
2415 int skip_nexthop = 0;
2416
2417 for (struct nexthop *nh = &(he->src->nh); nh; nh = nh->next)
2418 {
2419 if (skip_nexthop)
2420 skip_nexthop--;
2421 else
2422 {
2423 nhr = nhp;
2424 nhp = (nhp ? (nhp->next = lp_alloc(rte_update_pool, NEXTHOP_MAX_SIZE)) : &(a->nh));
2425 }
2426
2427 memset(nhp, 0, NEXTHOP_MAX_SIZE);
2428 nhp->iface = nh->iface;
2429 nhp->weight = nh->weight;
2430
2431 if (mls)
2432 {
2433 nhp->labels = nh->labels + mls->len;
2434 nhp->labels_orig = mls->len;
2435 if (nhp->labels <= MPLS_MAX_LABEL_STACK)
2436 {
2437 memcpy(nhp->label, nh->label, nh->labels * sizeof(u32)); /* First the hostentry labels */
2438 memcpy(&(nhp->label[nh->labels]), mls->stack, mls->len * sizeof(u32)); /* Then the bottom labels */
2439 }
2440 else
2441 {
2442 log(L_WARN "Sum of label stack sizes %d + %d = %d exceedes allowed maximum (%d)",
2443 nh->labels, mls->len, nhp->labels, MPLS_MAX_LABEL_STACK);
2444 skip_nexthop++;
2445 continue;
2446 }
2447 }
2448 else if (nh->labels)
2449 {
2450 nhp->labels = nh->labels;
2451 nhp->labels_orig = 0;
2452 memcpy(nhp->label, nh->label, nh->labels * sizeof(u32));
2453 }
2454
2455 if (ipa_nonzero(nh->gw))
2456 {
2457 nhp->gw = nh->gw; /* Router nexthop */
2458 nhp->flags |= (nh->flags & RNF_ONLINK);
2459 }
2460 else if (!(nh->iface->flags & IF_MULTIACCESS) || (nh->iface->flags & IF_LOOPBACK))
2461 nhp->gw = IPA_NONE; /* PtP link - no need for nexthop */
2462 else if (ipa_nonzero(he->link))
2463 nhp->gw = he->link; /* Device nexthop with link-local address known */
2464 else
2465 nhp->gw = he->addr; /* Device nexthop with link-local address unknown */
2466 }
2467
2468 if (skip_nexthop)
2469 if (nhr)
2470 nhr->next = NULL;
2471 else
2472 {
2473 a->dest = RTD_UNREACHABLE;
2474 log(L_WARN "No valid nexthop remaining, setting route unreachable");
2475 goto no_nexthop;
2476 }
2477 }
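/* Editorial example of the label composition above: a hostentry next hop
 * carrying labels [A] combined with an mls of [B, C] yields
 * nhp->label = [A, B, C], nhp->labels = 3 and nhp->labels_orig = 2, so the
 * two bottom labels can later be peeled off again in
 * rt_next_hop_update_rte() below. */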
2478
2479 static inline int
2480 rta_next_hop_outdated(rta *a)
2481 {
2482 struct hostentry *he = a->hostentry;
2483
2484 if (!he)
2485 return 0;
2486
2487 if (!he->src)
2488 return a->dest != RTD_UNREACHABLE;
2489
2490 return (a->dest != he->dest) || (a->igp_metric != he->igp_metric) ||
2491 (!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh));
2492 }
2493
2494 static inline rte *
2495 rt_next_hop_update_rte(rtable *tab UNUSED, rte *old)
2496 {
2497 if (!rta_next_hop_outdated(old->attrs))
2498 return NULL;
2499
2500 rta *a = alloca(RTA_MAX_SIZE);
2501 memcpy(a, old->attrs, rta_size(old->attrs));
2502
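  /* Recover the route's own label stack: the bottom labels_orig labels were
     appended by the previous rta_apply_hostentry() and get re-applied below
     against the (possibly changed) hostentry */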
2503 mpls_label_stack mls = { .len = a->nh.labels_orig };
2504 memcpy(mls.stack, &a->nh.label[a->nh.labels - mls.len], mls.len * sizeof(u32));
2505
2506 rta_apply_hostentry(a, old->attrs->hostentry, &mls);
2507 a->cached = 0;
2508
2509 rte *e = sl_alloc(rte_slab);
2510 memcpy(e, old, sizeof(rte));
2511 e->attrs = rta_lookup(a);
2512 rt_lock_source(e->src);
2513
2514 return e;
2515 }
2516
2517
2518 #ifdef CONFIG_BGP
2519
2520 static inline int
2521 net_flow_has_dst_prefix(const net_addr *n)
2522 {
2523 ASSUME(net_is_flow(n));
2524
2525 if (n->pxlen)
2526 return 1;
2527
2528 if (n->type == NET_FLOW4)
2529 {
2530 const net_addr_flow4 *n4 = (void *) n;
2531 return (n4->length > sizeof(net_addr_flow4)) && (n4->data[0] == FLOW_TYPE_DST_PREFIX);
2532 }
2533 else
2534 {
2535 const net_addr_flow6 *n6 = (void *) n;
2536 return (n6->length > sizeof(net_addr_flow6)) && (n6->data[0] == FLOW_TYPE_DST_PREFIX);
2537 }
2538 }
2539
2540 static inline int
2541 rta_as_path_is_empty(rta *a)
2542 {
2543 eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
2544 return !e || (as_path_getlen(e->u.ptr) == 0);
2545 }
2546
2547 static inline u32
2548 rta_get_first_asn(rta *a)
2549 {
2550 eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
2551 u32 asn;
2552
2553 return (e && as_path_get_first_regular(e->u.ptr, &asn)) ? asn : 0;
2554 }
2555
2556 int
2557 rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, rta *a, int interior)
2558 {
2559 ASSERT(rt_is_ip(tab_ip));
2560 ASSERT(rt_is_flow(tab_flow));
2561 ASSERT(tab_ip->trie);
2562
2563 /* RFC 8955 6. a) Flowspec has defined dst prefix */
2564 if (!net_flow_has_dst_prefix(n))
2565 return 0;
2566
2567 /* RFC 9117 4.1. Accept if AS_PATH is empty */
2568 if (interior && rta_as_path_is_empty(a))
2569 return 1;
2570
2571
2572 /* RFC 8955 6. b) Flowspec and its best-match route have the same originator */
2573
2574 /* Find flowspec dst prefix */
2575 net_addr dst;
2576 if (n->type == NET_FLOW4)
2577 net_fill_ip4(&dst, net4_prefix(n), net4_pxlen(n));
2578 else
2579 net_fill_ip6(&dst, net6_prefix(n), net6_pxlen(n));
2580
2581 /* Find best-match BGP unicast route for flowspec dst prefix */
2582 net *nb = net_route(tab_ip, &dst);
2583 rte *rb = nb ? nb->routes : NULL;
2584
2585 /* Register prefix to trie for tracking further changes */
2586 int max_pxlen = (n->type == NET_FLOW4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH;
2587 trie_add_prefix(tab_flow->flowspec_trie, &dst, (nb ? nb->n.addr->pxlen : 0), max_pxlen);
2588
2589 /* No best-match BGP route -> no flowspec */
2590 if (!rb || (rb->attrs->source != RTS_BGP))
2591 return 0;
2592
2593 /* Find ORIGINATOR_ID values */
2594 u32 orig_a = ea_get_int(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);
2595 u32 orig_b = ea_get_int(rb->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);
2596
2597 /* Originator is either ORIGINATOR_ID (if present), or BGP neighbor address (if not) */
2598 if ((orig_a != orig_b) || (!orig_a && !orig_b && !ipa_equal(a->from, rb->attrs->from)))
2599 return 0;
2600
2601
2602 /* Find ASN of the best-match route, for use in next checks */
2603 u32 asn_b = rta_get_first_asn(rb->attrs);
2604 if (!asn_b)
2605 return 0;
2606
2607 /* RFC 9117 4.2. For EBGP, flowspec and its best-match route are from the same AS */
2608 if (!interior && (rta_get_first_asn(a) != asn_b))
2609 return 0;
2610
2611 /* RFC 8955 6. c) More-specific routes are from the same AS as the best-match route */
2612 TRIE_WALK(tab_ip->trie, subnet, &dst)
2613 {
2614 net *nc = net_find_valid(tab_ip, &subnet);
2615 if (!nc)
2616 continue;
2617
2618 rte *rc = nc->routes;
2619 if (rc->attrs->source != RTS_BGP)
2620 return 0;
2621
2622 if (rta_get_first_asn(rc->attrs) != asn_b)
2623 return 0;
2624 }
2625 TRIE_WALK_END;
2626
2627 return 1;
2628 }
2629
2630 #endif /* CONFIG_BGP */
2631
2632 static rte *
2633 rt_flowspec_update_rte(rtable *tab, rte *r)
2634 {
2635 #ifdef CONFIG_BGP
2636 if ((r->attrs->source != RTS_BGP) || (r->sender->proto != r->src->proto))
2637 return NULL;
2638
2639 struct bgp_channel *bc = (struct bgp_channel *) r->sender;
2640 if (!bc->base_table)
2641 return NULL;
2642
2643 const net_addr *n = r->net->n.addr;
2644 struct bgp_proto *p = (void *) r->src->proto;
2645 int valid = rt_flowspec_check(bc->base_table, tab, n, r->attrs, p->is_interior);
2646 int dest = valid ? RTD_NONE : RTD_UNREACHABLE;
2647
2648 if (dest == r->attrs->dest)
2649 return NULL;
2650
2651 rta *a = alloca(RTA_MAX_SIZE);
2652 memcpy(a, r->attrs, rta_size(r->attrs));
2653 a->dest = dest;
2654 a->cached = 0;
2655
2656 rte *new = sl_alloc(rte_slab);
2657 memcpy(new, r, sizeof(rte));
2658 new->attrs = rta_lookup(a);
2659 rt_lock_source(new->src);
2660
2661 return new;
2662 #else
2663 return NULL;
2664 #endif
2665 }
2666
2667
2668 static inline int
2669 rt_next_hop_update_net(rtable *tab, net *n)
2670 {
2671 rte **k, *e, *new, *old_best, **new_best;
2672 int count = 0;
2673 int free_old_best = 0;
2674
2675 old_best = n->routes;
2676 if (!old_best)
2677 return 0;
2678
2679 for (k = &n->routes; e = *k; k = &e->next)
2680 {
2681 if (!net_is_flow(n->n.addr))
2682 new = rt_next_hop_update_rte(tab, e);
2683 else
2684 new = rt_flowspec_update_rte(tab, e);
2685
2686 if (new)
2687 {
2688 *k = new;
2689
2690 rte_trace_in(D_ROUTES, new->sender, new, "updated");
2691 rte_announce_i(tab, RA_ANY, n, new, e, NULL, NULL);
2692
2693 /* Call a pre-comparison hook */
2694 /* Not really an efficient way to compute this */
2695 if (e->src->proto->rte_recalculate)
2696 e->src->proto->rte_recalculate(tab, n, new, e, NULL);
2697
2698 if (e != old_best)
2699 rte_free_quick(e);
2700 else /* Freeing of the old best rte is postponed */
2701 free_old_best = 1;
2702
2703 e = new;
2704 count++;
2705 }
2706 }
2707
2708 if (!count)
2709 return 0;
2710
2711 /* Find the new best route */
2712 new_best = NULL;
2713 for (k = &n->routes; e = *k; k = &e->next)
2714 {
2715 if (!new_best || rte_better(e, *new_best))
2716 new_best = k;
2717 }
2718
2719 /* Relink the new best route to the first position */
2720 new = *new_best;
2721 if (new != n->routes)
2722 {
2723 *new_best = new->next;
2724 new->next = n->routes;
2725 n->routes = new;
2726 }
2727
2728 /* Announce the new best route */
2729 if (new != old_best)
2730 rte_trace_in(D_ROUTES, new->sender, new, "updated [best]");
2731
2732 /* Propagate changes */
2733 rte_announce_i(tab, RA_UNDEF, n, NULL, NULL, n->routes, old_best);
2734
2735 if (free_old_best)
2736 rte_free_quick(old_best);
2737
2738 return count;
2739 }
2740
2741 static void
2742 rt_next_hop_update(rtable *tab)
2743 {
2744 struct fib_iterator *fit = &tab->nhu_fit;
2745 int max_feed = 32;
2746
2747 if (tab->nhu_state == NHU_CLEAN)
2748 return;
2749
2750 if (tab->nhu_state == NHU_SCHEDULED)
2751 {
2752 FIB_ITERATE_INIT(fit, &tab->fib);
2753 tab->nhu_state = NHU_RUNNING;
2754
2755 if (tab->flowspec_trie)
2756 rt_flowspec_reset_trie(tab);
2757 }
2758
2759 FIB_ITERATE_START(&tab->fib, fit, net, n)
2760 {
2761 if (max_feed <= 0)
2762 {
2763 FIB_ITERATE_PUT(fit);
2764 ev_schedule(tab->rt_event);
2765 return;
2766 }
2767 max_feed -= rt_next_hop_update_net(tab, n);
2768 }
2769 FIB_ITERATE_END;
2770
2771 /* State change:
2772 * NHU_DIRTY -> NHU_SCHEDULED
2773 * NHU_RUNNING -> NHU_CLEAN
2774 */
2775 tab->nhu_state &= 1;
2776
2777 if (tab->nhu_state != NHU_CLEAN)
2778 ev_schedule(tab->rt_event);
2779 }
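/* Editorial note: nhu_state mirrors the prune_state encoding, with bit 0
 * meaning scheduled (NHU_SCHEDULED) and bit 1 meaning running (NHU_RUNNING,
 * both bits = NHU_DIRTY); "&= 1" above clears the running bit while keeping
 * a pending request scheduled. */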
2780
2781
2782 struct rtable_config *
2783 rt_new_table(struct symbol *s, uint addr_type)
2784 {
2785 /* Hack that allows one to 'redefine' the master table */
2786 if ((s->class == SYM_TABLE) &&
2787 (s->table == new_config->def_tables[addr_type]) &&
2788 ((addr_type == NET_IP4) || (addr_type == NET_IP6)))
2789 return s->table;
2790
2791 struct rtable_config *c = cfg_allocz(sizeof(struct rtable_config));
2792
2793 cf_define_symbol(new_config, s, SYM_TABLE, table, c);
2794 c->name = s->name;
2795 c->addr_type = addr_type;
2796 c->gc_threshold = 1000;
2797 c->gc_period = (uint) -1; /* set in rt_postconfig() */
2798 c->min_settle_time = 1 S;
2799 c->max_settle_time = 20 S;
2800 c->debug = new_config->table_default_debug;
2801
2802 add_tail(&new_config->tables, &c->n);
2803
2804 /* First table of each type is kept as default */
2805 if (! new_config->def_tables[addr_type])
2806 new_config->def_tables[addr_type] = c;
2807
2808 return c;
2809 }
2810
2811 /**
2812 * rt_lock_table - lock a routing table
2813 * @r: routing table to be locked
2814 *
2815 * Lock a routing table, because it's in use by a protocol,
2816 * preventing it from being freed when it gets undefined in a new
2817 * configuration.
2818 */
2819 void
2820 rt_lock_table(rtable *r)
2821 {
2822 r->use_count++;
2823 }
2824
2825 /**
2826 * rt_unlock_table - unlock a routing table
2827 * @r: routing table to be unlocked
2828 *
2829 * Unlock a routing table formerly locked by rt_lock_table();
2830 * that is, decrease its use count and delete it if it's scheduled
2831 * for deletion by configuration changes.
2832 */
2833 void
2834 rt_unlock_table(rtable *r)
2835 {
2836 if (!--r->use_count && r->deleted)
2837 {
2838 struct config *conf = r->deleted;
2839
2840 /* Delete the routing table by freeing its pool */
2841 rt_shutdown(r);
2842 config_del_obstacle(conf);
2843 }
2844 }
2845
2846 static int
2847 rt_reconfigure(rtable *tab, struct rtable_config *new, struct rtable_config *old)
2848 {
2849 if ((new->addr_type != old->addr_type) ||
2850 (new->sorted != old->sorted) ||
2851 (new->trie_used != old->trie_used))
2852 return 0;
2853
2854 DBG("\t%s: same\n", new->name);
2855 new->table = tab;
2856 tab->name = new->name;
2857 tab->config = new;
2858 tab->debug = new->debug;
2859
2860 return 1;
2861 }
2862
2863 static struct rtable_config *
2864 rt_find_table_config(struct config *cf, char *name)
2865 {
2866 struct symbol *sym = cf_find_symbol(cf, name);
2867 return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL;
2868 }
2869
2870 /**
2871 * rt_commit - commit new routing table configuration
2872 * @new: new configuration
2873 * @old: original configuration or %NULL if it's boot time config
2874 *
2875 * Scan differences between @old and @new configuration and modify
2876 * the routing tables according to these changes. If @new defines a
2877 * previously unknown table, create it; if it omits a table existing
2878 * in @old, schedule it for deletion (it gets deleted when all protocols
2879 * disconnect from it by calling rt_unlock_table()); if it exists
2880 * in both configurations, leave it unchanged.
2881 */
2882 void
2883 rt_commit(struct config *new, struct config *old)
2884 {
2885 struct rtable_config *o, *r;
2886
2887 DBG("rt_commit:\n");
2888 if (old)
2889 {
2890 WALK_LIST(o, old->tables)
2891 {
2892 rtable *tab = o->table;
2893 if (tab->deleted)
2894 continue;
2895
2896 r = rt_find_table_config(new, o->name);
2897 if (r && !new->shutdown && rt_reconfigure(tab, r, o))
2898 continue;
2899
2900 DBG("\t%s: deleted\n", o->name);
2901 tab->deleted = old;
2902 config_add_obstacle(old);
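      /* The lock/unlock pair makes rt_unlock_table() free the table right
         away when no protocol holds a reference to it anymore */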
2903 rt_lock_table(tab);
2904 rt_unlock_table(tab);
2905 }
2906 }
2907
2908 WALK_LIST(r, new->tables)
2909 if (!r->table)
2910 {
2911 r->table = rt_setup(rt_table_pool, r);
2912 DBG("\t%s: created\n", r->name);
2913 add_tail(&routing_tables, &r->table->n);
2914 }
2915 DBG("\tdone\n");
2916 }
2917
2918 static inline void
2919 do_feed_channel(struct channel *c, net *n, rte *e)
2920 {
2921 rte_update_lock();
2922 if (c->ra_mode == RA_ACCEPTED)
2923 rt_notify_accepted(c, n, NULL, NULL, c->refeeding);
2924 else if (c->ra_mode == RA_MERGED)
2925 rt_notify_merged(c, n, NULL, NULL, e, e, c->refeeding);
2926 else /* RA_BASIC */
2927 rt_notify_basic(c, n, e, e, c->refeeding);
2928 rte_update_unlock();
2929 }
2930
2931 /**
2932 * rt_feed_channel - advertise all routes to a channel
2933 * @c: channel to be fed
2934 *
2935 * This function performs one pass of advertisement of routes to a channel that
2936 * is in the ES_FEEDING state. It is called by the protocol code as long as it
2937 * has something to do. (We avoid transferring all the routes in a single pass in
2938 * order not to monopolize CPU time.)
2939 */
2940 int
2941 rt_feed_channel(struct channel *c)
2942 {
2943 struct fib_iterator *fit = &c->feed_fit;
2944 int max_feed = 256;
2945
2946 ASSERT(c->export_state == ES_FEEDING);
2947
2948 if (!c->feed_active)
2949 {
2950 FIB_ITERATE_INIT(fit, &c->table->fib);
2951 c->feed_active = 1;
2952 }
2953
2954 FIB_ITERATE_START(&c->table->fib, fit, net, n)
2955 {
2956 rte *e = n->routes;
2957 if (max_feed <= 0)
2958 {
2959 FIB_ITERATE_PUT(fit);
2960 return 0;
2961 }
2962
2963 if ((c->ra_mode == RA_OPTIMAL) ||
2964 (c->ra_mode == RA_ACCEPTED) ||
2965 (c->ra_mode == RA_MERGED))
2966 if (rte_is_valid(e))
2967 {
2968 /* In the meantime, the protocol may have gone down */
2969 if (c->export_state != ES_FEEDING)
2970 goto done;
2971
2972 do_feed_channel(c, n, e);
2973 max_feed--;
2974 }
2975
2976 if (c->ra_mode == RA_ANY)
2977 for(e = n->routes; e; e = e->next)
2978 {
2979 /* In the meantime, the protocol may have gone down */
2980 if (c->export_state != ES_FEEDING)
2981 goto done;
2982
2983 if (!rte_is_valid(e))
2984 continue;
2985
2986 do_feed_channel(c, n, e);
2987 max_feed--;
2988 }
2989 }
2990 FIB_ITERATE_END;
2991
2992 done:
2993 c->feed_active = 0;
2994 return 1;
2995 }
2996
2997 /**
2998 * rt_feed_channel_abort - abort protocol feeding
2999 * @c: channel
3000 *
3001 * This function is called by the protocol code when the protocol stops or
3002 * ceases to exist during the feeding.
3003 */
3004 void
3005 rt_feed_channel_abort(struct channel *c)
3006 {
3007 if (c->feed_active)
3008 {
3009 /* Unlink the iterator */
3010 fit_get(&c->table->fib, &c->feed_fit);
3011 c->feed_active = 0;
3012 }
3013 }
3014
3015
3016 /*
3017 * Import table
3018 */
3019
3020 int
3021 rte_update_in(struct channel *c, const net_addr *n, rte *new, struct rte_src *src)
3022 {
3023 struct rtable *tab = c->in_table;
3024 rte *old, **pos;
3025 net *net;
3026
3027 if (new)
3028 {
3029 net = net_get(tab, n);
3030
3031 if (!rta_is_cached(new->attrs))
3032 new->attrs = rta_lookup(new->attrs);
3033 }
3034 else
3035 {
3036 net = net_find(tab, n);
3037
3038 if (!net)
3039 goto drop_withdraw;
3040 }
3041
3042 /* Find the old rte */
3043 for (pos = &net->routes; old = *pos; pos = &old->next)
3044 if (old->src == src)
3045 {
3046 if (new && rte_same(old, new))
3047 {
3048 /* Refresh the old rte, continue with update to main rtable */
3049 if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
3050 {
3051 old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
3052
3053 return 1;
3054 }
3055
3056 goto drop_update;
3057 }
3058
3059 /* Move iterator if needed */
3060 if (old == c->reload_next_rte)
3061 c->reload_next_rte = old->next;
3062
3063 /* Remove the old rte */
3064 *pos = old->next;
3065 tab->rt_count--;
3066 break;
3067 }
3068
3069 if (!old && !new)
3070 goto drop_withdraw;
3071
3072 struct channel_limit *l = &c->rx_limit;
3073 if (l->action && !old && new)
3074 {
3075 if (tab->rt_count >= l->limit)
3076 channel_notify_limit(c, l, PLD_RX, tab->rt_count);
3077
3078 if (l->state == PLS_BLOCKED)
3079 {
3080 /* Required by rte_trace_in() */
3081 new->net = net;
3082
3083 rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
3084 goto drop_update;
3085 }
3086 }
3087
3088 if (new)
3089 {
3090 /* Insert the new rte */
3091 rte *e = rte_do_cow(new);
3092 e->flags |= REF_COW;
3093 e->net = net;
3094 e->sender = c;
3095 e->lastmod = current_time();
3096 e->next = *pos;
3097 *pos = new = e;
3098 tab->rt_count++;
3099
3100 if (!old)
3101 {
3102 new->id = hmap_first_zero(&tab->id_map);
3103 hmap_set(&tab->id_map, new->id);
3104 }
3105 else
3106 new->id = old->id;
3107 }
3108
3109 rte_announce(tab, RA_ANY, net, new, old, NULL, NULL);
3110
3111 if (old)
3112 {
3113 if (!new)
3114 hmap_clear(&tab->id_map, old->id);
3115
3116 rte_free_quick(old);
3117 }
3118
3119 if (!net->routes)
3120 fib_delete(&tab->fib, net);
3121
3122 return 1;
3123
3124 drop_update:
3125 c->stats.imp_updates_received++;
3126 c->stats.imp_updates_ignored++;
3127 rte_free(new);
3128
3129 if (!net->routes)
3130 fib_delete(&tab->fib, net);
3131
3132 return 0;
3133
3134 drop_withdraw:
3135 c->stats.imp_withdraws_received++;
3136 c->stats.imp_withdraws_ignored++;
3137 return 0;
3138 }
3139
3140 int
3141 rt_reload_channel(struct channel *c)
3142 {
3143 struct rtable *tab = c->in_table;
3144 struct fib_iterator *fit = &c->reload_fit;
3145 int max_feed = 64;
3146
3147 ASSERT(c->channel_state == CS_UP);
3148
3149 if (!c->reload_active)
3150 {
3151 FIB_ITERATE_INIT(fit, &tab->fib);
3152 c->reload_active = 1;
3153 }
3154
3155 do {
3156 for (rte *e = c->reload_next_rte; e; e = e->next)
3157 {
3158 if (max_feed-- <= 0)
3159 {
3160 c->reload_next_rte = e;
3161 debug("%s channel reload burst split (max_feed=%d)", c->proto->name, max_feed);
3162 return 0;
3163 }
3164
3165 rte_update2(c, e->net->n.addr, rte_do_cow(e), e->src);
3166 }
3167
3168 c->reload_next_rte = NULL;
3169
3170 FIB_ITERATE_START(&tab->fib, fit, net, n)
3171 {
3172 if (c->reload_next_rte = n->routes)
3173 {
3174 FIB_ITERATE_PUT_NEXT(fit, &tab->fib);
3175 break;
3176 }
3177 }
3178 FIB_ITERATE_END;
3179 }
3180 while (c->reload_next_rte);
3181
3182 c->reload_active = 0;
3183 return 1;
3184 }
3185
3186 void
3187 rt_reload_channel_abort(struct channel *c)
3188 {
3189 if (c->reload_active)
3190 {
3191 /* Unlink the iterator */
3192 fit_get(&c->in_table->fib, &c->reload_fit);
3193 c->reload_next_rte = NULL;
3194 c->reload_active = 0;
3195 }
3196 }
3197
3198 void
3199 rt_prune_sync(rtable *t, int all)
3200 {
3201 struct fib_iterator fit;
3202
3203 FIB_ITERATE_INIT(&fit, &t->fib);
3204
3205 again:
3206 FIB_ITERATE_START(&t->fib, &fit, net, n)
3207 {
3208 rte *e, **ee = &n->routes;
3209
3210 while (e = *ee)
3211 {
3212 if (all || (e->flags & (REF_STALE | REF_DISCARD)))
3213 {
3214 *ee = e->next;
3215 rte_free_quick(e);
3216 t->rt_count--;
3217 }
3218 else
3219 ee = &e->next;
3220 }
3221
3222 if (all || !n->routes)
3223 {
3224 FIB_ITERATE_PUT(&fit);
3225 fib_delete(&t->fib, n);
3226 goto again;
3227 }
3228 }
3229 FIB_ITERATE_END;
3230 }
3231
3232
3233 /*
3234 * Export table
3235 */
3236
3237 int
3238 rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, int refeed)
3239 {
3240 struct rtable *tab = c->out_table;
3241 struct rte_src *src;
3242 rte *old, **pos;
3243 net *net;
3244
3245 if (new)
3246 {
3247 net = net_get(tab, n);
3248 src = new->src;
3249
3250 if (!rta_is_cached(new->attrs))
3251 new->attrs = rta_lookup(new->attrs);
3252 }
3253 else
3254 {
3255 net = net_find(tab, n);
3256 src = old0->src;
3257
3258 if (!net)
3259 goto drop_withdraw;
3260 }
3261
3262 /* Find the old rte */
3263 for (pos = &net->routes; old = *pos; pos = &old->next)
3264 if ((c->ra_mode != RA_ANY) || (old->src == src))
3265 {
3266 if (new && rte_same(old, new))
3267 {
3268 /* REF_STALE / REF_DISCARD not used in export table */
3269 /*
3270 if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
3271 {
3272 old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
3273 return 1;
3274 }
3275 */
3276
3277 goto drop_update;
3278 }
3279
3280 /* Remove the old rte */
3281 *pos = old->next;
3282 rte_free_quick(old);
3283 tab->rt_count--;
3284
3285 break;
3286 }
3287
3288 if (!new)
3289 {
3290 if (!old)
3291 goto drop_withdraw;
3292
3293 if (!net->routes)
3294 fib_delete(&tab->fib, net);
3295
3296 return 1;
3297 }
3298
3299 /* Insert the new rte */
3300 rte *e = rte_do_cow(new);
3301 e->flags |= REF_COW;
3302 e->net = net;
3303 e->sender = c;
3304 e->lastmod = current_time();
3305 e->next = *pos;
3306 *pos = e;
3307 tab->rt_count++;
3308 return 1;
3309
3310 drop_update:
3311 return refeed;
3312
3313 drop_withdraw:
3314 return 0;
3315 }
3316
3317
3318 /*
3319 * Hostcache
3320 */
3321
3322 static inline u32
3323 hc_hash(ip_addr a, rtable *dep)
3324 {
3325 return ipa_hash(a) ^ ptr_hash(dep);
3326 }
3327
3328 static inline void
3329 hc_insert(struct hostcache *hc, struct hostentry *he)
3330 {
3331 uint k = he->hash_key >> hc->hash_shift;
3332 he->next = hc->hash_table[k];
3333 hc->hash_table[k] = he;
3334 }
3335
3336 static inline void
3337 hc_remove(struct hostcache *hc, struct hostentry *he)
3338 {
3339 struct hostentry **hep;
3340 uint k = he->hash_key >> hc->hash_shift;
3341
3342 for (hep = &hc->hash_table[k]; *hep != he; hep = &(*hep)->next);
3343 *hep = he->next;
3344 }
3345
3346 #define HC_DEF_ORDER 10
3347 #define HC_HI_MARK *4
3348 #define HC_HI_STEP 2
3349 #define HC_HI_ORDER 16 /* Must be at most 16 */
3350 #define HC_LO_MARK /5
3351 #define HC_LO_STEP 2
3352 #define HC_LO_ORDER 10
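/* Editorial note: the MARK macros are spliced into expressions, so
 * (hsize HC_HI_MARK) expands to (hsize *4) and (hsize HC_LO_MARK) to
 * (hsize /5): the table grows by HC_HI_STEP orders when items exceed four
 * times the bucket count and shrinks when they drop below one fifth. */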
3353
3354 static void
3355 hc_alloc_table(struct hostcache *hc, pool *p, unsigned order)
3356 {
3357 uint hsize = 1 << order;
3358 hc->hash_order = order;
3359 hc->hash_shift = 32 - order;
3360 hc->hash_max = (order >= HC_HI_ORDER) ? ~0U : (hsize HC_HI_MARK);
3361 hc->hash_min = (order <= HC_LO_ORDER) ? 0U : (hsize HC_LO_MARK);
3362
3363 hc->hash_table = mb_allocz(p, hsize * sizeof(struct hostentry *));
3364 }
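/* Editorial note: buckets are indexed by the top 'order' bits of the 32-bit
 * key (hash_shift = 32 - order), which is why hc_insert() and hc_remove()
 * shift the hash key instead of masking it. */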
3365
3366 static void
3367 hc_resize(struct hostcache *hc, pool *p, unsigned new_order)
3368 {
3369 struct hostentry **old_table = hc->hash_table;
3370 struct hostentry *he, *hen;
3371 uint old_size = 1 << hc->hash_order;
3372 uint i;
3373
3374 hc_alloc_table(hc, p, new_order);
3375 for (i = 0; i < old_size; i++)
3376 for (he = old_table[i]; he != NULL; he=hen)
3377 {
3378 hen = he->next;
3379 hc_insert(hc, he);
3380 }
3381 mb_free(old_table);
3382 }
3383
3384 static struct hostentry *
3385 hc_new_hostentry(struct hostcache *hc, pool *p, ip_addr a, ip_addr ll, rtable *dep, unsigned k)
3386 {
3387 struct hostentry *he = sl_alloc(hc->slab);
3388
3389 *he = (struct hostentry) {
3390 .addr = a,
3391 .link = ll,
3392 .tab = dep,
3393 .hash_key = k,
3394 };
3395
3396 add_tail(&hc->hostentries, &he->ln);
3397 hc_insert(hc, he);
3398
3399 hc->hash_items++;
3400 if (hc->hash_items > hc->hash_max)
3401 hc_resize(hc, p, hc->hash_order + HC_HI_STEP);
3402
3403 return he;
3404 }
3405
3406 static void
3407 hc_delete_hostentry(struct hostcache *hc, pool *p, struct hostentry *he)
3408 {
3409 rta_free(he->src);
3410
3411 rem_node(&he->ln);
3412 hc_remove(hc, he);
3413 sl_free(he);
3414
3415 hc->hash_items--;
3416 if (hc->hash_items < hc->hash_min)
3417 hc_resize(hc, p, hc->hash_order - HC_LO_STEP);
3418 }
3419
3420 static void
3421 rt_init_hostcache(rtable *tab)
3422 {
3423 struct hostcache *hc = mb_allocz(tab->rp, sizeof(struct hostcache));
3424 init_list(&hc->hostentries);
3425
3426 hc->hash_items = 0;
3427 hc_alloc_table(hc, tab->rp, HC_DEF_ORDER);
3428 hc->slab = sl_new(tab->rp, sizeof(struct hostentry));
3429
3430 hc->lp = lp_new(tab->rp);
3431 hc->trie = f_new_trie(hc->lp, 0);
3432
3433 tab->hostcache = hc;
3434 }
3435
3436 static void
3437 rt_free_hostcache(rtable *tab)
3438 {
3439 struct hostcache *hc = tab->hostcache;
3440
3441 node *n;
3442 WALK_LIST(n, hc->hostentries)
3443 {
3444 struct hostentry *he = SKIP_BACK(struct hostentry, ln, n);
3445 rta_free(he->src);
3446
3447 if (he->uc)
3448 log(L_ERR "Hostcache is not empty in table %s", tab->name);
3449 }
3450
3451 /* Freed automagically by the resource pool
3452 rfree(hc->slab);
3453 rfree(hc->lp);
3454 mb_free(hc->hash_table);
3455 mb_free(hc);
3456 */
3457 }
3458
3459 static void
3460 rt_notify_hostcache(rtable *tab, net *net)
3461 {
3462 if (tab->hcu_scheduled)
3463 return;
3464
3465 if (trie_match_net(tab->hostcache->trie, net->n.addr))
3466 rt_schedule_hcu(tab);
3467 }
3468
3469 static int
3470 if_local_addr(ip_addr a, struct iface *i)
3471 {
3472 struct ifa *b;
3473
3474 WALK_LIST(b, i->addrs)
3475 if (ipa_equal(a, b->ip))
3476 return 1;
3477
3478 return 0;
3479 }
3480
3481 u32
3482 rt_get_igp_metric(rte *rt)
3483 {
3484 eattr *ea = ea_find(rt->attrs->eattrs, EA_GEN_IGP_METRIC);
3485
3486 if (ea)
3487 return ea->u.data;
3488
3489 if (rt->attrs->source == RTS_DEVICE)
3490 return 0;
3491
3492 if (rt->src->proto->rte_igp_metric)
3493 return rt->src->proto->rte_igp_metric(rt);
3494
3495 return IGP_METRIC_UNKNOWN;
3496 }
3497
3498 static int
3499 rt_update_hostentry(rtable *tab, struct hostentry *he)
3500 {
3501 rta *old_src = he->src;
3502 int direct = 0;
3503 int pxlen = 0;
3504
3505 /* Reset the hostentry */
3506 he->src = NULL;
3507 he->dest = RTD_UNREACHABLE;
3508 he->nexthop_linkable = 0;
3509 he->igp_metric = 0;
3510
3511 net_addr he_addr;
3512 net_fill_ip_host(&he_addr, he->addr);
3513 net *n = net_route(tab, &he_addr);
3514 if (n)
3515 {
3516 rte *e = n->routes;
3517 rta *a = e->attrs;
3518 word pref = a->pref;
3519
3520 for (rte *ee = n->routes; ee; ee = ee->next)
3521 if ((ee->attrs->pref >= pref) && ee->attrs->hostentry)
3522 {
3523 /* Recursive route should not depend on another recursive route */
3524 log(L_WARN "Next hop address %I resolvable through recursive route for %N",
3525 he->addr, n->n.addr);
3526 goto done;
3527 }
3528
3529 pxlen = n->n.addr->pxlen;
3530
3531 if (a->dest == RTD_UNICAST)
3532 {
3533 for (struct nexthop *nh = &(a->nh); nh; nh = nh->next)
3534 if (ipa_zero(nh->gw))
3535 {
3536 if (if_local_addr(he->addr, nh->iface))
3537 {
3538 /* The host address is a local address, this is not valid */
3539 log(L_WARN "Next hop address %I is a local address of iface %s",
3540 he->addr, nh->iface->name);
3541 goto done;
3542 }
3543
3544 direct++;
3545 }
3546 }
3547
3548 he->src = rta_clone(a);
3549 he->dest = a->dest;
3550 he->nexthop_linkable = !direct;
3551 he->igp_metric = rt_get_igp_metric(e);
3552 }
3553
3554 done:
3555 /* Add a prefix range to the trie */
3556 trie_add_prefix(tab->hostcache->trie, &he_addr, pxlen, he_addr.pxlen);
3557
3558 rta_free(old_src);
3559 return old_src != he->src;
3560 }
3561
3562 static void
3563 rt_update_hostcache(rtable *tab)
3564 {
3565 struct hostcache *hc = tab->hostcache;
3566 struct hostentry *he;
3567 node *n, *x;
3568
3569 /* Reset the trie */
3570 lp_flush(hc->lp);
3571 hc->trie = f_new_trie(hc->lp, 0);
3572
3573 WALK_LIST_DELSAFE(n, x, hc->hostentries)
3574 {
3575 he = SKIP_BACK(struct hostentry, ln, n);
3576 if (!he->uc)
3577 {
3578 hc_delete_hostentry(hc, tab->rp, he);
3579 continue;
3580 }
3581
3582 if (rt_update_hostentry(tab, he))
3583 rt_schedule_nhu(he->tab);
3584 }
3585
3586 tab->hcu_scheduled = 0;
3587 }
3588
3589 struct hostentry *
3590 rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep)
3591 {
3592 ip_addr link = ipa_zero(ll) ? a : ll;
3593 struct hostentry *he;
3594
3595 if (!tab->hostcache)
3596 rt_init_hostcache(tab);
3597
3598 u32 k = hc_hash(a, dep);
3599 struct hostcache *hc = tab->hostcache;
3600 for (he = hc->hash_table[k >> hc->hash_shift]; he != NULL; he = he->next)
3601 if (ipa_equal(he->addr, a) && ipa_equal(he->link, link) && (he->tab == dep))
3602 return he;
3603
3604 he = hc_new_hostentry(hc, tab->rp, a, link, dep, k);
3605 he->owner = tab;
3606 rt_update_hostentry(tab, he);
3607 return he;
3608 }
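/* A hedged sketch of binding a recursive route to the hostcache (the table
 * and address variables are illustrative): */
#if 0
struct hostentry *he = rt_get_hostentry(src_tab, gw_addr, IPA_NONE, dst_tab);
rta_apply_hostentry(a, he, NULL);  /* Resolve a's next hop through he */
#endif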
3609
3610
3611 /*
3612 * Documentation for functions declared inline in route.h
3613 */
3614 #if 0
3615
3616 /**
3617 * net_find - find a network entry
3618 * @tab: a routing table
3619 * @addr: address of the network
3620 *
3621 * net_find() looks up the given network in routing table @tab and
3622 * returns a pointer to its &net entry or %NULL if no such network
3623 * exists.
3624 */
3625 static inline net *net_find(rtable *tab, net_addr *addr)
3626 { DUMMY; }
3627
3628 /**
3629 * net_get - obtain a network entry
3630 * @tab: a routing table
3631 * @addr: address of the network
3632 *
3633 * net_get() looks up the given network in routing table @tab and
3634 * returns a pointer to its &net entry. If no such entry exists, it's
3635 * created.
3636 */
3637 static inline net *net_get(rtable *tab, net_addr *addr)
3638 { DUMMY; }
3639
3640 /**
3641 * rte_cow - copy a route for writing
3642 * @r: a route entry to be copied
3643 *
3644 * rte_cow() takes a &rte and prepares it for modification. The exact action
3645 * taken depends on the flags of the &rte -- if it's a temporary entry, it's
3646 * just returned unchanged, else a new temporary entry with the same contents
3647 * is created.
3648 *
3649 * The primary use of this function is inside the filter machinery -- when
3650 * a filter wants to modify &rte contents (to change the preference or to
3651 * attach another set of attributes), it must ensure that the &rte is not
3652 * shared with anyone else (and especially that it isn't stored in any routing
3653 * table).
3654 *
3655 * Result: a pointer to the new writable &rte.
3656 */
3657 static inline rte * rte_cow(rte *r)
3658 { DUMMY; }
3659
3660 #endif