nest/rt-table.c

   1 /*
   2  *      BIRD -- Routing Tables
   3  *
   4  *      (c) 1998--2000 Martin Mares <mj@ucw.cz>
   5  *
   6  *      Can be freely distributed and used under the terms of the GNU GPL.
   7  */
   8
   9 /**
  10  * DOC: Routing tables
  11  *
  12  * Routing tables are probably the most important structures BIRD uses. They
  13  * hold all the information about known networks, the associated routes and
  14  * their attributes.
  15  *
  16  * There are multiple routing tables (a primary one together with any
  17  * number of secondary ones if requested by the configuration). Each table
  18  * is basically a FIB containing entries describing the individual
  19  * destination networks. For each network (represented by structure &net),
  20  * there is a one-way linked list of route entries (&rte), the first entry
  21  * on the list being the best one (i.e., the one we currently use
  22  * for routing), the order of the other ones is undetermined.
  23  *
  24  * The &rte contains information about the route. There are net and src, which
  25  * together forms a key identifying the route in a routing table. There is a
  26  * pointer to a &rta structure (see the route attribute module for a precise
  27  * explanation) holding the route attributes, which are primary data about the
  28  * route. There are several technical fields used by routing table code (route
  29  * id, REF_* flags), There is also the pflags field, holding protocol-specific
  30  * flags. They are not used by routing table code, but by protocol-specific
  31  * hooks. In contrast to route attributes, they are not primary data and their
  32  * validity is also limited to the routing table.
  33  *
  34  * There are several mechanisms that allow automatic update of routes in one
  35  * routing table (dst) as a result of changes in another routing table (src).
  36  * They handle issues of recursive next hop resolving, flowspec validation and
  37  * RPKI validation.
  38  *
  39  * The first such mechanism is handling of recursive next hops. A route in the
  40  * dst table has an indirect next hop address, which is resolved through a route
  41  * in the src table (which may also be the same table) to get an immediate next
  42  * hop. This is implemented using structure &hostcache attached to the src
  43  * table, which contains &hostentry structures for each tracked next hop
  44  * address. These structures are linked from recursive routes in dst tables,
  45  * possibly multiple routes sharing one hostentry (as many routes may have the
  46  * same indirect next hop). There is also a trie in the hostcache, which matches
  47  * all prefixes that may influence resolving of tracked next hops.
  48  *
  49  * When a best route changes in the src table, the hostcache is notified using
  50  * rt_notify_hostcache(), which immediately checks using the trie whether the
  51  * change is relevant and if it is, then it schedules asynchronous hostcache
  52  * recomputation. The recomputation is done by rt_update_hostcache() (called
  53  * from rt_event() of src table), it walks through all hostentries and resolves
  54  * them (by rt_update_hostentry()). It also updates the trie. If a change in
  55  * hostentry resolution was found, then it schedules asynchronous nexthop
  56  * recomputation of associated dst table. That is done by rt_next_hop_update()
  57  * (called from rt_event() of dst table), it iterates over all routes in the dst
  58  * table and re-examines their hostentries for changes. Note that in contrast to
  59  * hostcache update, next hop update can be interrupted by main loop. These two
  60  * full-table walks (over hostcache and dst table) are necessary due to absence
  61  * of direct lookups (route -> affected nexthop, nexthop -> its route).
  62  *
  63  * The second mechanism is for flowspec validation, where validity of flowspec
  64  * routes depends of resolving their network prefixes in IP routing tables. This
  65  * is similar to the recursive next hop mechanism, but simpler as there are no
  66  * intermediate hostcache and hostentries (because flows are less likely to
  67  * share common net prefix than routes sharing a common next hop). In src table,
  68  * there is a list of dst tables (list flowspec_links), this list is updated by
  69  * flowpsec channels (by rt_flowspec_link() and rt_flowspec_unlink() during
  70  * channel start/stop). Each dst table has its own trie of prefixes that may
  71  * influence validation of flowspec routes in it (flowspec_trie).
  72  *
  73  * When a best route changes in the src table, rt_flowspec_notify() immediately
  74  * checks all dst tables from the list using their tries to see whether the
  75  * change is relevant for them. If it is, then an asynchronous re-validation of
  76  * flowspec routes in the dst table is scheduled. That is also done by function
  77  * rt_next_hop_update(), like nexthop recomputation above. It iterates over all
  78  * flowspec routes and re-validates them. It also recalculates the trie.
  79  *
  80  * Note that in contrast to the hostcache update, here the trie is recalculated
  81  * during the rt_next_hop_update(), which may be interleaved with IP route
  82  * updates. The trie is flushed at the beginning of recalculation, which means
  83  * that such updates may use partial trie to see if they are relevant. But it
  84  * works anyway! Either affected flowspec was already re-validated and added to
  85  * the trie, then IP route change would match the trie and trigger a next round
  86  * of re-validation, or it was not yet re-validated and added to the trie, but
  87  * will be re-validated later in this round anyway.
  88  *
  89  * The third mechanism is used for RPKI re-validation of IP routes and it is the
  90  * simplest. It is just a list of subscribers in src table, who are notified
  91  * when any change happened, but only after a settle time. Also, in RPKI case
  92  * the dst is not a table, but a channel, who refeeds routes through a filter.
  93  */
  94
  95 #undef LOCAL_DEBUG
  96
  97 #include "nest/bird.h"
  98 #include "nest/route.h"
  99 #include "nest/protocol.h"
 100 #include "nest/iface.h"
 101 #include "lib/resource.h"
 102 #include "lib/event.h"
 103 #include "lib/timer.h"
 104 #include "lib/string.h"
 105 #include "conf/conf.h"
 106 #include "filter/filter.h"
 107 #include "filter/data.h"
 108 #include "lib/hash.h"
 109 #include "lib/string.h"
 110 #include "lib/alloca.h"
 111 #include "lib/flowspec.h"
 112
 113 #ifdef CONFIG_BGP
 114 #include "proto/bgp/bgp.h"
 115 #endif
 116
 117 pool *rt_table_pool;
 118
 119 static slab *rte_slab;
 120 static linpool *rte_update_pool;
 121
 122 list routing_tables;
 123
 124 static void rt_free_hostcache(rtable *tab);
 125 static void rt_notify_hostcache(rtable *tab, net *net);
 126 static void rt_update_hostcache(rtable *tab);
 127 static void rt_next_hop_update(rtable *tab);
 128 static inline void rt_prune_table(rtable *tab);
 129 static inline void rt_schedule_notify(rtable *tab);
 130 static void rt_flowspec_notify(rtable *tab, net *net);
 131 static void rt_kick_prune_timer(rtable *tab);
 132
 133
 134 static void
 135 net_init_with_trie(struct fib *f, void *N)
 136 {
 137   rtable *tab = SKIP_BACK(rtable, fib, f);
 138   net *n = N;
 139
 140   if (tab->trie)
 141     trie_add_prefix(tab->trie, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen);
 142
 143   if (tab->trie_new)
 144     trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen);
 145 }
 146
 147 static inline net *
 148 net_route_ip4_trie(rtable *t, const net_addr_ip4 *n0)
 149 {
 150   TRIE_WALK_TO_ROOT_IP4(t->trie, n0, n)
 151   {
 152     net *r;
 153     if (r = net_find_valid(t, (net_addr *) &n))
 154       return r;
 155   }
 156   TRIE_WALK_TO_ROOT_END;
 157
 158   return NULL;
 159 }
 160
 161 static inline net *
 162 net_route_vpn4_trie(rtable *t, const net_addr_vpn4 *n0)
 163 {
 164   TRIE_WALK_TO_ROOT_IP4(t->trie, (const net_addr_ip4 *) n0, px)
 165   {
 166     net_addr_vpn4 n = NET_ADDR_VPN4(px.prefix, px.pxlen, n0->rd);
 167
 168     net *r;
 169     if (r = net_find_valid(t, (net_addr *) &n))
 170       return r;
 171   }
 172   TRIE_WALK_TO_ROOT_END;
 173
 174   return NULL;
 175 }
 176
 177 static inline net *
 178 net_route_ip6_trie(rtable *t, const net_addr_ip6 *n0)
 179 {
 180   TRIE_WALK_TO_ROOT_IP6(t->trie, n0, n)
 181   {
 182     net *r;
 183     if (r = net_find_valid(t, (net_addr *) &n))
 184       return r;
 185   }
 186   TRIE_WALK_TO_ROOT_END;
 187
 188   return NULL;
 189 }
 190
 191 static inline net *
 192 net_route_vpn6_trie(rtable *t, const net_addr_vpn6 *n0)
 193 {
 194   TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px)
 195   {
 196     net_addr_vpn6 n = NET_ADDR_VPN6(px.prefix, px.pxlen, n0->rd);
 197
 198     net *r;
 199     if (r = net_find_valid(t, (net_addr *) &n))
 200       return r;
 201   }
 202   TRIE_WALK_TO_ROOT_END;
 203
 204   return NULL;
 205 }
 206
 207 static inline void *
 208 net_route_ip6_sadr_trie(rtable *t, const net_addr_ip6_sadr *n0)
 209 {
 210   TRIE_WALK_TO_ROOT_IP6(t->trie, (const net_addr_ip6 *) n0, px)
 211   {
 212     net_addr_ip6_sadr n = NET_ADDR_IP6_SADR(px.prefix, px.pxlen, n0->src_prefix, n0->src_pxlen);
 213     net *best = NULL;
 214     int best_pxlen = 0;
 215
 216     /* We need to do dst first matching. Since sadr addresses are hashed on dst
 217        prefix only, find the hash table chain and go through it to find the
 218        match with the longest matching src prefix. */
 219     for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next)
 220     {
 221       net_addr_ip6_sadr *a = (void *) fn->addr;
 222
 223       if (net_equal_dst_ip6_sadr(&n, a) &&
 224           net_in_net_src_ip6_sadr(&n, a) &&
 225           (a->src_pxlen >= best_pxlen))
 226       {
 227         best = fib_node_to_user(&t->fib, fn);
 228         best_pxlen = a->src_pxlen;
 229       }
 230     }
 231
 232     if (best)
 233       return best;
 234   }
 235   TRIE_WALK_TO_ROOT_END;
 236
 237   return NULL;
 238 }
 239
 240 static inline net *
 241 net_route_ip4_fib(rtable *t, const net_addr_ip4 *n0)
 242 {
 243   net_addr_ip4 n;
 244   net_copy_ip4(&n, n0);
 245
 246   net *r;
 247   while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
 248   {
 249     n.pxlen--;
 250     ip4_clrbit(&n.prefix, n.pxlen);
 251   }
 252
 253   return r;
 254 }
 255
 256 static inline net *
 257 net_route_vpn4_fib(rtable *t, const net_addr_vpn4 *n0)
 258 {
 259   net_addr_vpn4 n;
 260   net_copy_vpn4(&n, n0);
 261
 262   net *r;
 263   while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
 264   {
 265     n.pxlen--;
 266     ip4_clrbit(&n.prefix, n.pxlen);
 267   }
 268
 269   return r;
 270 }
 271
 272 static inline net *
 273 net_route_ip6_fib(rtable *t, const net_addr_ip6 *n0)
 274 {
 275   net_addr_ip6 n;
 276   net_copy_ip6(&n, n0);
 277
 278   net *r;
 279   while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
 280   {
 281     n.pxlen--;
 282     ip6_clrbit(&n.prefix, n.pxlen);
 283   }
 284
 285   return r;
 286 }
 287
 288 static inline net *
 289 net_route_vpn6_fib(rtable *t, const net_addr_vpn6 *n0)
 290 {
 291   net_addr_vpn6 n;
 292   net_copy_vpn6(&n, n0);
 293
 294   net *r;
 295   while (r = net_find_valid(t, (net_addr *) &n), (!r) && (n.pxlen > 0))
 296   {
 297     n.pxlen--;
 298     ip6_clrbit(&n.prefix, n.pxlen);
 299   }
 300
 301   return r;
 302 }
 303
 304 static inline void *
 305 net_route_ip6_sadr_fib(rtable *t, const net_addr_ip6_sadr *n0)
 306 {
 307   net_addr_ip6_sadr n;
 308   net_copy_ip6_sadr(&n, n0);
 309
 310   while (1)
 311   {
 312     net *best = NULL;
 313     int best_pxlen = 0;
 314
 315     /* We need to do dst first matching. Since sadr addresses are hashed on dst
 316        prefix only, find the hash table chain and go through it to find the
 317        match with the longest matching src prefix. */
 318     for (struct fib_node *fn = fib_get_chain(&t->fib, (net_addr *) &n); fn; fn = fn->next)
 319     {
 320       net_addr_ip6_sadr *a = (void *) fn->addr;
 321
 322       if (net_equal_dst_ip6_sadr(&n, a) &&
 323           net_in_net_src_ip6_sadr(&n, a) &&
 324           (a->src_pxlen >= best_pxlen))
 325       {
 326         best = fib_node_to_user(&t->fib, fn);
 327         best_pxlen = a->src_pxlen;
 328       }
 329     }
 330
 331     if (best)
 332       return best;
 333
 334     if (!n.dst_pxlen)
 335       break;
 336
 337     n.dst_pxlen--;
 338     ip6_clrbit(&n.dst_prefix, n.dst_pxlen);
 339   }
 340
 341   return NULL;
 342 }
 343
 344 net *
 345 net_route(rtable *tab, const net_addr *n)
 346 {
 347   ASSERT(tab->addr_type == n->type);
 348
 349   switch (n->type)
 350   {
 351   case NET_IP4:
 352     if (tab->trie)
 353       return net_route_ip4_trie(tab, (net_addr_ip4 *) n);
 354     else
 355       return net_route_ip4_fib (tab, (net_addr_ip4 *) n);
 356
 357   case NET_VPN4:
 358     if (tab->trie)
 359       return net_route_vpn4_trie(tab, (net_addr_vpn4 *) n);
 360     else
 361       return net_route_vpn4_fib (tab, (net_addr_vpn4 *) n);
 362
 363   case NET_IP6:
 364     if (tab->trie)
 365       return net_route_ip6_trie(tab, (net_addr_ip6 *) n);
 366     else
 367       return net_route_ip6_fib (tab, (net_addr_ip6 *) n);
 368
 369   case NET_VPN6:
 370     if (tab->trie)
 371       return net_route_vpn6_trie(tab, (net_addr_vpn6 *) n);
 372     else
 373       return net_route_vpn6_fib (tab, (net_addr_vpn6 *) n);
 374
 375   case NET_IP6_SADR:
 376     if (tab->trie)
 377       return net_route_ip6_sadr_trie(tab, (net_addr_ip6_sadr *) n);
 378     else
 379       return net_route_ip6_sadr_fib (tab, (net_addr_ip6_sadr *) n);
 380
 381   default:
 382     return NULL;
 383   }
 384 }
 385
 386
 387 static int
 388 net_roa_check_ip4_trie(rtable *tab, const net_addr_ip4 *px, u32 asn)
 389 {
 390   int anything = 0;
 391
 392   TRIE_WALK_TO_ROOT_IP4(tab->trie, px, px0)
 393   {
 394     net_addr_roa4 roa0 = NET_ADDR_ROA4(px0.prefix, px0.pxlen, 0, 0);
 395
 396     struct fib_node *fn;
 397     for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next)
 398     {
 399       net_addr_roa4 *roa = (void *) fn->addr;
 400       net *r = fib_node_to_user(&tab->fib, fn);
 401
 402       if (net_equal_prefix_roa4(roa, &roa0) && rte_is_valid(r->routes))
 403       {
 404         anything = 1;
 405         if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
 406           return ROA_VALID;
 407       }
 408     }
 409   }
 410   TRIE_WALK_TO_ROOT_END;
 411
 412   return anything ? ROA_INVALID : ROA_UNKNOWN;
 413 }
 414
 415 static int
 416 net_roa_check_ip4_fib(rtable *tab, const net_addr_ip4 *px, u32 asn)
 417 {
 418   struct net_addr_roa4 n = NET_ADDR_ROA4(px->prefix, px->pxlen, 0, 0);
 419   struct fib_node *fn;
 420   int anything = 0;
 421
 422   while (1)
 423   {
 424     for (fn = fib_get_chain(&tab->fib, (net_addr *) &n); fn; fn = fn->next)
 425     {
 426       net_addr_roa4 *roa = (void *) fn->addr;
 427       net *r = fib_node_to_user(&tab->fib, fn);
 428
 429       if (net_equal_prefix_roa4(roa, &n) && rte_is_valid(r->routes))
 430       {
 431         anything = 1;
 432         if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
 433           return ROA_VALID;
 434       }
 435     }
 436
 437     if (n.pxlen == 0)
 438       break;
 439
 440     n.pxlen--;
 441     ip4_clrbit(&n.prefix, n.pxlen);
 442   }
 443
 444   return anything ? ROA_INVALID : ROA_UNKNOWN;
 445 }
 446
 447 static int
 448 net_roa_check_ip6_trie(rtable *tab, const net_addr_ip6 *px, u32 asn)
 449 {
 450   int anything = 0;
 451
 452   TRIE_WALK_TO_ROOT_IP6(tab->trie, px, px0)
 453   {
 454     net_addr_roa6 roa0 = NET_ADDR_ROA6(px0.prefix, px0.pxlen, 0, 0);
 455
 456     struct fib_node *fn;
 457     for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next)
 458     {
 459       net_addr_roa6 *roa = (void *) fn->addr;
 460       net *r = fib_node_to_user(&tab->fib, fn);
 461
 462       if (net_equal_prefix_roa6(roa, &roa0) && rte_is_valid(r->routes))
 463       {
 464         anything = 1;
 465         if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
 466           return ROA_VALID;
 467       }
 468     }
 469   }
 470   TRIE_WALK_TO_ROOT_END;
 471
 472   return anything ? ROA_INVALID : ROA_UNKNOWN;
 473 }
 474
 475 static int
 476 net_roa_check_ip6_fib(rtable *tab, const net_addr_ip6 *px, u32 asn)
 477 {
 478   struct net_addr_roa6 n = NET_ADDR_ROA6(px->prefix, px->pxlen, 0, 0);
 479   struct fib_node *fn;
 480   int anything = 0;
 481
 482   while (1)
 483   {
 484     for (fn = fib_get_chain(&tab->fib, (net_addr *) &n); fn; fn = fn->next)
 485     {
 486       net_addr_roa6 *roa = (void *) fn->addr;
 487       net *r = fib_node_to_user(&tab->fib, fn);
 488
 489       if (net_equal_prefix_roa6(roa, &n) && rte_is_valid(r->routes))
 490       {
 491         anything = 1;
 492         if (asn && (roa->asn == asn) && (roa->max_pxlen >= px->pxlen))
 493           return ROA_VALID;
 494       }
 495     }
 496
 497     if (n.pxlen == 0)
 498       break;
 499
 500     n.pxlen--;
 501     ip6_clrbit(&n.prefix, n.pxlen);
 502   }
 503
 504   return anything ? ROA_INVALID : ROA_UNKNOWN;
 505 }
 506
 507 /**
 508  * roa_check - check validity of route origination in a ROA table
 509  * @tab: ROA table
 510  * @n: network prefix to check
 511  * @asn: AS number of network prefix
 512  *
 513  * Implements RFC 6483 route validation for the given network prefix. The
 514  * procedure is to find all candidate ROAs - ROAs whose prefixes cover the given
 515  * network prefix. If there is no candidate ROA, return ROA_UNKNOWN. If there is
 516  * a candidate ROA with matching ASN and maxlen field greater than or equal to
 517  * the given prefix length, return ROA_VALID. Otherwise, return ROA_INVALID. If
 518  * caller cannot determine origin AS, 0 could be used (in that case ROA_VALID
 519  * cannot happen). Table @tab must have type NET_ROA4 or NET_ROA6, network @n
 520  * must have type NET_IP4 or NET_IP6, respectively.
 521  */
 522 int
 523 net_roa_check(rtable *tab, const net_addr *n, u32 asn)
 524 {
 525   if ((tab->addr_type == NET_ROA4) && (n->type == NET_IP4))
 526   {
 527     if (tab->trie)
 528       return net_roa_check_ip4_trie(tab, (const net_addr_ip4 *) n, asn);
 529     else
 530       return net_roa_check_ip4_fib (tab, (const net_addr_ip4 *) n, asn);
 531   }
 532   else if ((tab->addr_type == NET_ROA6) && (n->type == NET_IP6))
 533   {
 534     if (tab->trie)
 535       return net_roa_check_ip6_trie(tab, (const net_addr_ip6 *) n, asn);
 536     else
 537       return net_roa_check_ip6_fib (tab, (const net_addr_ip6 *) n, asn);
 538   }
 539   else
 540     return ROA_UNKNOWN; /* Should not happen */
 541 }
 542
 543 /**
 544  * rte_find - find a route
 545  * @net: network node
 546  * @src: route source
 547  *
 548  * The rte_find() function returns a route for destination @net
 549  * which is from route source @src.
 550  */
 551 rte *
 552 rte_find(net *net, struct rte_src *src)
 553 {
 554   rte *e = net->routes;
 555
 556   while (e && e->src != src)
 557     e = e->next;
 558   return e;
 559 }
 560
 561 /**
 562  * rte_get_temp - get a temporary &rte
 563  * @a: attributes to assign to the new route (a &rta; in case it's
 564  * un-cached, rte_update() will create a cached copy automatically)
 565  * @src: route source
 566  *
 567  * Create a temporary &rte and bind it with the attributes @a.
 568  */
 569 rte *
 570 rte_get_temp(rta *a, struct rte_src *src)
 571 {
 572   rte *e = sl_alloc(rte_slab);
 573
 574   e->attrs = a;
 575   e->id = 0;
 576   e->flags = 0;
 577   e->pflags = 0;
 578   rt_lock_source(e->src = src);
 579   return e;
 580 }
 581
 582 rte *
 583 rte_do_cow(rte *r)
 584 {
 585   rte *e = sl_alloc(rte_slab);
 586
 587   memcpy(e, r, sizeof(rte));
 588
 589   rt_lock_source(e->src);
 590   e->attrs = rta_clone(r->attrs);
 591   e->flags = 0;
 592   return e;
 593 }
 594
 595 /**
 596  * rte_cow_rta - get a private writable copy of &rte with writable &rta
 597  * @r: a route entry to be copied
 598  * @lp: a linpool from which to allocate &rta
 599  *
 600  * rte_cow_rta() takes a &rte and prepares it and associated &rta for
 601  * modification. There are three possibilities: First, both &rte and &rta are
 602  * private copies, in that case they are returned unchanged.  Second, &rte is
 603  * private copy, but &rta is cached, in that case &rta is duplicated using
 604  * rta_do_cow(). Third, both &rte is shared and &rta is cached, in that case
 605  * both structures are duplicated by rte_do_cow() and rta_do_cow().
 606  *
 607  * Note that in the second case, cached &rta loses one reference, while private
 608  * copy created by rta_do_cow() is a shallow copy sharing indirect data (eattrs,
 609  * nexthops, ...) with it. To work properly, original shared &rta should have
 610  * another reference during the life of created private copy.
 611  *
 612  * Result: a pointer to the new writable &rte with writable &rta.
 613  */
 614 rte *
 615 rte_cow_rta(rte *r, linpool *lp)
 616 {
 617   if (!rta_is_cached(r->attrs))
 618     return r;
 619
 620   r = rte_cow(r);
 621   rta *a = rta_do_cow(r->attrs, lp);
 622   rta_free(r->attrs);
 623   r->attrs = a;
 624   return r;
 625 }
 626
 627 static int                              /* Actually better or at least as good as */
 628 rte_better(rte *new, rte *old)
 629 {
 630   int (*better)(rte *, rte *);
 631
 632   if (!rte_is_valid(old))
 633     return 1;
 634   if (!rte_is_valid(new))
 635     return 0;
 636
 637   if (new->attrs->pref > old->attrs->pref)
 638     return 1;
 639   if (new->attrs->pref < old->attrs->pref)
 640     return 0;
 641   if (new->src->proto->proto != old->src->proto->proto)
 642     {
 643       /*
 644        *  If the user has configured protocol preferences, so that two different protocols
 645        *  have the same preference, try to break the tie by comparing addresses. Not too
 646        *  useful, but keeps the ordering of routes unambiguous.
 647        */
 648       return new->src->proto->proto > old->src->proto->proto;
 649     }
 650   if (better = new->src->proto->rte_better)
 651     return better(new, old);
 652   return 0;
 653 }
 654
 655 static int
 656 rte_mergable(rte *pri, rte *sec)
 657 {
 658   int (*mergable)(rte *, rte *);
 659
 660   if (!rte_is_valid(pri) || !rte_is_valid(sec))
 661     return 0;
 662
 663   if (pri->attrs->pref != sec->attrs->pref)
 664     return 0;
 665
 666   if (pri->src->proto->proto != sec->src->proto->proto)
 667     return 0;
 668
 669   if (mergable = pri->src->proto->rte_mergable)
 670     return mergable(pri, sec);
 671
 672   return 0;
 673 }
 674
 675 static void
 676 rte_trace(struct channel *c, rte *e, int dir, char *msg)
 677 {
 678   log(L_TRACE "%s.%s %c %s %N %uL %uG %s",
 679       c->proto->name, c->name ?: "?", dir, msg, e->net->n.addr, e->src->private_id, e->src->global_id,
 680       rta_dest_name(e->attrs->dest));
 681 }
 682
 683 static inline void
 684 rte_trace_in(uint flag, struct channel *c, rte *e, char *msg)
 685 {
 686   if ((c->debug & flag) || (c->proto->debug & flag))
 687     rte_trace(c, e, '>', msg);
 688 }
 689
 690 static inline void
 691 rte_trace_out(uint flag, struct channel *c, rte *e, char *msg)
 692 {
 693   if ((c->debug & flag) || (c->proto->debug & flag))
 694     rte_trace(c, e, '<', msg);
 695 }
 696
 697 static rte *
 698 export_filter_(struct channel *c, rte *rt0, rte **rt_free, linpool *pool, int silent)
 699 {
 700   struct proto *p = c->proto;
 701   const struct filter *filter = c->out_filter;
 702   struct proto_stats *stats = &c->stats;
 703   rte *rt;
 704   int v;
 705
 706   rt = rt0;
 707   *rt_free = NULL;
 708
 709   v = p->preexport ? p->preexport(c, rt) : 0;
 710   if (v < 0)
 711     {
 712       if (silent)
 713         goto reject;
 714
 715       stats->exp_updates_rejected++;
 716       if (v == RIC_REJECT)
 717         rte_trace_out(D_FILTERS, c, rt, "rejected by protocol");
 718       goto reject;
 719     }
 720   if (v > 0)
 721     {
 722       if (!silent)
 723         rte_trace_out(D_FILTERS, c, rt, "forced accept by protocol");
 724       goto accept;
 725     }
 726
 727   v = filter && ((filter == FILTER_REJECT) ||
 728                  (f_run(filter, &rt, pool,
 729                         (silent ? FF_SILENT : 0)) > F_ACCEPT));
 730   if (v)
 731     {
 732       if (silent)
 733         goto reject;
 734
 735       stats->exp_updates_filtered++;
 736       rte_trace_out(D_FILTERS, c, rt, "filtered out");
 737       goto reject;
 738     }
 739
 740  accept:
 741   if (rt != rt0)
 742     *rt_free = rt;
 743   return rt;
 744
 745  reject:
 746   /* Discard temporary rte */
 747   if (rt != rt0)
 748     rte_free(rt);
 749   return NULL;
 750 }
 751
 752 static inline rte *
 753 export_filter(struct channel *c, rte *rt0, rte **rt_free, int silent)
 754 {
 755   return export_filter_(c, rt0, rt_free, rte_update_pool, silent);
 756 }
 757
 758 static void
 759 do_rt_notify(struct channel *c, net *net, rte *new, rte *old, int refeed)
 760 {
 761   struct proto *p = c->proto;
 762   struct proto_stats *stats = &c->stats;
 763
 764   if (refeed && new)
 765     c->refeed_count++;
 766
 767   /* Apply export limit */
 768   struct channel_limit *l = &c->out_limit;
 769   if (l->action && !old && new)
 770   {
 771     if (stats->exp_routes >= l->limit)
 772       channel_notify_limit(c, l, PLD_OUT, stats->exp_routes);
 773
 774     if (l->state == PLS_BLOCKED)
 775     {
 776       stats->exp_updates_rejected++;
 777       rte_trace_out(D_FILTERS, c, new, "rejected [limit]");
 778       return;
 779     }
 780   }
 781
 782   /* Apply export table */
 783   if (c->out_table && !rte_update_out(c, net->n.addr, new, old, refeed))
 784     return;
 785
 786   if (new)
 787     stats->exp_updates_accepted++;
 788   else
 789     stats->exp_withdraws_accepted++;
 790
 791   if (old)
 792   {
 793     bmap_clear(&c->export_map, old->id);
 794     stats->exp_routes--;
 795   }
 796
 797   if (new)
 798   {
 799     bmap_set(&c->export_map, new->id);
 800     stats->exp_routes++;
 801   }
 802
 803   if (p->debug & D_ROUTES)
 804   {
 805     if (new && old)
 806       rte_trace_out(D_ROUTES, c, new, "replaced");
 807     else if (new)
 808       rte_trace_out(D_ROUTES, c, new, "added");
 809     else if (old)
 810       rte_trace_out(D_ROUTES, c, old, "removed");
 811   }
 812
 813   p->rt_notify(p, c, net, new, old);
 814 }
 815
 816 static void
 817 rt_notify_basic(struct channel *c, net *net, rte *new, rte *old, int refeed)
 818 {
 819   // struct proto *p = c->proto;
 820   rte *new_free = NULL;
 821
 822   if (new)
 823     c->stats.exp_updates_received++;
 824   else
 825     c->stats.exp_withdraws_received++;
 826
 827   if (new)
 828     new = export_filter(c, new, &new_free, 0);
 829
 830   if (old && !bmap_test(&c->export_map, old->id))
 831     old = NULL;
 832
 833   if (!new && !old)
 834     return;
 835
 836   do_rt_notify(c, net, new, old, refeed);
 837
 838   /* Discard temporary rte */
 839   if (new_free)
 840     rte_free(new_free);
 841 }
 842
 843 static void
 844 rt_notify_accepted(struct channel *c, net *net, rte *new_changed, rte *old_changed, int refeed)
 845 {
 846   // struct proto *p = c->proto;
 847   rte *new_best = NULL;
 848   rte *old_best = NULL;
 849   rte *new_free = NULL;
 850   int new_first = 0;
 851
 852   /*
 853    * We assume that there are no changes in net route order except (added)
 854    * new_changed and (removed) old_changed. Therefore, the function is not
 855    * compatible with deterministic_med (where nontrivial reordering can happen
 856    * as a result of a route change) and with recomputation of recursive routes
 857    * due to next hop update (where many routes can be changed in one step).
 858    *
 859    * Note that we need this assumption just for optimizations, we could just
 860    * run full new_best recomputation otherwise.
 861    *
 862    * There are three cases:
 863    * feed or old_best is old_changed -> we need to recompute new_best
 864    * old_best is before new_changed -> new_best is old_best, ignore
 865    * old_best is after new_changed -> try new_changed, otherwise old_best
 866    */
 867
 868   if (net->routes)
 869     c->stats.exp_updates_received++;
 870   else
 871     c->stats.exp_withdraws_received++;
 872
 873   /* Find old_best - either old_changed, or route for net->routes */
 874   if (old_changed && bmap_test(&c->export_map, old_changed->id))
 875     old_best = old_changed;
 876   else
 877   {
 878     for (rte *r = net->routes; rte_is_valid(r); r = r->next)
 879     {
 880       if (bmap_test(&c->export_map, r->id))
 881       {
 882         old_best = r;
 883         break;
 884       }
 885
 886       /* Note if new_changed found before old_best */
 887       if (r == new_changed)
 888         new_first = 1;
 889     }
 890   }
 891
 892   /* Find new_best */
 893   if ((new_changed == old_changed) || (old_best == old_changed))
 894   {
 895     /* Feed or old_best changed -> find first accepted by filters */
 896     for (rte *r = net->routes; rte_is_valid(r); r = r->next)
 897       if (new_best = export_filter(c, r, &new_free, 0))
 898         break;
 899   }
 900   else
 901   {
 902     /* Other cases -> either new_changed, or old_best (and nothing changed) */
 903     if (new_first && (new_changed = export_filter(c, new_changed, &new_free, 0)))
 904       new_best = new_changed;
 905     else
 906       return;
 907   }
 908
 909   if (!new_best && !old_best)
 910     return;
 911
 912   do_rt_notify(c, net, new_best, old_best, refeed);
 913
 914   /* Discard temporary rte */
 915   if (new_free)
 916     rte_free(new_free);
 917 }
 918
 919
 920 static struct nexthop *
 921 nexthop_merge_rta(struct nexthop *nhs, rta *a, linpool *pool, int max)
 922 {
 923   return nexthop_merge(nhs, &(a->nh), 1, 0, max, pool);
 924 }
 925
 926 rte *
 927 rt_export_merged(struct channel *c, net *net, rte **rt_free, linpool *pool, int silent)
 928 {
 929   // struct proto *p = c->proto;
 930   struct nexthop *nhs = NULL;
 931   rte *best0, *best, *rt0, *rt, *tmp;
 932
 933   best0 = net->routes;
 934   *rt_free = NULL;
 935
 936   if (!rte_is_valid(best0))
 937     return NULL;
 938
 939   best = export_filter_(c, best0, rt_free, pool, silent);
 940
 941   if (!best || !rte_is_reachable(best))
 942     return best;
 943
 944   for (rt0 = best0->next; rt0; rt0 = rt0->next)
 945   {
 946     if (!rte_mergable(best0, rt0))
 947       continue;
 948
 949     rt = export_filter_(c, rt0, &tmp, pool, 1);
 950
 951     if (!rt)
 952       continue;
 953
 954     if (rte_is_reachable(rt))
 955       nhs = nexthop_merge_rta(nhs, rt->attrs, pool, c->merge_limit);
 956
 957     if (tmp)
 958       rte_free(tmp);
 959   }
 960
 961   if (nhs)
 962   {
 963     nhs = nexthop_merge_rta(nhs, best->attrs, pool, c->merge_limit);
 964
 965     if (nhs->next)
 966     {
 967       best = rte_cow_rta(best, pool);
 968       nexthop_link(best->attrs, nhs);
 969     }
 970   }
 971
 972   if (best != best0)
 973     *rt_free = best;
 974
 975   return best;
 976 }
 977
 978
 979 static void
 980 rt_notify_merged(struct channel *c, net *net, rte *new_changed, rte *old_changed,
 981                  rte *new_best, rte *old_best, int refeed)
 982 {
 983   // struct proto *p = c->proto;
 984   rte *new_free = NULL;
 985
 986   /* We assume that all rte arguments are either NULL or rte_is_valid() */
 987
 988   /* This check should be done by the caller */
 989   if (!new_best && !old_best)
 990     return;
 991
 992   /* Check whether the change is relevant to the merged route */
 993   if ((new_best == old_best) &&
 994       (new_changed != old_changed) &&
 995       !rte_mergable(new_best, new_changed) &&
 996       !rte_mergable(old_best, old_changed))
 997     return;
 998
 999   if (new_best)
1000     c->stats.exp_updates_received++;
1001   else
1002     c->stats.exp_withdraws_received++;
1003
1004   /* Prepare new merged route */
1005   if (new_best)
1006     new_best = rt_export_merged(c, net, &new_free, rte_update_pool, 0);
1007
1008   /* Check old merged route */
1009   if (old_best && !bmap_test(&c->export_map, old_best->id))
1010     old_best = NULL;
1011
1012   if (!new_best && !old_best)
1013     return;
1014
1015   do_rt_notify(c, net, new_best, old_best, refeed);
1016
1017   /* Discard temporary rte */
1018   if (new_free)
1019     rte_free(new_free);
1020 }
1021
1022
1023 /**
1024  * rte_announce - announce a routing table change
1025  * @tab: table the route has been added to
1026  * @type: type of route announcement (RA_UNDEF or RA_ANY)
1027  * @net: network in question
1028  * @new: the new or changed route
1029  * @old: the previous route replaced by the new one
1030  * @new_best: the new best route for the same network
1031  * @old_best: the previous best route for the same network
1032  *
1033  * This function gets a routing table update and announces it to all protocols
1034  * that are connected to the same table by their channels.
1035  *
1036  * There are two ways of how routing table changes are announced. First, there
1037  * is a change of just one route in @net (which may caused a change of the best
1038  * route of the network). In this case @new and @old describes the changed route
1039  * and @new_best and @old_best describes best routes. Other routes are not
1040  * affected, but in sorted table the order of other routes might change.
1041  *
1042  * Second, There is a bulk change of multiple routes in @net, with shared best
1043  * route selection. In such case separate route changes are described using
1044  * @type of %RA_ANY, with @new and @old specifying the changed route, while
1045  * @new_best and @old_best are NULL. After that, another notification is done
1046  * where @new_best and @old_best are filled (may be the same), but @new and @old
1047  * are NULL.
1048  *
1049  * The function announces the change to all associated channels. For each
1050  * channel, an appropriate preprocessing is done according to channel &ra_mode.
1051  * For example, %RA_OPTIMAL channels receive just changes of best routes.
1052  *
1053  * In general, we first call preexport() hook of a protocol, which performs
1054  * basic checks on the route (each protocol has a right to veto or force accept
1055  * of the route before any filter is asked). Then we consult an export filter
1056  * of the channel and verify the old route in an export map of the channel.
1057  * Finally, the rt_notify() hook of the protocol gets called.
1058  *
1059  * Note that there are also calls of rt_notify() hooks due to feed, but that is
1060  * done outside of scope of rte_announce().
1061  */
1062 static void
1063 rte_announce(rtable *tab, uint type, net *net, rte *new, rte *old,
1064              rte *new_best, rte *old_best)
1065 {
1066   if (!rte_is_valid(new))
1067     new = NULL;
1068
1069   if (!rte_is_valid(old))
1070     old = NULL;
1071
1072   if (!rte_is_valid(new_best))
1073     new_best = NULL;
1074
1075   if (!rte_is_valid(old_best))
1076     old_best = NULL;
1077
1078   if (!new && !old && !new_best && !old_best)
1079     return;
1080
1081   if (new_best != old_best)
1082   {
1083     if (new_best)
1084       new_best->sender->stats.pref_routes++;
1085     if (old_best)
1086       old_best->sender->stats.pref_routes--;
1087
1088     if (tab->hostcache)
1089       rt_notify_hostcache(tab, net);
1090
1091     if (!EMPTY_LIST(tab->flowspec_links))
1092       rt_flowspec_notify(tab, net);
1093   }
1094
1095   rt_schedule_notify(tab);
1096
1097   struct channel *c; node *n;
1098   WALK_LIST2(c, n, tab->channels, table_node)
1099   {
1100     if (c->export_state == ES_DOWN)
1101       continue;
1102
1103     if (type && (type != c->ra_mode))
1104       continue;
1105
1106     switch (c->ra_mode)
1107     {
1108     case RA_OPTIMAL:
1109       if (new_best != old_best)
1110         rt_notify_basic(c, net, new_best, old_best, 0);
1111       break;
1112
1113     case RA_ANY:
1114       if (new != old)
1115         rt_notify_basic(c, net, new, old, 0);
1116       break;
1117
1118     case RA_ACCEPTED:
1119       /*
1120        * The (new != old) condition is problematic here, as it would break
1121        * the second usage pattern (announcement after bulk change, used in
1122        * rt_next_hop_update_net(), which sends both new and old as NULL).
1123        *
1124        * But recursive next hops do not work with sorted tables anyways,
1125        * such configuration is forbidden in BGP and not supported in
1126        * rt_notify_accepted().
1127        *
1128        * The condition is needed to eliminate spurious announcements where
1129        * both old and new routes are not valid (so they are NULL).
1130        */
1131       if (new != old)
1132         rt_notify_accepted(c, net, new, old, 0);
1133       break;
1134
1135     case RA_MERGED:
1136       rt_notify_merged(c, net, new, old, new_best, old_best, 0);
1137       break;
1138     }
1139   }
1140 }
1141
1142 static inline int
1143 rte_validate(rte *e)
1144 {
1145   int c;
1146   net *n = e->net;
1147
1148   if (!net_validate(n->n.addr))
1149   {
1150     log(L_WARN "Ignoring bogus prefix %N received via %s",
1151         n->n.addr, e->sender->proto->name);
1152     return 0;
1153   }
1154
1155   /* FIXME: better handling different nettypes */
1156   c = !net_is_flow(n->n.addr) ?
1157     net_classify(n->n.addr): (IADDR_HOST | SCOPE_UNIVERSE);
1158   if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
1159   {
1160     log(L_WARN "Ignoring bogus route %N received via %s",
1161         n->n.addr, e->sender->proto->name);
1162     return 0;
1163   }
1164
1165   if (net_type_match(n->n.addr, NB_DEST) == !e->attrs->dest)
1166   {
1167     /* Exception for flowspec that failed validation */
1168     if (net_is_flow(n->n.addr) && (e->attrs->dest == RTD_UNREACHABLE))
1169       return 1;
1170
1171     log(L_WARN "Ignoring route %N with invalid dest %d received via %s",
1172         n->n.addr, e->attrs->dest, e->sender->proto->name);
1173     return 0;
1174   }
1175
1176   if ((e->attrs->dest == RTD_UNICAST) && !nexthop_is_sorted(&(e->attrs->nh)))
1177   {
1178     log(L_WARN "Ignoring unsorted multipath route %N received via %s",
1179         n->n.addr, e->sender->proto->name);
1180     return 0;
1181   }
1182
1183   return 1;
1184 }
1185
1186 /**
1187  * rte_free - delete a &rte
1188  * @e: &rte to be deleted
1189  *
1190  * rte_free() deletes the given &rte from the routing table it's linked to.
1191  */
1192 void
1193 rte_free(rte *e)
1194 {
1195   rt_unlock_source(e->src);
1196   if (rta_is_cached(e->attrs))
1197     rta_free(e->attrs);
1198   sl_free(e);
1199 }
1200
1201 static inline void
1202 rte_free_quick(rte *e)
1203 {
1204   rt_unlock_source(e->src);
1205   rta_free(e->attrs);
1206   sl_free(e);
1207 }
1208
1209 static int
1210 rte_same(rte *x, rte *y)
1211 {
1212   /* rte.flags / rte.pflags are not checked, as they are internal to rtable */
1213   return
1214     x->attrs == y->attrs &&
1215     x->src == y->src &&
1216     rte_is_filtered(x) == rte_is_filtered(y);
1217 }
1218
1219 static inline int rte_is_ok(rte *e) { return e && !rte_is_filtered(e); }
1220
1221 static void
1222 rte_recalculate(struct channel *c, net *net, rte *new, struct rte_src *src)
1223 {
1224   struct proto *p = c->proto;
1225   struct rtable *table = c->table;
1226   struct proto_stats *stats = &c->stats;
1227   static struct tbf rl_pipe = TBF_DEFAULT_LOG_LIMITS;
1228   rte *before_old = NULL;
1229   rte *old_best = net->routes;
1230   rte *old = NULL;
1231   rte **k;
1232
1233   k = &net->routes;                     /* Find and remove original route from the same protocol */
1234   while (old = *k)
1235     {
1236       if (old->src == src)
1237         {
1238           /* If there is the same route in the routing table but from
1239            * a different sender, then there are two paths from the
1240            * source protocol to this routing table through transparent
1241            * pipes, which is not allowed.
1242            *
1243            * We log that and ignore the route. If it is withdraw, we
1244            * ignore it completely (there might be 'spurious withdraws',
1245            * see FIXME in do_rte_announce())
1246            */
1247           if (old->sender->proto != p)
1248             {
1249               if (new)
1250                 {
1251                   log_rl(&rl_pipe, L_ERR "Pipe collision detected when sending %N to table %s",
1252                       net->n.addr, table->name);
1253                   rte_free_quick(new);
1254                 }
1255               return;
1256             }
1257
1258           if (new && rte_same(old, new))
1259             {
1260               /* No changes, ignore the new route and refresh the old one */
1261
1262               old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
1263
1264               if (!rte_is_filtered(new))
1265                 {
1266                   stats->imp_updates_ignored++;
1267                   rte_trace_in(D_ROUTES, c, new, "ignored");
1268                 }
1269
1270               rte_free_quick(new);
1271               return;
1272             }
1273           *k = old->next;
1274           table->rt_count--;
1275           break;
1276         }
1277       k = &old->next;
1278       before_old = old;
1279     }
1280
1281   /* Save the last accessed position */
1282   rte **pos = k;
1283
1284   if (!old)
1285     before_old = NULL;
1286
1287   if (!old && !new)
1288     {
1289       stats->imp_withdraws_ignored++;
1290       return;
1291     }
1292
1293   int new_ok = rte_is_ok(new);
1294   int old_ok = rte_is_ok(old);
1295
1296   struct channel_limit *l = &c->rx_limit;
1297   if (l->action && !old && new && !c->in_table)
1298     {
1299       u32 all_routes = stats->imp_routes + stats->filt_routes;
1300
1301       if (all_routes >= l->limit)
1302         channel_notify_limit(c, l, PLD_RX, all_routes);
1303
1304       if (l->state == PLS_BLOCKED)
1305         {
1306           /* In receive limit the situation is simple, old is NULL so
1307              we just free new and exit like nothing happened */
1308
1309           stats->imp_updates_ignored++;
1310           rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
1311           rte_free_quick(new);
1312           return;
1313         }
1314     }
1315
1316   l = &c->in_limit;
1317   if (l->action && !old_ok && new_ok)
1318     {
1319       if (stats->imp_routes >= l->limit)
1320         channel_notify_limit(c, l, PLD_IN, stats->imp_routes);
1321
1322       if (l->state == PLS_BLOCKED)
1323         {
1324           /* In import limit the situation is more complicated. We
1325              shouldn't just drop the route, we should handle it like
1326              it was filtered. We also have to continue the route
1327              processing if old or new is non-NULL, but we should exit
1328              if both are NULL as this case is probably assumed to be
1329              already handled. */
1330
1331           stats->imp_updates_ignored++;
1332           rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
1333
1334           if (c->in_keep_filtered)
1335             new->flags |= REF_FILTERED;
1336           else
1337             { rte_free_quick(new); new = NULL; }
1338
1339           /* Note that old && !new could be possible when
1340              c->in_keep_filtered changed in the recent past. */
1341
1342           if (!old && !new)
1343             return;
1344
1345           new_ok = 0;
1346           goto skip_stats1;
1347         }
1348     }
1349
1350   if (new_ok)
1351     stats->imp_updates_accepted++;
1352   else if (old_ok)
1353     stats->imp_withdraws_accepted++;
1354   else
1355     stats->imp_withdraws_ignored++;
1356
1357   if (old_ok || new_ok)
1358     table->last_rt_change = current_time();
1359
1360  skip_stats1:
1361
1362   if (new)
1363     rte_is_filtered(new) ? stats->filt_routes++ : stats->imp_routes++;
1364   if (old)
1365     rte_is_filtered(old) ? stats->filt_routes-- : stats->imp_routes--;
1366
1367   if (table->config->sorted)
1368     {
1369       /* If routes are sorted, just insert new route to appropriate position */
1370       if (new)
1371         {
1372           if (before_old && !rte_better(new, before_old))
1373             k = &before_old->next;
1374           else
1375             k = &net->routes;
1376
1377           for (; *k; k=&(*k)->next)
1378             if (rte_better(new, *k))
1379               break;
1380
1381           new->next = *k;
1382           *k = new;
1383
1384           table->rt_count++;
1385         }
1386     }
1387   else
1388     {
1389       /* If routes are not sorted, find the best route and move it on
1390          the first position. There are several optimized cases. */
1391
1392       if (src->proto->rte_recalculate && src->proto->rte_recalculate(table, net, new, old, old_best))
1393         goto do_recalculate;
1394
1395       if (new && rte_better(new, old_best))
1396         {
1397           /* The first case - the new route is cleary optimal,
1398              we link it at the first position */
1399
1400           new->next = net->routes;
1401           net->routes = new;
1402
1403           table->rt_count++;
1404         }
1405       else if (old == old_best)
1406         {
1407           /* The second case - the old best route disappeared, we add the
1408              new route (if we have any) to the list (we don't care about
1409              position) and then we elect the new optimal route and relink
1410              that route at the first position and announce it. New optimal
1411              route might be NULL if there is no more routes */
1412
1413         do_recalculate:
1414           /* Add the new route to the list */
1415           if (new)
1416             {
1417               new->next = *pos;
1418               *pos = new;
1419
1420               table->rt_count++;
1421             }
1422
1423           /* Find a new optimal route (if there is any) */
1424           if (net->routes)
1425             {
1426               rte **bp = &net->routes;
1427               for (k=&(*bp)->next; *k; k=&(*k)->next)
1428                 if (rte_better(*k, *bp))
1429                   bp = k;
1430
1431               /* And relink it */
1432               rte *best = *bp;
1433               *bp = best->next;
1434               best->next = net->routes;
1435               net->routes = best;
1436             }
1437         }
1438       else if (new)
1439         {
1440           /* The third case - the new route is not better than the old
1441              best route (therefore old_best != NULL) and the old best
1442              route was not removed (therefore old_best == net->routes).
1443              We just link the new route to the old/last position. */
1444
1445           new->next = *pos;
1446           *pos = new;
1447
1448           table->rt_count++;
1449         }
1450       /* The fourth (empty) case - suboptimal route was removed, nothing to do */
1451     }
1452
1453   if (new)
1454     {
1455       new->lastmod = current_time();
1456
1457       if (!old)
1458         {
1459           new->id = hmap_first_zero(&table->id_map);
1460           hmap_set(&table->id_map, new->id);
1461         }
1462       else
1463         new->id = old->id;
1464     }
1465
1466   /* Log the route change */
1467   if ((c->debug & D_ROUTES) || (p->debug & D_ROUTES))
1468     {
1469       if (new_ok)
1470         rte_trace(c, new, '>', new == net->routes ? "added [best]" : "added");
1471       else if (old_ok)
1472         {
1473           if (old != old_best)
1474             rte_trace(c, old, '>', "removed");
1475           else if (rte_is_ok(net->routes))
1476             rte_trace(c, old, '>', "removed [replaced]");
1477           else
1478             rte_trace(c, old, '>', "removed [sole]");
1479         }
1480     }
1481
1482   /* Propagate the route change */
1483   rte_announce(table, RA_UNDEF, net, new, old, net->routes, old_best);
1484
1485   if (!net->routes &&
1486       (table->gc_counter++ >= table->config->gc_threshold))
1487     rt_kick_prune_timer(table);
1488
1489   if (old_ok && p->rte_remove)
1490     p->rte_remove(net, old);
1491   if (new_ok && p->rte_insert)
1492     p->rte_insert(net, new);
1493
1494   if (old)
1495     {
1496       if (!new)
1497         hmap_clear(&table->id_map, old->id);
1498
1499       rte_free_quick(old);
1500     }
1501 }
1502
1503 static int rte_update_nest_cnt;         /* Nesting counter to allow recursive updates */
1504
1505 static inline void
1506 rte_update_lock(void)
1507 {
1508   rte_update_nest_cnt++;
1509 }
1510
1511 static inline void
1512 rte_update_unlock(void)
1513 {
1514   if (!--rte_update_nest_cnt)
1515     lp_flush(rte_update_pool);
1516 }
1517
1518 /**
1519  * rte_update - enter a new update to a routing table
1520  * @table: table to be updated
1521  * @c: channel doing the update
1522  * @net: network node
1523  * @p: protocol submitting the update
1524  * @src: protocol originating the update
1525  * @new: a &rte representing the new route or %NULL for route removal.
1526  *
1527  * This function is called by the routing protocols whenever they discover
1528  * a new route or wish to update/remove an existing route. The right announcement
1529  * sequence is to build route attributes first (either un-cached with @aflags set
1530  * to zero or a cached one using rta_lookup(); in this case please note that
1531  * you need to increase the use count of the attributes yourself by calling
1532  * rta_clone()), call rte_get_temp() to obtain a temporary &rte, fill in all
1533  * the appropriate data and finally submit the new &rte by calling rte_update().
1534  *
1535  * @src specifies the protocol that originally created the route and the meaning
1536  * of protocol-dependent data of @new. If @new is not %NULL, @src have to be the
1537  * same value as @new->attrs->proto. @p specifies the protocol that called
1538  * rte_update(). In most cases it is the same protocol as @src. rte_update()
1539  * stores @p in @new->sender;
1540  *
1541  * When rte_update() gets any route, it automatically validates it (checks,
1542  * whether the network and next hop address are valid IP addresses and also
1543  * whether a normal routing protocol doesn't try to smuggle a host or link
1544  * scope route to the table), converts all protocol dependent attributes stored
1545  * in the &rte to temporary extended attributes, consults import filters of the
1546  * protocol to see if the route should be accepted and/or its attributes modified,
1547  * stores the temporary attributes back to the &rte.
1548  *
1549  * Now, having a "public" version of the route, we
1550  * automatically find any old route defined by the protocol @src
1551  * for network @n, replace it by the new one (or removing it if @new is %NULL),
1552  * recalculate the optimal route for this destination and finally broadcast
1553  * the change (if any) to all routing protocols by calling rte_announce().
1554  *
1555  * All memory used for attribute lists and other temporary allocations is taken
1556  * from a special linear pool @rte_update_pool and freed when rte_update()
1557  * finishes.
1558  */
1559
1560 void
1561 rte_update2(struct channel *c, const net_addr *n, rte *new, struct rte_src *src)
1562 {
1563   // struct proto *p = c->proto;
1564   struct proto_stats *stats = &c->stats;
1565   const struct filter *filter = c->in_filter;
1566   net *nn;
1567
1568   ASSERT(c->channel_state == CS_UP);
1569
1570   rte_update_lock();
1571   if (new)
1572     {
1573       /* Create a temporary table node */
1574       nn = alloca(sizeof(net) + n->length);
1575       memset(nn, 0, sizeof(net) + n->length);
1576       net_copy(nn->n.addr, n);
1577
1578       new->net = nn;
1579       new->sender = c;
1580
1581       stats->imp_updates_received++;
1582       if (!rte_validate(new))
1583         {
1584           rte_trace_in(D_FILTERS, c, new, "invalid");
1585           stats->imp_updates_invalid++;
1586           goto drop;
1587         }
1588
1589       if (filter == FILTER_REJECT)
1590         {
1591           stats->imp_updates_filtered++;
1592           rte_trace_in(D_FILTERS, c, new, "filtered out");
1593
1594           if (! c->in_keep_filtered)
1595             goto drop;
1596
1597           /* new is a private copy, i could modify it */
1598           new->flags |= REF_FILTERED;
1599         }
1600       else if (filter)
1601         {
1602           int fr = f_run(filter, &new, rte_update_pool, 0);
1603           if (fr > F_ACCEPT)
1604           {
1605             stats->imp_updates_filtered++;
1606             rte_trace_in(D_FILTERS, c, new, "filtered out");
1607
1608             if (! c->in_keep_filtered)
1609               goto drop;
1610
1611             new->flags |= REF_FILTERED;
1612           }
1613         }
1614       if (!rta_is_cached(new->attrs)) /* Need to copy attributes */
1615         new->attrs = rta_lookup(new->attrs);
1616       new->flags |= REF_COW;
1617
1618       /* Use the actual struct network, not the dummy one */
1619       nn = net_get(c->table, n);
1620       new->net = nn;
1621     }
1622   else
1623     {
1624       stats->imp_withdraws_received++;
1625
1626       if (!(nn = net_find(c->table, n)) || !src)
1627         {
1628           stats->imp_withdraws_ignored++;
1629           rte_update_unlock();
1630           return;
1631         }
1632     }
1633
1634  recalc:
1635   /* And recalculate the best route */
1636   rte_recalculate(c, nn, new, src);
1637
1638   rte_update_unlock();
1639   return;
1640
1641  drop:
1642   rte_free(new);
1643   new = NULL;
1644   if (nn = net_find(c->table, n))
1645     goto recalc;
1646
1647   rte_update_unlock();
1648 }
1649
1650 /* Independent call to rte_announce(), used from next hop
1651    recalculation, outside of rte_update(). new must be non-NULL */
1652 static inline void
1653 rte_announce_i(rtable *tab, uint type, net *net, rte *new, rte *old,
1654                rte *new_best, rte *old_best)
1655 {
1656   rte_update_lock();
1657   rte_announce(tab, type, net, new, old, new_best, old_best);
1658   rte_update_unlock();
1659 }
1660
1661 static inline void
1662 rte_discard(rte *old)   /* Non-filtered route deletion, used during garbage collection */
1663 {
1664   rte_update_lock();
1665   rte_recalculate(old->sender, old->net, NULL, old->src);
1666   rte_update_unlock();
1667 }
1668
1669 /* Modify existing route by protocol hook, used for long-lived graceful restart */
1670 static inline void
1671 rte_modify(rte *old)
1672 {
1673   rte_update_lock();
1674
1675   rte *new = old->sender->proto->rte_modify(old, rte_update_pool);
1676   if (new != old)
1677   {
1678     if (new)
1679     {
1680       if (!rta_is_cached(new->attrs))
1681         new->attrs = rta_lookup(new->attrs);
1682       new->flags = (old->flags & ~REF_MODIFY) | REF_COW;
1683     }
1684
1685     rte_recalculate(old->sender, old->net, new, old->src);
1686   }
1687
1688   rte_update_unlock();
1689 }
1690
1691 /* Check rtable for best route to given net whether it would be exported do p */
1692 int
1693 rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter)
1694 {
1695   struct proto *p = c->proto;
1696   net *n = net_find(t, a);
1697   rte *rt = n ? n->routes : NULL;
1698
1699   if (!rte_is_valid(rt))
1700     return 0;
1701
1702   rte_update_lock();
1703
1704   /* Rest is stripped down export_filter() */
1705   int v = p->preexport ? p->preexport(c, rt) : 0;
1706   if (v == RIC_PROCESS)
1707     v = (f_run(filter, &rt, rte_update_pool, FF_SILENT) <= F_ACCEPT);
1708
1709   /* Discard temporary rte */
1710   if (rt != n->routes)
1711     rte_free(rt);
1712
1713   rte_update_unlock();
1714
1715   return v > 0;
1716 }
1717
1718
1719 /**
1720  * rt_refresh_begin - start a refresh cycle
1721  * @t: related routing table
1722  * @c related channel
1723  *
1724  * This function starts a refresh cycle for given routing table and announce
1725  * hook. The refresh cycle is a sequence where the protocol sends all its valid
1726  * routes to the routing table (by rte_update()). After that, all protocol
1727  * routes (more precisely routes with @c as @sender) not sent during the
1728  * refresh cycle but still in the table from the past are pruned. This is
1729  * implemented by marking all related routes as stale by REF_STALE flag in
1730  * rt_refresh_begin(), then marking all related stale routes with REF_DISCARD
1731  * flag in rt_refresh_end() and then removing such routes in the prune loop.
1732  */
1733 void
1734 rt_refresh_begin(rtable *t, struct channel *c)
1735 {
1736   FIB_WALK(&t->fib, net, n)
1737     {
1738       rte *e;
1739       for (e = n->routes; e; e = e->next)
1740         if (e->sender == c)
1741           e->flags |= REF_STALE;
1742     }
1743   FIB_WALK_END;
1744 }
1745
1746 /**
1747  * rt_refresh_end - end a refresh cycle
1748  * @t: related routing table
1749  * @c: related channel
1750  *
1751  * This function ends a refresh cycle for given routing table and announce
1752  * hook. See rt_refresh_begin() for description of refresh cycles.
1753  */
1754 void
1755 rt_refresh_end(rtable *t, struct channel *c)
1756 {
1757   int prune = 0;
1758
1759   FIB_WALK(&t->fib, net, n)
1760     {
1761       rte *e;
1762       for (e = n->routes; e; e = e->next)
1763         if ((e->sender == c) && (e->flags & REF_STALE))
1764           {
1765             e->flags |= REF_DISCARD;
1766             prune = 1;
1767           }
1768     }
1769   FIB_WALK_END;
1770
1771   if (prune)
1772     rt_schedule_prune(t);
1773 }
1774
1775 void
1776 rt_modify_stale(rtable *t, struct channel *c)
1777 {
1778   int prune = 0;
1779
1780   FIB_WALK(&t->fib, net, n)
1781     {
1782       rte *e;
1783       for (e = n->routes; e; e = e->next)
1784         if ((e->sender == c) && (e->flags & REF_STALE) && !(e->flags & REF_FILTERED))
1785           {
1786             e->flags |= REF_MODIFY;
1787             prune = 1;
1788           }
1789     }
1790   FIB_WALK_END;
1791
1792   if (prune)
1793     rt_schedule_prune(t);
1794 }
1795
1796 /**
1797  * rte_dump - dump a route
1798  * @e: &rte to be dumped
1799  *
1800  * This functions dumps contents of a &rte to debug output.
1801  */
1802 void
1803 rte_dump(rte *e)
1804 {
1805   net *n = e->net;
1806   debug("%-1N ", n->n.addr);
1807   debug("PF=%02x ", e->pflags);
1808   rta_dump(e->attrs);
1809   debug("\n");
1810 }
1811
1812 /**
1813  * rt_dump - dump a routing table
1814  * @t: routing table to be dumped
1815  *
1816  * This function dumps contents of a given routing table to debug output.
1817  */
1818 void
1819 rt_dump(rtable *t)
1820 {
1821   debug("Dump of routing table <%s>\n", t->name);
1822 #ifdef DEBUGGING
1823   fib_check(&t->fib);
1824 #endif
1825   FIB_WALK(&t->fib, net, n)
1826     {
1827       rte *e;
1828       for(e=n->routes; e; e=e->next)
1829         rte_dump(e);
1830     }
1831   FIB_WALK_END;
1832   debug("\n");
1833 }
1834
1835 /**
1836  * rt_dump_all - dump all routing tables
1837  *
1838  * This function dumps contents of all routing tables to debug output.
1839  */
1840 void
1841 rt_dump_all(void)
1842 {
1843   rtable *t;
1844   node *n;
1845
1846   WALK_LIST2(t, n, routing_tables, n)
1847     rt_dump(t);
1848 }
1849
1850 static inline void
1851 rt_schedule_hcu(rtable *tab)
1852 {
1853   if (tab->hcu_scheduled)
1854     return;
1855
1856   tab->hcu_scheduled = 1;
1857   ev_schedule(tab->rt_event);
1858 }
1859
1860 static inline void
1861 rt_schedule_nhu(rtable *tab)
1862 {
1863   if (tab->nhu_state == NHU_CLEAN)
1864     ev_schedule(tab->rt_event);
1865
1866   /* state change:
1867    *   NHU_CLEAN   -> NHU_SCHEDULED
1868    *   NHU_RUNNING -> NHU_DIRTY
1869    */
1870   tab->nhu_state |= NHU_SCHEDULED;
1871 }
1872
1873 void
1874 rt_schedule_prune(rtable *tab)
1875 {
1876   if (tab->prune_state == 0)
1877     ev_schedule(tab->rt_event);
1878
1879   /* state change 0->1, 2->3 */
1880   tab->prune_state |= 1;
1881 }
1882
1883
1884 static void
1885 rt_event(void *ptr)
1886 {
1887   rtable *tab = ptr;
1888
1889   rt_lock_table(tab);
1890
1891   if (tab->hcu_scheduled)
1892     rt_update_hostcache(tab);
1893
1894   if (tab->nhu_state)
1895     rt_next_hop_update(tab);
1896
1897   if (tab->prune_state)
1898     rt_prune_table(tab);
1899
1900   rt_unlock_table(tab);
1901 }
1902
1903
1904 static void
1905 rt_prune_timer(timer *t)
1906 {
1907   rtable *tab = t->data;
1908
1909   if (tab->gc_counter >= tab->config->gc_threshold)
1910     rt_schedule_prune(tab);
1911 }
1912
1913 static void
1914 rt_kick_prune_timer(rtable *tab)
1915 {
1916   /* Return if prune is already scheduled */
1917   if (tm_active(tab->prune_timer) || (tab->prune_state & 1))
1918     return;
1919
1920   /* Randomize GC period to +/- 50% */
1921   btime gc_period = tab->config->gc_period;
1922   gc_period = (gc_period / 2) + (random_u32() % (uint) gc_period);
1923   tm_start(tab->prune_timer, gc_period);
1924 }
1925
1926
1927 static inline btime
1928 rt_settled_time(rtable *tab)
1929 {
1930   ASSUME(tab->base_settle_time != 0);
1931
1932   return MIN(tab->last_rt_change + tab->config->min_settle_time,
1933              tab->base_settle_time + tab->config->max_settle_time);
1934 }
1935
1936 static void
1937 rt_settle_timer(timer *t)
1938 {
1939   rtable *tab = t->data;
1940
1941   if (!tab->base_settle_time)
1942     return;
1943
1944   btime settled_time = rt_settled_time(tab);
1945   if (current_time() < settled_time)
1946   {
1947     tm_set(tab->settle_timer, settled_time);
1948     return;
1949   }
1950
1951   /* Settled */
1952   tab->base_settle_time = 0;
1953
1954   struct rt_subscription *s;
1955   WALK_LIST(s, tab->subscribers)
1956     s->hook(s);
1957 }
1958
1959 static void
1960 rt_kick_settle_timer(rtable *tab)
1961 {
1962   tab->base_settle_time = current_time();
1963
1964   if (!tab->settle_timer)
1965     tab->settle_timer = tm_new_init(tab->rp, rt_settle_timer, tab, 0, 0);
1966
1967   if (!tm_active(tab->settle_timer))
1968     tm_set(tab->settle_timer, rt_settled_time(tab));
1969 }
1970
1971 static inline void
1972 rt_schedule_notify(rtable *tab)
1973 {
1974   if (EMPTY_LIST(tab->subscribers))
1975     return;
1976
1977   if (tab->base_settle_time)
1978     return;
1979
1980   rt_kick_settle_timer(tab);
1981 }
1982
1983 void
1984 rt_subscribe(rtable *tab, struct rt_subscription *s)
1985 {
1986   s->tab = tab;
1987   rt_lock_table(tab);
1988   add_tail(&tab->subscribers, &s->n);
1989 }
1990
1991 void
1992 rt_unsubscribe(struct rt_subscription *s)
1993 {
1994   rem_node(&s->n);
1995   rt_unlock_table(s->tab);
1996 }
1997
1998 static struct rt_flowspec_link *
1999 rt_flowspec_find_link(rtable *src, rtable *dst)
2000 {
2001   struct rt_flowspec_link *ln;
2002   WALK_LIST(ln, src->flowspec_links)
2003     if ((ln->src == src) && (ln->dst == dst))
2004       return ln;
2005
2006   return NULL;
2007 }
2008
2009 void
2010 rt_flowspec_link(rtable *src, rtable *dst)
2011 {
2012   ASSERT(rt_is_ip(src));
2013   ASSERT(rt_is_flow(dst));
2014
2015   struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst);
2016
2017   if (!ln)
2018   {
2019     rt_lock_table(src);
2020     rt_lock_table(dst);
2021
2022     ln = mb_allocz(src->rp, sizeof(struct rt_flowspec_link));
2023     ln->src = src;
2024     ln->dst = dst;
2025     add_tail(&src->flowspec_links, &ln->n);
2026   }
2027
2028   ln->uc++;
2029 }
2030
2031 void
2032 rt_flowspec_unlink(rtable *src, rtable *dst)
2033 {
2034   struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst);
2035
2036   ASSERT(ln && (ln->uc > 0));
2037
2038   ln->uc--;
2039
2040   if (!ln->uc)
2041   {
2042     rem_node(&ln->n);
2043     mb_free(ln);
2044
2045     rt_unlock_table(src);
2046     rt_unlock_table(dst);
2047   }
2048 }
2049
2050 static void
2051 rt_flowspec_notify(rtable *src, net *net)
2052 {
2053   /* Only IP tables are src links */
2054   ASSERT(rt_is_ip(src));
2055
2056   struct rt_flowspec_link *ln;
2057   WALK_LIST(ln, src->flowspec_links)
2058   {
2059     rtable *dst = ln->dst;
2060     ASSERT(rt_is_flow(dst));
2061
2062     /* No need to inspect it further if recalculation is already active */
2063     if ((dst->nhu_state == NHU_SCHEDULED) || (dst->nhu_state == NHU_DIRTY))
2064       continue;
2065
2066     if (trie_match_net(dst->flowspec_trie, net->n.addr))
2067       rt_schedule_nhu(dst);
2068   }
2069 }
2070
2071 static void
2072 rt_flowspec_reset_trie(rtable *tab)
2073 {
2074   linpool *lp = tab->flowspec_trie->lp;
2075   int ipv4 = tab->flowspec_trie->ipv4;
2076
2077   lp_flush(lp);
2078   tab->flowspec_trie = f_new_trie(lp, 0);
2079   tab->flowspec_trie->ipv4 = ipv4;
2080 }
2081
2082 static void
2083 rt_free(resource *_r)
2084 {
2085   rtable *r = (rtable *) _r;
2086
2087   DBG("Deleting routing table %s\n", r->name);
2088   ASSERT_DIE(r->use_count == 0);
2089
2090   if (r->internal)
2091     return;
2092
2093   r->config->table = NULL;
2094   rem_node(&r->n);
2095
2096   if (r->hostcache)
2097     rt_free_hostcache(r);
2098
2099   /* Freed automagically by the resource pool
2100   fib_free(&r->fib);
2101   hmap_free(&r->id_map);
2102   rfree(r->rt_event);
2103   rfree(r->settle_timer);
2104   mb_free(r);
2105   */
2106 }
2107
2108 static void
2109 rt_res_dump(resource *_r)
2110 {
2111   rtable *r = (rtable *) _r;
2112   debug("name \"%s\", addr_type=%s, rt_count=%u, use_count=%d\n",
2113       r->name, net_label[r->addr_type], r->rt_count, r->use_count);
2114 }
2115
2116 static struct resclass rt_class = {
2117   .name = "Routing table",
2118   .size = sizeof(struct rtable),
2119   .free = rt_free,
2120   .dump = rt_res_dump,
2121   .lookup = NULL,
2122   .memsize = NULL,
2123 };
2124
2125 rtable *
2126 rt_setup(pool *pp, struct rtable_config *cf)
2127 {
2128   pool *p = rp_newf(pp, "Routing table %s", cf->name);
2129
2130   rtable *t = ralloc(p, &rt_class);
2131   t->rp = p;
2132
2133   t->name = cf->name;
2134   t->config = cf;
2135   t->addr_type = cf->addr_type;
2136
2137   fib_init(&t->fib, p, t->addr_type, sizeof(net), OFFSETOF(net, n), 0, NULL);
2138
2139   if (cf->trie_used)
2140   {
2141     t->trie = f_new_trie(lp_new_default(p), 0);
2142     t->trie->ipv4 = net_val_match(t->addr_type, NB_IP4 | NB_VPN4 | NB_ROA4);
2143
2144     t->fib.init = net_init_with_trie;
2145   }
2146
2147   init_list(&t->channels);
2148   init_list(&t->flowspec_links);
2149   init_list(&t->subscribers);
2150
2151   if (!(t->internal = cf->internal))
2152   {
2153     hmap_init(&t->id_map, p, 1024);
2154     hmap_set(&t->id_map, 0);
2155
2156     t->rt_event = ev_new_init(p, rt_event, t);
2157     t->prune_timer = tm_new_init(p, rt_prune_timer, t, 0, 0);
2158     t->last_rt_change = t->gc_time = current_time();
2159
2160     if (rt_is_flow(t))
2161     {
2162       t->flowspec_trie = f_new_trie(lp_new_default(p), 0);
2163       t->flowspec_trie->ipv4 = (t->addr_type == NET_FLOW4);
2164     }
2165   }
2166
2167   return t;
2168 }
2169
2170 /**
2171  * rt_init - initialize routing tables
2172  *
2173  * This function is called during BIRD startup. It initializes the
2174  * routing table module.
2175  */
2176 void
2177 rt_init(void)
2178 {
2179   rta_init();
2180   rt_table_pool = rp_new(&root_pool, "Routing tables");
2181   rte_update_pool = lp_new_default(rt_table_pool);
2182   rte_slab = sl_new(rt_table_pool, sizeof(rte));
2183   init_list(&routing_tables);
2184 }
2185
2186
2187 /**
2188  * rt_prune_table - prune a routing table
2189  *
2190  * The prune loop scans routing tables and removes routes belonging to flushing
2191  * protocols, discarded routes and also stale network entries. It is called from
2192  * rt_event(). The event is rescheduled if the current iteration do not finish
2193  * the table. The pruning is directed by the prune state (@prune_state),
2194  * specifying whether the prune cycle is scheduled or running, and there
2195  * is also a persistent pruning iterator (@prune_fit).
2196  *
2197  * The prune loop is used also for channel flushing. For this purpose, the
2198  * channels to flush are marked before the iteration and notified after the
2199  * iteration.
2200  */
2201 static void
2202 rt_prune_table(rtable *tab)
2203 {
2204   struct fib_iterator *fit = &tab->prune_fit;
2205   int limit = 2000;
2206
2207   struct channel *c;
2208   node *n, *x;
2209
2210   DBG("Pruning route table %s\n", tab->name);
2211 #ifdef DEBUGGING
2212   fib_check(&tab->fib);
2213 #endif
2214
2215   if (tab->prune_state == 0)
2216     return;
2217
2218   if (tab->prune_state == 1)
2219   {
2220     /* Mark channels to flush */
2221     WALK_LIST2(c, n, tab->channels, table_node)
2222       if (c->channel_state == CS_FLUSHING)
2223         c->flush_active = 1;
2224
2225     FIB_ITERATE_INIT(fit, &tab->fib);
2226     tab->prune_state = 2;
2227
2228     tab->gc_counter = 0;
2229     tab->gc_time = current_time();
2230
2231     if (tab->prune_trie)
2232     {
2233       /* Init prefix trie pruning */
2234       tab->trie_new = f_new_trie(lp_new_default(tab->rp), 0);
2235       tab->trie_new->ipv4 = tab->trie->ipv4;
2236     }
2237   }
2238
2239 again:
2240   FIB_ITERATE_START(&tab->fib, fit, net, n)
2241     {
2242       rte *e;
2243
2244     rescan:
2245       if (limit <= 0)
2246       {
2247         FIB_ITERATE_PUT(fit);
2248         ev_schedule(tab->rt_event);
2249         return;
2250       }
2251
2252       for (e=n->routes; e; e=e->next)
2253       {
2254         if (e->sender->flush_active || (e->flags & REF_DISCARD))
2255           {
2256             rte_discard(e);
2257             limit--;
2258
2259             goto rescan;
2260           }
2261
2262         if (e->flags & REF_MODIFY)
2263           {
2264             rte_modify(e);
2265             limit--;
2266
2267             goto rescan;
2268           }
2269       }
2270
2271       if (!n->routes)           /* Orphaned FIB entry */
2272         {
2273           FIB_ITERATE_PUT(fit);
2274           fib_delete(&tab->fib, n);
2275           goto again;
2276         }
2277
2278       if (tab->trie_new)
2279       {
2280         trie_add_prefix(tab->trie_new, n->n.addr, n->n.addr->pxlen, n->n.addr->pxlen);
2281         limit--;
2282       }
2283     }
2284   FIB_ITERATE_END;
2285
2286 #ifdef DEBUGGING
2287   fib_check(&tab->fib);
2288 #endif
2289
2290   /* state change 2->0, 3->1 */
2291   tab->prune_state &= 1;
2292
2293   if (tab->trie_new)
2294   {
2295     /* Finish prefix trie pruning */
2296
2297     if (!tab->trie_lock_count)
2298     {
2299       rfree(tab->trie->lp);
2300     }
2301     else
2302     {
2303       ASSERT(!tab->trie_old);
2304       tab->trie_old = tab->trie;
2305       tab->trie_old_lock_count = tab->trie_lock_count;
2306       tab->trie_lock_count = 0;
2307     }
2308
2309     tab->trie = tab->trie_new;
2310     tab->trie_new = NULL;
2311     tab->prune_trie = 0;
2312   }
2313   else
2314   {
2315     /* Schedule prefix trie pruning */
2316     if (tab->trie && !tab->trie_old && (tab->trie->prefix_count > (2 * tab->fib.entries)))
2317     {
2318       /* state change 0->1, 2->3 */
2319       tab->prune_state |= 1;
2320       tab->prune_trie = 1;
2321     }
2322   }
2323
2324   if (tab->prune_state > 0)
2325     ev_schedule(tab->rt_event);
2326
2327   /* FIXME: This should be handled in a better way */
2328   rt_prune_sources();
2329
2330   /* Close flushed channels */
2331   WALK_LIST2_DELSAFE(c, n, x, tab->channels, table_node)
2332     if (c->flush_active)
2333       {
2334         c->flush_active = 0;
2335         channel_set_state(c, CS_DOWN);
2336       }
2337
2338   return;
2339 }
2340
2341 /**
2342  * rt_lock_trie - lock a prefix trie of a routing table
2343  * @tab: routing table with prefix trie to be locked
2344  *
2345  * The prune loop may rebuild the prefix trie and invalidate f_trie_walk_state
2346  * structures. Therefore, asynchronous walks should lock the prefix trie using
2347  * this function. That allows the prune loop to rebuild the trie, but postpones
2348  * its freeing until all walks are done (unlocked by rt_unlock_trie()).
2349  *
2350  * Return a current trie that will be locked, the value should be passed back to
2351  * rt_unlock_trie() for unlocking.
2352  *
2353  */
2354 struct f_trie *
2355 rt_lock_trie(rtable *tab)
2356 {
2357   ASSERT(tab->trie);
2358
2359   tab->trie_lock_count++;
2360   return tab->trie;
2361 }
2362
2363 /**
2364  * rt_unlock_trie - unlock a prefix trie of a routing table
2365  * @tab: routing table with prefix trie to be locked
2366  * @trie: value returned by matching rt_lock_trie()
2367  *
2368  * Done for trie locked by rt_lock_trie() after walk over the trie is done.
2369  * It may free the trie and schedule next trie pruning.
2370  */
2371 void
2372 rt_unlock_trie(rtable *tab, struct f_trie *trie)
2373 {
2374   ASSERT(trie);
2375
2376   if (trie == tab->trie)
2377   {
2378     /* Unlock the current prefix trie */
2379     ASSERT(tab->trie_lock_count);
2380     tab->trie_lock_count--;
2381   }
2382   else if (trie == tab->trie_old)
2383   {
2384     /* Unlock the old prefix trie */
2385     ASSERT(tab->trie_old_lock_count);
2386     tab->trie_old_lock_count--;
2387
2388     /* Free old prefix trie that is no longer needed */
2389     if (!tab->trie_old_lock_count)
2390     {
2391       rfree(tab->trie_old->lp);
2392       tab->trie_old = NULL;
2393
2394       /* Kick prefix trie pruning that was postponed */
2395       if (tab->trie && (tab->trie->prefix_count > (2 * tab->fib.entries)))
2396       {
2397         tab->prune_trie = 1;
2398         rt_schedule_prune(tab);
2399       }
2400     }
2401   }
2402   else
2403     log(L_BUG "Invalid arg to rt_unlock_trie()");
2404 }
2405
2406
2407 void
2408 rt_preconfig(struct config *c)
2409 {
2410   init_list(&c->tables);
2411
2412   rt_new_table(cf_get_symbol("master4"), NET_IP4);
2413   rt_new_table(cf_get_symbol("master6"), NET_IP6);
2414 }
2415
2416 void
2417 rt_postconfig(struct config *c)
2418 {
2419   uint num_tables = list_length(&c->tables);
2420   btime def_gc_period = 400 MS * num_tables;
2421   def_gc_period = MAX(def_gc_period, 10 S);
2422   def_gc_period = MIN(def_gc_period, 600 S);
2423
2424   struct rtable_config *rc;
2425   WALK_LIST(rc, c->tables)
2426     if (rc->gc_period == (uint) -1)
2427       rc->gc_period = (uint) def_gc_period;
2428 }
2429
2430
2431 /*
2432  * Some functions for handing internal next hop updates
2433  * triggered by rt_schedule_nhu().
2434  */
2435
2436 void
2437 rta_apply_hostentry(rta *a, struct hostentry *he, mpls_label_stack *mls)
2438 {
2439   a->hostentry = he;
2440   a->dest = he->dest;
2441   a->igp_metric = he->igp_metric;
2442
2443   if (a->dest != RTD_UNICAST)
2444   {
2445     /* No nexthop */
2446 no_nexthop:
2447     a->nh = (struct nexthop) {};
2448     if (mls)
2449     { /* Store the label stack for later changes */
2450       a->nh.labels_orig = a->nh.labels = mls->len;
2451       memcpy(a->nh.label, mls->stack, mls->len * sizeof(u32));
2452     }
2453     return;
2454   }
2455
2456   if (((!mls) || (!mls->len)) && he->nexthop_linkable)
2457   { /* Just link the nexthop chain, no label append happens. */
2458     memcpy(&(a->nh), &(he->src->nh), nexthop_size(&(he->src->nh)));
2459     return;
2460   }
2461
2462   struct nexthop *nhp = NULL, *nhr = NULL;
2463   int skip_nexthop = 0;
2464
2465   for (struct nexthop *nh = &(he->src->nh); nh; nh = nh->next)
2466   {
2467     if (skip_nexthop)
2468       skip_nexthop--;
2469     else
2470     {
2471       nhr = nhp;
2472       nhp = (nhp ? (nhp->next = lp_alloc(rte_update_pool, NEXTHOP_MAX_SIZE)) : &(a->nh));
2473     }
2474
2475     memset(nhp, 0, NEXTHOP_MAX_SIZE);
2476     nhp->iface = nh->iface;
2477     nhp->weight = nh->weight;
2478
2479     if (mls)
2480     {
2481       nhp->labels = nh->labels + mls->len;
2482       nhp->labels_orig = mls->len;
2483       if (nhp->labels <= MPLS_MAX_LABEL_STACK)
2484       {
2485         memcpy(nhp->label, nh->label, nh->labels * sizeof(u32)); /* First the hostentry labels */
2486         memcpy(&(nhp->label[nh->labels]), mls->stack, mls->len * sizeof(u32)); /* Then the bottom labels */
2487       }
2488       else
2489       {
2490         log(L_WARN "Sum of label stack sizes %d + %d = %d exceedes allowed maximum (%d)",
2491             nh->labels, mls->len, nhp->labels, MPLS_MAX_LABEL_STACK);
2492         skip_nexthop++;
2493         continue;
2494       }
2495     }
2496     else if (nh->labels)
2497     {
2498       nhp->labels = nh->labels;
2499       nhp->labels_orig = 0;
2500       memcpy(nhp->label, nh->label, nh->labels * sizeof(u32));
2501     }
2502
2503     if (ipa_nonzero(nh->gw))
2504     {
2505       nhp->gw = nh->gw;                 /* Router nexthop */
2506       nhp->flags |= (nh->flags & RNF_ONLINK);
2507     }
2508     else if (!(nh->iface->flags & IF_MULTIACCESS) || (nh->iface->flags & IF_LOOPBACK))
2509       nhp->gw = IPA_NONE;               /* PtP link - no need for nexthop */
2510     else if (ipa_nonzero(he->link))
2511       nhp->gw = he->link;               /* Device nexthop with link-local address known */
2512     else
2513       nhp->gw = he->addr;               /* Device nexthop with link-local address unknown */
2514   }
2515
2516   if (skip_nexthop)
2517     if (nhr)
2518       nhr->next = NULL;
2519     else
2520     {
2521       a->dest = RTD_UNREACHABLE;
2522       log(L_WARN "No valid nexthop remaining, setting route unreachable");
2523       goto no_nexthop;
2524     }
2525 }
2526
2527 static inline int
2528 rta_next_hop_outdated(rta *a)
2529 {
2530   struct hostentry *he = a->hostentry;
2531
2532   if (!he)
2533     return 0;
2534
2535   if (!he->src)
2536     return a->dest != RTD_UNREACHABLE;
2537
2538   return (a->dest != he->dest) || (a->igp_metric != he->igp_metric) ||
2539     (!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh));
2540 }
2541
2542 static inline rte *
2543 rt_next_hop_update_rte(rtable *tab UNUSED, rte *old)
2544 {
2545   if (!rta_next_hop_outdated(old->attrs))
2546     return NULL;
2547
2548   rta *a = alloca(RTA_MAX_SIZE);
2549   memcpy(a, old->attrs, rta_size(old->attrs));
2550
2551   mpls_label_stack mls = { .len = a->nh.labels_orig };
2552   memcpy(mls.stack, &a->nh.label[a->nh.labels - mls.len], mls.len * sizeof(u32));
2553
2554   rta_apply_hostentry(a, old->attrs->hostentry, &mls);
2555   a->cached = 0;
2556
2557   rte *e = sl_alloc(rte_slab);
2558   memcpy(e, old, sizeof(rte));
2559   e->attrs = rta_lookup(a);
2560   rt_lock_source(e->src);
2561
2562   return e;
2563 }
2564
2565
2566 #ifdef CONFIG_BGP
2567
2568 static inline int
2569 net_flow_has_dst_prefix(const net_addr *n)
2570 {
2571   ASSUME(net_is_flow(n));
2572
2573   if (n->pxlen)
2574     return 1;
2575
2576   if (n->type == NET_FLOW4)
2577   {
2578     const net_addr_flow4 *n4 = (void *) n;
2579     return (n4->length > sizeof(net_addr_flow4)) && (n4->data[0] == FLOW_TYPE_DST_PREFIX);
2580   }
2581   else
2582   {
2583     const net_addr_flow6 *n6 = (void *) n;
2584     return (n6->length > sizeof(net_addr_flow6)) && (n6->data[0] == FLOW_TYPE_DST_PREFIX);
2585   }
2586 }
2587
2588 static inline int
2589 rta_as_path_is_empty(rta *a)
2590 {
2591   eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
2592   return !e || (as_path_getlen(e->u.ptr) == 0);
2593 }
2594
2595 static inline u32
2596 rta_get_first_asn(rta *a)
2597 {
2598   eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
2599   u32 asn;
2600
2601   return (e && as_path_get_first_regular(e->u.ptr, &asn)) ? asn : 0;
2602 }
2603
2604 int
2605 rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, rta *a, int interior)
2606 {
2607   ASSERT(rt_is_ip(tab_ip));
2608   ASSERT(rt_is_flow(tab_flow));
2609   ASSERT(tab_ip->trie);
2610
2611   /* RFC 8955 6. a) Flowspec has defined dst prefix */
2612   if (!net_flow_has_dst_prefix(n))
2613     return 0;
2614
2615   /* RFC 9117 4.1. Accept  AS_PATH is empty (fr */
2616   if (interior && rta_as_path_is_empty(a))
2617     return 1;
2618
2619
2620   /* RFC 8955 6. b) Flowspec and its best-match route have the same originator */
2621
2622   /* Find flowspec dst prefix */
2623   net_addr dst;
2624   if (n->type == NET_FLOW4)
2625     net_fill_ip4(&dst, net4_prefix(n), net4_pxlen(n));
2626   else
2627     net_fill_ip6(&dst, net6_prefix(n), net6_pxlen(n));
2628
2629   /* Find best-match BGP unicast route for flowspec dst prefix */
2630   net *nb = net_route(tab_ip, &dst);
2631   rte *rb = nb ? nb->routes : NULL;
2632
2633   /* Register prefix to trie for tracking further changes */
2634   int max_pxlen = (n->type == NET_FLOW4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH;
2635   trie_add_prefix(tab_flow->flowspec_trie, &dst, (nb ? nb->n.addr->pxlen : 0), max_pxlen);
2636
2637   /* No best-match BGP route -> no flowspec */
2638   if (!rb || (rb->attrs->source != RTS_BGP))
2639     return 0;
2640
2641   /* Find ORIGINATOR_ID values */
2642   u32 orig_a = ea_get_int(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);
2643   u32 orig_b = ea_get_int(rb->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);
2644
2645   /* Originator is either ORIGINATOR_ID (if present), or BGP neighbor address (if not) */
2646   if ((orig_a != orig_b) || (!orig_a && !orig_b && !ipa_equal(a->from, rb->attrs->from)))
2647     return 0;
2648
2649
2650   /* Find ASN of the best-match route, for use in next checks */
2651   u32 asn_b = rta_get_first_asn(rb->attrs);
2652   if (!asn_b)
2653     return 0;
2654
2655   /* RFC 9117 4.2. For EBGP, flowspec and its best-match route are from the same AS */
2656   if (!interior && (rta_get_first_asn(a) != asn_b))
2657     return 0;
2658
2659   /* RFC 8955 6. c) More-specific routes are from the same AS as the best-match route */
2660   TRIE_WALK(tab_ip->trie, subnet, &dst)
2661   {
2662     net *nc = net_find_valid(tab_ip, &subnet);
2663     if (!nc)
2664       continue;
2665
2666     rte *rc = nc->routes;
2667     if (rc->attrs->source != RTS_BGP)
2668       return 0;
2669
2670     if (rta_get_first_asn(rc->attrs) != asn_b)
2671       return 0;
2672   }
2673   TRIE_WALK_END;
2674
2675   return 1;
2676 }
2677
2678 #endif /* CONFIG_BGP */
2679
2680 static rte *
2681 rt_flowspec_update_rte(rtable *tab, rte *r)
2682 {
2683 #ifdef CONFIG_BGP
2684   if ((r->attrs->source != RTS_BGP) || (r->sender->proto != r->src->proto))
2685     return NULL;
2686
2687   struct bgp_channel *bc = (struct bgp_channel *) r->sender;
2688   if (!bc->base_table)
2689     return NULL;
2690
2691   const net_addr *n = r->net->n.addr;
2692   struct bgp_proto *p = (void *) r->src->proto;
2693   int valid = rt_flowspec_check(bc->base_table, tab, n, r->attrs, p->is_interior);
2694   int dest = valid ? RTD_NONE : RTD_UNREACHABLE;
2695
2696   if (dest == r->attrs->dest)
2697     return NULL;
2698
2699   rta *a = alloca(RTA_MAX_SIZE);
2700   memcpy(a, r->attrs, rta_size(r->attrs));
2701   a->dest = dest;
2702   a->cached = 0;
2703
2704   rte *new = sl_alloc(rte_slab);
2705   memcpy(new, r, sizeof(rte));
2706   new->attrs = rta_lookup(a);
2707   rt_lock_source(new->src);
2708
2709   return new;
2710 #else
2711   return NULL;
2712 #endif
2713 }
2714
2715
2716 static inline int
2717 rt_next_hop_update_net(rtable *tab, net *n)
2718 {
2719   rte **k, *e, *new, *old_best, **new_best;
2720   int count = 0;
2721   int free_old_best = 0;
2722
2723   old_best = n->routes;
2724   if (!old_best)
2725     return 0;
2726
2727   for (k = &n->routes; e = *k; k = &e->next)
2728   {
2729     if (!net_is_flow(n->n.addr))
2730       new = rt_next_hop_update_rte(tab, e);
2731     else
2732       new = rt_flowspec_update_rte(tab, e);
2733
2734     if (new)
2735       {
2736         *k = new;
2737
2738         rte_trace_in(D_ROUTES, new->sender, new, "updated");
2739         rte_announce_i(tab, RA_ANY, n, new, e, NULL, NULL);
2740
2741         /* Call a pre-comparison hook */
2742         /* Not really an efficient way to compute this */
2743         if (e->src->proto->rte_recalculate)
2744           e->src->proto->rte_recalculate(tab, n, new, e, NULL);
2745
2746         if (e != old_best)
2747           rte_free_quick(e);
2748         else /* Freeing of the old best rte is postponed */
2749           free_old_best = 1;
2750
2751         e = new;
2752         count++;
2753       }
2754   }
2755
2756   if (!count)
2757     return 0;
2758
2759   /* Find the new best route */
2760   new_best = NULL;
2761   for (k = &n->routes; e = *k; k = &e->next)
2762     {
2763       if (!new_best || rte_better(e, *new_best))
2764         new_best = k;
2765     }
2766
2767   /* Relink the new best route to the first position */
2768   new = *new_best;
2769   if (new != n->routes)
2770     {
2771       *new_best = new->next;
2772       new->next = n->routes;
2773       n->routes = new;
2774     }
2775
2776   /* Announce the new best route */
2777   if (new != old_best)
2778     rte_trace_in(D_ROUTES, new->sender, new, "updated [best]");
2779
2780   /* Propagate changes */
2781   rte_announce_i(tab, RA_UNDEF, n, NULL, NULL, n->routes, old_best);
2782
2783   if (free_old_best)
2784     rte_free_quick(old_best);
2785
2786   return count;
2787 }
2788
2789 static void
2790 rt_next_hop_update(rtable *tab)
2791 {
2792   struct fib_iterator *fit = &tab->nhu_fit;
2793   int max_feed = 32;
2794
2795   if (tab->nhu_state == NHU_CLEAN)
2796     return;
2797
2798   if (tab->nhu_state == NHU_SCHEDULED)
2799     {
2800       FIB_ITERATE_INIT(fit, &tab->fib);
2801       tab->nhu_state = NHU_RUNNING;
2802
2803       if (tab->flowspec_trie)
2804         rt_flowspec_reset_trie(tab);
2805     }
2806
2807   FIB_ITERATE_START(&tab->fib, fit, net, n)
2808     {
2809       if (max_feed <= 0)
2810         {
2811           FIB_ITERATE_PUT(fit);
2812           ev_schedule(tab->rt_event);
2813           return;
2814         }
2815       max_feed -= rt_next_hop_update_net(tab, n);
2816     }
2817   FIB_ITERATE_END;
2818
2819   /* State change:
2820    *   NHU_DIRTY   -> NHU_SCHEDULED
2821    *   NHU_RUNNING -> NHU_CLEAN
2822    */
2823   tab->nhu_state &= 1;
2824
2825   if (tab->nhu_state != NHU_CLEAN)
2826     ev_schedule(tab->rt_event);
2827 }
2828
2829
2830 struct rtable_config *
2831 rt_new_table(struct symbol *s, uint addr_type)
2832 {
2833   /* Hack that allows to 'redefine' the master table */
2834   if ((s->class == SYM_TABLE) &&
2835       (s->table == new_config->def_tables[addr_type]) &&
2836       ((addr_type == NET_IP4) || (addr_type == NET_IP6)))
2837     return s->table;
2838
2839   struct rtable_config *c = cfg_allocz(sizeof(struct rtable_config));
2840
2841   cf_define_symbol(s, SYM_TABLE, table, c);
2842   c->name = s->name;
2843   c->addr_type = addr_type;
2844   c->gc_threshold = 1000;
2845   c->gc_period = (uint) -1;     /* set in rt_postconfig() */
2846   c->min_settle_time = 1 S;
2847   c->max_settle_time = 20 S;
2848
2849   add_tail(&new_config->tables, &c->n);
2850
2851   /* First table of each type is kept as default */
2852   if (! new_config->def_tables[addr_type])
2853     new_config->def_tables[addr_type] = c;
2854
2855   return c;
2856 }
2857
2858 /**
2859  * rt_lock_table - lock a routing table
2860  * @r: routing table to be locked
2861  *
2862  * Lock a routing table, because it's in use by a protocol,
2863  * preventing it from being freed when it gets undefined in a new
2864  * configuration.
2865  */
2866 void
2867 rt_lock_table(rtable *r)
2868 {
2869   r->use_count++;
2870 }
2871
2872 /**
2873  * rt_unlock_table - unlock a routing table
2874  * @r: routing table to be unlocked
2875  *
2876  * Unlock a routing table formerly locked by rt_lock_table(),
2877  * that is decrease its use count and delete it if it's scheduled
2878  * for deletion by configuration changes.
2879  */
2880 void
2881 rt_unlock_table(rtable *r)
2882 {
2883   if (!--r->use_count && r->deleted)
2884     {
2885       struct config *conf = r->deleted;
2886
2887       /* Delete the routing table by freeing its pool */
2888       rt_shutdown(r);
2889       config_del_obstacle(conf);
2890     }
2891 }
2892
2893 static int
2894 rt_reconfigure(rtable *tab, struct rtable_config *new, struct rtable_config *old)
2895 {
2896   if ((new->addr_type != old->addr_type) ||
2897       (new->sorted != old->sorted) ||
2898       (new->trie_used != old->trie_used))
2899     return 0;
2900
2901   DBG("\t%s: same\n", new->name);
2902   new->table = tab;
2903   tab->name = new->name;
2904   tab->config = new;
2905
2906   return 1;
2907 }
2908
2909 static struct rtable_config *
2910 rt_find_table_config(struct config *cf, char *name)
2911 {
2912   struct symbol *sym = cf_find_symbol(cf, name);
2913   return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL;
2914 }
2915
2916 /**
2917  * rt_commit - commit new routing table configuration
2918  * @new: new configuration
2919  * @old: original configuration or %NULL if it's boot time config
2920  *
2921  * Scan differences between @old and @new configuration and modify
2922  * the routing tables according to these changes. If @new defines a
2923  * previously unknown table, create it, if it omits a table existing
2924  * in @old, schedule it for deletion (it gets deleted when all protocols
2925  * disconnect from it by calling rt_unlock_table()), if it exists
2926  * in both configurations, leave it unchanged.
2927  */
2928 void
2929 rt_commit(struct config *new, struct config *old)
2930 {
2931   struct rtable_config *o, *r;
2932
2933   DBG("rt_commit:\n");
2934   if (old)
2935     {
2936       WALK_LIST(o, old->tables)
2937         {
2938           rtable *tab = o->table;
2939           if (tab->deleted)
2940             continue;
2941
2942           r = rt_find_table_config(new, o->name);
2943           if (r && !new->shutdown && rt_reconfigure(tab, r, o))
2944             continue;
2945
2946           DBG("\t%s: deleted\n", o->name);
2947           tab->deleted = old;
2948           config_add_obstacle(old);
2949           rt_lock_table(tab);
2950           rt_unlock_table(tab);
2951         }
2952     }
2953
2954   WALK_LIST(r, new->tables)
2955     if (!r->table)
2956       {
2957         r->table = rt_setup(rt_table_pool, r);
2958         DBG("\t%s: created\n", r->name);
2959         add_tail(&routing_tables, &r->table->n);
2960       }
2961   DBG("\tdone\n");
2962 }
2963
2964 static inline void
2965 do_feed_channel(struct channel *c, net *n, rte *e)
2966 {
2967   rte_update_lock();
2968   if (c->ra_mode == RA_ACCEPTED)
2969     rt_notify_accepted(c, n, NULL, NULL, c->refeeding);
2970   else if (c->ra_mode == RA_MERGED)
2971     rt_notify_merged(c, n, NULL, NULL, e, e, c->refeeding);
2972   else /* RA_BASIC */
2973     rt_notify_basic(c, n, e, e, c->refeeding);
2974   rte_update_unlock();
2975 }
2976
2977 /**
2978  * rt_feed_channel - advertise all routes to a channel
2979  * @c: channel to be fed
2980  *
2981  * This function performs one pass of advertisement of routes to a channel that
2982  * is in the ES_FEEDING state. It is called by the protocol code as long as it
2983  * has something to do. (We avoid transferring all the routes in single pass in
2984  * order not to monopolize CPU time.)
2985  */
2986 int
2987 rt_feed_channel(struct channel *c)
2988 {
2989   struct fib_iterator *fit = &c->feed_fit;
2990   int max_feed = 256;
2991
2992   ASSERT(c->export_state == ES_FEEDING);
2993
2994   if (!c->feed_active)
2995     {
2996       FIB_ITERATE_INIT(fit, &c->table->fib);
2997       c->feed_active = 1;
2998     }
2999
3000   FIB_ITERATE_START(&c->table->fib, fit, net, n)
3001     {
3002       rte *e = n->routes;
3003       if (max_feed <= 0)
3004         {
3005           FIB_ITERATE_PUT(fit);
3006           return 0;
3007         }
3008
3009       if ((c->ra_mode == RA_OPTIMAL) ||
3010           (c->ra_mode == RA_ACCEPTED) ||
3011           (c->ra_mode == RA_MERGED))
3012         if (rte_is_valid(e))
3013           {
3014             /* In the meantime, the protocol may fell down */
3015             if (c->export_state != ES_FEEDING)
3016               goto done;
3017
3018             do_feed_channel(c, n, e);
3019             max_feed--;
3020           }
3021
3022       if (c->ra_mode == RA_ANY)
3023         for(e = n->routes; e; e = e->next)
3024           {
3025             /* In the meantime, the protocol may fell down */
3026             if (c->export_state != ES_FEEDING)
3027               goto done;
3028
3029             if (!rte_is_valid(e))
3030               continue;
3031
3032             do_feed_channel(c, n, e);
3033             max_feed--;
3034           }
3035     }
3036   FIB_ITERATE_END;
3037
3038 done:
3039   c->feed_active = 0;
3040   return 1;
3041 }
3042
3043 /**
3044  * rt_feed_baby_abort - abort protocol feeding
3045  * @c: channel
3046  *
3047  * This function is called by the protocol code when the protocol stops or
3048  * ceases to exist during the feeding.
3049  */
3050 void
3051 rt_feed_channel_abort(struct channel *c)
3052 {
3053   if (c->feed_active)
3054     {
3055       /* Unlink the iterator */
3056       fit_get(&c->table->fib, &c->feed_fit);
3057       c->feed_active = 0;
3058     }
3059 }
3060
3061
3062 /*
3063  *      Import table
3064  */
3065
3066 int
3067 rte_update_in(struct channel *c, const net_addr *n, rte *new, struct rte_src *src)
3068 {
3069   struct rtable *tab = c->in_table;
3070   rte *old, **pos;
3071   net *net;
3072
3073   if (new)
3074   {
3075     net = net_get(tab, n);
3076
3077     if (!rta_is_cached(new->attrs))
3078       new->attrs = rta_lookup(new->attrs);
3079   }
3080   else
3081   {
3082     net = net_find(tab, n);
3083
3084     if (!net)
3085       goto drop_withdraw;
3086   }
3087
3088   /* Find the old rte */
3089   for (pos = &net->routes; old = *pos; pos = &old->next)
3090     if (old->src == src)
3091     {
3092       if (new && rte_same(old, new))
3093       {
3094         /* Refresh the old rte, continue with update to main rtable */
3095         if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
3096         {
3097           old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
3098
3099           if (c->proto->rte_update_in_notify)
3100             c->proto->rte_update_in_notify(c, n, old, src);
3101
3102           return 1;
3103         }
3104
3105         goto drop_update;
3106       }
3107
3108       /* Move iterator if needed */
3109       if (old == c->reload_next_rte)
3110         c->reload_next_rte = old->next;
3111
3112       /* Remove the old rte */
3113       *pos = old->next;
3114       rte_free_quick(old);
3115       tab->rt_count--;
3116
3117       break;
3118     }
3119
3120   if (!new)
3121   {
3122     if (!old)
3123       goto drop_withdraw;
3124
3125     if (!net->routes)
3126       fib_delete(&tab->fib, net);
3127
3128     if (c->proto->rte_update_in_notify)
3129       c->proto->rte_update_in_notify(c, n, NULL, src);
3130
3131     return 1;
3132   }
3133
3134   struct channel_limit *l = &c->rx_limit;
3135   if (l->action && !old)
3136   {
3137     if (tab->rt_count >= l->limit)
3138       channel_notify_limit(c, l, PLD_RX, tab->rt_count);
3139
3140     if (l->state == PLS_BLOCKED)
3141     {
3142       /* Required by rte_trace_in() */
3143       new->net = net;
3144
3145       rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
3146       goto drop_update;
3147     }
3148   }
3149
3150   /* Insert the new rte */
3151   rte *e = rte_do_cow(new);
3152   e->flags |= REF_COW;
3153   e->net = net;
3154   e->sender = c;
3155   e->lastmod = current_time();
3156   e->next = *pos;
3157   *pos = e;
3158   tab->rt_count++;
3159
3160   if (c->proto->rte_update_in_notify)
3161     c->proto->rte_update_in_notify(c, n, e, src);
3162
3163   return 1;
3164
3165 drop_update:
3166   c->stats.imp_updates_received++;
3167   c->stats.imp_updates_ignored++;
3168   rte_free(new);
3169
3170   if (!net->routes)
3171     fib_delete(&tab->fib, net);
3172
3173   return 0;
3174
3175 drop_withdraw:
3176   c->stats.imp_withdraws_received++;
3177   c->stats.imp_withdraws_ignored++;
3178   return 0;
3179 }
3180
3181 int
3182 rt_reload_channel(struct channel *c)
3183 {
3184   struct rtable *tab = c->in_table;
3185   struct fib_iterator *fit = &c->reload_fit;
3186   int max_feed = 64;
3187
3188   ASSERT(c->channel_state == CS_UP);
3189
3190   if (!c->reload_active)
3191   {
3192     FIB_ITERATE_INIT(fit, &tab->fib);
3193     c->reload_active = 1;
3194   }
3195
3196   do {
3197     for (rte *e = c->reload_next_rte; e; e = e->next)
3198     {
3199       if (max_feed-- <= 0)
3200       {
3201         c->reload_next_rte = e;
3202         debug("%s channel reload burst split (max_feed=%d)", c->proto->name, max_feed);
3203         return 0;
3204       }
3205
3206       rte_update2(c, e->net->n.addr, rte_do_cow(e), e->src);
3207     }
3208
3209     c->reload_next_rte = NULL;
3210
3211     FIB_ITERATE_START(&tab->fib, fit, net, n)
3212     {
3213       if (c->reload_next_rte = n->routes)
3214       {
3215         FIB_ITERATE_PUT_NEXT(fit, &tab->fib);
3216         break;
3217       }
3218     }
3219     FIB_ITERATE_END;
3220   }
3221   while (c->reload_next_rte);
3222
3223   c->reload_active = 0;
3224   return 1;
3225 }
3226
3227 void
3228 rt_reload_channel_abort(struct channel *c)
3229 {
3230   if (c->reload_active)
3231   {
3232     /* Unlink the iterator */
3233     fit_get(&c->in_table->fib, &c->reload_fit);
3234     c->reload_next_rte = NULL;
3235     c->reload_active = 0;
3236   }
3237 }
3238
3239 void
3240 rt_prune_sync(rtable *t, int all)
3241 {
3242   struct fib_iterator fit;
3243
3244   FIB_ITERATE_INIT(&fit, &t->fib);
3245
3246 again:
3247   FIB_ITERATE_START(&t->fib, &fit, net, n)
3248   {
3249     rte *e, **ee = &n->routes;
3250
3251     while (e = *ee)
3252     {
3253       if (all || (e->flags & (REF_STALE | REF_DISCARD)))
3254       {
3255         *ee = e->next;
3256         rte_free_quick(e);
3257         t->rt_count--;
3258       }
3259       else
3260         ee = &e->next;
3261     }
3262
3263     if (all || !n->routes)
3264     {
3265       FIB_ITERATE_PUT(&fit);
3266       fib_delete(&t->fib, n);
3267       goto again;
3268     }
3269   }
3270   FIB_ITERATE_END;
3271 }
3272
3273
3274 /*
3275  *      Export table
3276  */
3277
3278 int
3279 rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, int refeed)
3280 {
3281   struct rtable *tab = c->out_table;
3282   struct rte_src *src;
3283   rte *old, **pos;
3284   net *net;
3285
3286   if (new)
3287   {
3288     net = net_get(tab, n);
3289     src = new->src;
3290
3291     if (!rta_is_cached(new->attrs))
3292       new->attrs = rta_lookup(new->attrs);
3293   }
3294   else
3295   {
3296     net = net_find(tab, n);
3297     src = old0->src;
3298
3299     if (!net)
3300       goto drop_withdraw;
3301   }
3302
3303   /* Find the old rte */
3304   for (pos = &net->routes; old = *pos; pos = &old->next)
3305     if ((c->ra_mode != RA_ANY) || (old->src == src))
3306     {
3307       if (new && rte_same(old, new))
3308       {
3309         /* REF_STALE / REF_DISCARD not used in export table */
3310         /*
3311         if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
3312         {
3313           old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
3314           return 1;
3315         }
3316         */
3317
3318         goto drop_update;
3319       }
3320
3321       /* Remove the old rte */
3322       *pos = old->next;
3323       rte_free_quick(old);
3324       tab->rt_count--;
3325
3326       break;
3327     }
3328
3329   if (!new)
3330   {
3331     if (!old)
3332       goto drop_withdraw;
3333
3334     if (!net->routes)
3335       fib_delete(&tab->fib, net);
3336
3337     return 1;
3338   }
3339
3340   /* Insert the new rte */
3341   rte *e = rte_do_cow(new);
3342   e->flags |= REF_COW;
3343   e->net = net;
3344   e->sender = c;
3345   e->lastmod = current_time();
3346   e->next = *pos;
3347   *pos = e;
3348   tab->rt_count++;
3349   return 1;
3350
3351 drop_update:
3352   return refeed;
3353
3354 drop_withdraw:
3355   return 0;
3356 }
3357
3358
3359 /*
3360  *      Hostcache
3361  */
3362
3363 static inline u32
3364 hc_hash(ip_addr a, rtable *dep)
3365 {
3366   return ipa_hash(a) ^ ptr_hash(dep);
3367 }
3368
3369 static inline void
3370 hc_insert(struct hostcache *hc, struct hostentry *he)
3371 {
3372   uint k = he->hash_key >> hc->hash_shift;
3373   he->next = hc->hash_table[k];
3374   hc->hash_table[k] = he;
3375 }
3376
3377 static inline void
3378 hc_remove(struct hostcache *hc, struct hostentry *he)
3379 {
3380   struct hostentry **hep;
3381   uint k = he->hash_key >> hc->hash_shift;
3382
3383   for (hep = &hc->hash_table[k]; *hep != he; hep = &(*hep)->next);
3384   *hep = he->next;
3385 }
3386
3387 #define HC_DEF_ORDER 10
3388 #define HC_HI_MARK *4
3389 #define HC_HI_STEP 2
3390 #define HC_HI_ORDER 16                  /* Must be at most 16 */
3391 #define HC_LO_MARK /5
3392 #define HC_LO_STEP 2
3393 #define HC_LO_ORDER 10
3394
3395 static void
3396 hc_alloc_table(struct hostcache *hc, pool *p, unsigned order)
3397 {
3398   uint hsize = 1 << order;
3399   hc->hash_order = order;
3400   hc->hash_shift = 32 - order;
3401   hc->hash_max = (order >= HC_HI_ORDER) ? ~0U : (hsize HC_HI_MARK);
3402   hc->hash_min = (order <= HC_LO_ORDER) ?  0U : (hsize HC_LO_MARK);
3403
3404   hc->hash_table = mb_allocz(p, hsize * sizeof(struct hostentry *));
3405 }
3406
3407 static void
3408 hc_resize(struct hostcache *hc, pool *p, unsigned new_order)
3409 {
3410   struct hostentry **old_table = hc->hash_table;
3411   struct hostentry *he, *hen;
3412   uint old_size = 1 << hc->hash_order;
3413   uint i;
3414
3415   hc_alloc_table(hc, p, new_order);
3416   for (i = 0; i < old_size; i++)
3417     for (he = old_table[i]; he != NULL; he=hen)
3418       {
3419         hen = he->next;
3420         hc_insert(hc, he);
3421       }
3422   mb_free(old_table);
3423 }
3424
3425 static struct hostentry *
3426 hc_new_hostentry(struct hostcache *hc, pool *p, ip_addr a, ip_addr ll, rtable *dep, unsigned k)
3427 {
3428   struct hostentry *he = sl_alloc(hc->slab);
3429
3430   *he = (struct hostentry) {
3431     .addr = a,
3432     .link = ll,
3433     .tab = dep,
3434     .hash_key = k,
3435   };
3436
3437   add_tail(&hc->hostentries, &he->ln);
3438   hc_insert(hc, he);
3439
3440   hc->hash_items++;
3441   if (hc->hash_items > hc->hash_max)
3442     hc_resize(hc, p, hc->hash_order + HC_HI_STEP);
3443
3444   return he;
3445 }
3446
3447 static void
3448 hc_delete_hostentry(struct hostcache *hc, pool *p, struct hostentry *he)
3449 {
3450   rta_free(he->src);
3451
3452   rem_node(&he->ln);
3453   hc_remove(hc, he);
3454   sl_free(he);
3455
3456   hc->hash_items--;
3457   if (hc->hash_items < hc->hash_min)
3458     hc_resize(hc, p, hc->hash_order - HC_LO_STEP);
3459 }
3460
3461 static void
3462 rt_init_hostcache(rtable *tab)
3463 {
3464   struct hostcache *hc = mb_allocz(tab->rp, sizeof(struct hostcache));
3465   init_list(&hc->hostentries);
3466
3467   hc->hash_items = 0;
3468   hc_alloc_table(hc, tab->rp, HC_DEF_ORDER);
3469   hc->slab = sl_new(tab->rp, sizeof(struct hostentry));
3470
3471   hc->lp = lp_new(tab->rp);
3472   hc->trie = f_new_trie(hc->lp, 0);
3473
3474   tab->hostcache = hc;
3475 }
3476
3477 static void
3478 rt_free_hostcache(rtable *tab)
3479 {
3480   struct hostcache *hc = tab->hostcache;
3481
3482   node *n;
3483   WALK_LIST(n, hc->hostentries)
3484     {
3485       struct hostentry *he = SKIP_BACK(struct hostentry, ln, n);
3486       rta_free(he->src);
3487
3488       if (he->uc)
3489         log(L_ERR "Hostcache is not empty in table %s", tab->name);
3490     }
3491
3492   /* Freed automagically by the resource pool
3493   rfree(hc->slab);
3494   rfree(hc->lp);
3495   mb_free(hc->hash_table);
3496   mb_free(hc);
3497   */
3498 }
3499
3500 static void
3501 rt_notify_hostcache(rtable *tab, net *net)
3502 {
3503   if (tab->hcu_scheduled)
3504     return;
3505
3506   if (trie_match_net(tab->hostcache->trie, net->n.addr))
3507     rt_schedule_hcu(tab);
3508 }
3509
3510 static int
3511 if_local_addr(ip_addr a, struct iface *i)
3512 {
3513   struct ifa *b;
3514
3515   WALK_LIST(b, i->addrs)
3516     if (ipa_equal(a, b->ip))
3517       return 1;
3518
3519   return 0;
3520 }
3521
3522 u32
3523 rt_get_igp_metric(rte *rt)
3524 {
3525   eattr *ea = ea_find(rt->attrs->eattrs, EA_GEN_IGP_METRIC);
3526
3527   if (ea)
3528     return ea->u.data;
3529
3530   if (rt->attrs->source == RTS_DEVICE)
3531     return 0;
3532
3533   if (rt->src->proto->rte_igp_metric)
3534     return rt->src->proto->rte_igp_metric(rt);
3535
3536   return IGP_METRIC_UNKNOWN;
3537 }
3538
3539 static int
3540 rt_update_hostentry(rtable *tab, struct hostentry *he)
3541 {
3542   rta *old_src = he->src;
3543   int direct = 0;
3544   int pxlen = 0;
3545
3546   /* Reset the hostentry */
3547   he->src = NULL;
3548   he->dest = RTD_UNREACHABLE;
3549   he->nexthop_linkable = 0;
3550   he->igp_metric = 0;
3551
3552   net_addr he_addr;
3553   net_fill_ip_host(&he_addr, he->addr);
3554   net *n = net_route(tab, &he_addr);
3555   if (n)
3556     {
3557       rte *e = n->routes;
3558       rta *a = e->attrs;
3559       word pref = a->pref;
3560
3561       for (rte *ee = n->routes; ee; ee = ee->next)
3562         if ((ee->attrs->pref >= pref) && ee->attrs->hostentry)
3563         {
3564           /* Recursive route should not depend on another recursive route */
3565           log(L_WARN "Next hop address %I resolvable through recursive route for %N",
3566               he->addr, n->n.addr);
3567           goto done;
3568         }
3569
3570       pxlen = n->n.addr->pxlen;
3571
3572       if (a->dest == RTD_UNICAST)
3573         {
3574           for (struct nexthop *nh = &(a->nh); nh; nh = nh->next)
3575             if (ipa_zero(nh->gw))
3576               {
3577                 if (if_local_addr(he->addr, nh->iface))
3578                   {
3579                     /* The host address is a local address, this is not valid */
3580                     log(L_WARN "Next hop address %I is a local address of iface %s",
3581                         he->addr, nh->iface->name);
3582                     goto done;
3583                   }
3584
3585                 direct++;
3586               }
3587         }
3588
3589       he->src = rta_clone(a);
3590       he->dest = a->dest;
3591       he->nexthop_linkable = !direct;
3592       he->igp_metric = rt_get_igp_metric(e);
3593     }
3594
3595 done:
3596   /* Add a prefix range to the trie */
3597   trie_add_prefix(tab->hostcache->trie, &he_addr, pxlen, he_addr.pxlen);
3598
3599   rta_free(old_src);
3600   return old_src != he->src;
3601 }
3602
3603 static void
3604 rt_update_hostcache(rtable *tab)
3605 {
3606   struct hostcache *hc = tab->hostcache;
3607   struct hostentry *he;
3608   node *n, *x;
3609
3610   /* Reset the trie */
3611   lp_flush(hc->lp);
3612   hc->trie = f_new_trie(hc->lp, 0);
3613
3614   WALK_LIST_DELSAFE(n, x, hc->hostentries)
3615     {
3616       he = SKIP_BACK(struct hostentry, ln, n);
3617       if (!he->uc)
3618         {
3619           hc_delete_hostentry(hc, tab->rp, he);
3620           continue;
3621         }
3622
3623       if (rt_update_hostentry(tab, he))
3624         rt_schedule_nhu(he->tab);
3625     }
3626
3627   tab->hcu_scheduled = 0;
3628 }
3629
3630 struct hostentry *
3631 rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep)
3632 {
3633   ip_addr link = ipa_zero(ll) ? a : ll;
3634   struct hostentry *he;
3635
3636   if (!tab->hostcache)
3637     rt_init_hostcache(tab);
3638
3639   u32 k = hc_hash(a, dep);
3640   struct hostcache *hc = tab->hostcache;
3641   for (he = hc->hash_table[k >> hc->hash_shift]; he != NULL; he = he->next)
3642     if (ipa_equal(he->addr, a) && ipa_equal(he->link, link) && (he->tab == dep))
3643       return he;
3644
3645   he = hc_new_hostentry(hc, tab->rp, a, link, dep, k);
3646   rt_update_hostentry(tab, he);
3647   return he;
3648 }
3649
3650
3651 /*
3652  *  Documentation for functions declared inline in route.h
3653  */
3654 #if 0
3655
3656 /**
3657  * net_find - find a network entry
3658  * @tab: a routing table
3659  * @addr: address of the network
3660  *
3661  * net_find() looks up the given network in routing table @tab and
3662  * returns a pointer to its &net entry or %NULL if no such network
3663  * exists.
3664  */
3665 static inline net *net_find(rtable *tab, net_addr *addr)
3666 { DUMMY; }
3667
3668 /**
3669  * net_get - obtain a network entry
3670  * @tab: a routing table
3671  * @addr: address of the network
3672  *
3673  * net_get() looks up the given network in routing table @tab and
3674  * returns a pointer to its &net entry. If no such entry exists, it's
3675  * created.
3676  */
3677 static inline net *net_get(rtable *tab, net_addr *addr)
3678 { DUMMY; }
3679
3680 /**
3681  * rte_cow - copy a route for writing
3682  * @r: a route entry to be copied
3683  *
3684  * rte_cow() takes a &rte and prepares it for modification. The exact action
3685  * taken depends on the flags of the &rte -- if it's a temporary entry, it's
3686  * just returned unchanged, else a new temporary entry with the same contents
3687  * is created.
3688  *
3689  * The primary use of this function is inside the filter machinery -- when
3690  * a filter wants to modify &rte contents (to change the preference or to
3691  * attach another set of attributes), it must ensure that the &rte is not
3692  * shared with anyone else (and especially that it isn't stored in any routing
3693  * table).
3694  *
3695  * Result: a pointer to the new writable &rte.
3696  */
3697 static inline rte * rte_cow(rte *r)
3698 { DUMMY; }
3699
3700 #endif