]>
git.ipfire.org Git - thirdparty/bird.git/blob - nest/rt-table.c
ee3f11882fd613c43eb4c87be9e9f5217d9699fc
2 * BIRD -- Routing Tables
4 * (c) 1998--2000 Martin Mares <mj@ucw.cz>
6 * Can be freely distributed and used under the terms of the GNU GPL.
12 * Routing tables are probably the most important structures BIRD uses. They
13 * hold all the information about known networks, the associated routes and
16 * There are multiple routing tables (a primary one together with any
17 * number of secondary ones if requested by the configuration). Each table
18 * is basically a FIB containing entries describing the individual
19 * destination networks. For each network (represented by structure &net),
20 * there is a one-way linked list of route entries (&rte), the first entry
21 * on the list being the best one (i.e., the one we currently use
22 * for routing), the order of the other ones is undetermined.
24 * The &rte contains information about the route. There are net and src, which
25 * together forms a key identifying the route in a routing table. There is a
26 * pointer to a &rta structure (see the route attribute module for a precise
27 * explanation) holding the route attributes, which are primary data about the
28 * route. There are several technical fields used by routing table code (route
29 * id, REF_* flags), There is also the pflags field, holding protocol-specific
30 * flags. They are not used by routing table code, but by protocol-specific
31 * hooks. In contrast to route attributes, they are not primary data and their
32 * validity is also limited to the routing table.
34 * There are several mechanisms that allow automatic update of routes in one
35 * routing table (dst) as a result of changes in another routing table (src).
36 * They handle issues of recursive next hop resolving, flowspec validation and
39 * The first such mechanism is handling of recursive next hops. A route in the
40 * dst table has an indirect next hop address, which is resolved through a route
41 * in the src table (which may also be the same table) to get an immediate next
42 * hop. This is implemented using structure &hostcache attached to the src
43 * table, which contains &hostentry structures for each tracked next hop
44 * address. These structures are linked from recursive routes in dst tables,
45 * possibly multiple routes sharing one hostentry (as many routes may have the
46 * same indirect next hop). There is also a trie in the hostcache, which matches
47 * all prefixes that may influence resolving of tracked next hops.
49 * When a best route changes in the src table, the hostcache is notified using
50 * rt_notify_hostcache(), which immediately checks using the trie whether the
51 * change is relevant and if it is, then it schedules asynchronous hostcache
52 * recomputation. The recomputation is done by rt_update_hostcache() (called
53 * from rt_event() of src table), it walks through all hostentries and resolves
54 * them (by rt_update_hostentry()). It also updates the trie. If a change in
55 * hostentry resolution was found, then it schedules asynchronous nexthop
56 * recomputation of associated dst table. That is done by rt_next_hop_update()
57 * (called from rt_event() of dst table), it iterates over all routes in the dst
58 * table and re-examines their hostentries for changes. Note that in contrast to
59 * hostcache update, next hop update can be interrupted by main loop. These two
60 * full-table walks (over hostcache and dst table) are necessary due to absence
61 * of direct lookups (route -> affected nexthop, nexthop -> its route).
63 * The second mechanism is for flowspec validation, where validity of flowspec
64 * routes depends of resolving their network prefixes in IP routing tables. This
65 * is similar to the recursive next hop mechanism, but simpler as there are no
66 * intermediate hostcache and hostentries (because flows are less likely to
67 * share common net prefix than routes sharing a common next hop). In src table,
68 * there is a list of dst tables (list flowspec_links), this list is updated by
69 * flowpsec channels (by rt_flowspec_link() and rt_flowspec_unlink() during
70 * channel start/stop). Each dst table has its own trie of prefixes that may
71 * influence validation of flowspec routes in it (flowspec_trie).
73 * When a best route changes in the src table, rt_flowspec_notify() immediately
74 * checks all dst tables from the list using their tries to see whether the
75 * change is relevant for them. If it is, then an asynchronous re-validation of
76 * flowspec routes in the dst table is scheduled. That is also done by function
77 * rt_next_hop_update(), like nexthop recomputation above. It iterates over all
78 * flowspec routes and re-validates them. It also recalculates the trie.
80 * Note that in contrast to the hostcache update, here the trie is recalculated
81 * during the rt_next_hop_update(), which may be interleaved with IP route
82 * updates. The trie is flushed at the beginning of recalculation, which means
83 * that such updates may use partial trie to see if they are relevant. But it
84 * works anyway! Either affected flowspec was already re-validated and added to
85 * the trie, then IP route change would match the trie and trigger a next round
86 * of re-validation, or it was not yet re-validated and added to the trie, but
87 * will be re-validated later in this round anyway.
89 * The third mechanism is used for RPKI re-validation of IP routes and it is the
90 * simplest. It is just a list of subscribers in src table, who are notified
91 * when any change happened, but only after a settle time. Also, in RPKI case
92 * the dst is not a table, but a channel, who refeeds routes through a filter.
97 #include "nest/bird.h"
98 #include "nest/route.h"
99 #include "nest/protocol.h"
100 #include "nest/iface.h"
101 #include "nest/mpls.h"
102 #include "lib/resource.h"
103 #include "lib/event.h"
104 #include "lib/timer.h"
105 #include "lib/string.h"
106 #include "conf/conf.h"
107 #include "filter/filter.h"
108 #include "filter/data.h"
109 #include "lib/hash.h"
110 #include "lib/string.h"
111 #include "lib/alloca.h"
112 #include "lib/flowspec.h"
115 #include "proto/bgp/bgp.h"
120 static slab
*rte_slab
;
121 linpool
*rte_update_pool
;
125 static void rt_free_hostcache(rtable
*tab
);
126 static void rt_notify_hostcache(rtable
*tab
, net
*net
);
127 static void rt_update_hostcache(rtable
*tab
);
128 static void rt_next_hop_update(rtable
*tab
);
129 static inline void rt_prune_table(rtable
*tab
);
130 static inline void rt_schedule_notify(rtable
*tab
);
131 static void rt_flowspec_notify(rtable
*tab
, net
*net
);
132 static void rt_kick_prune_timer(rtable
*tab
);
136 net_init_with_trie(struct fib
*f
, void *N
)
138 rtable
*tab
= SKIP_BACK(rtable
, fib
, f
);
142 trie_add_prefix(tab
->trie
, n
->n
.addr
, n
->n
.addr
->pxlen
, n
->n
.addr
->pxlen
);
145 trie_add_prefix(tab
->trie_new
, n
->n
.addr
, n
->n
.addr
->pxlen
, n
->n
.addr
->pxlen
);
149 net_route_ip6_sadr_trie(rtable
*t
, const net_addr_ip6_sadr
*n0
)
151 TRIE_WALK_TO_ROOT_IP6(t
->trie
, (const net_addr_ip6
*) n0
, px
)
153 net_addr_ip6_sadr n
= NET_ADDR_IP6_SADR(px
.prefix
, px
.pxlen
, n0
->src_prefix
, n0
->src_pxlen
);
157 /* We need to do dst first matching. Since sadr addresses are hashed on dst
158 prefix only, find the hash table chain and go through it to find the
159 match with the longest matching src prefix. */
160 for (struct fib_node
*fn
= fib_get_chain(&t
->fib
, (net_addr
*) &n
); fn
; fn
= fn
->next
)
162 net_addr_ip6_sadr
*a
= (void *) fn
->addr
;
164 if (net_equal_dst_ip6_sadr(&n
, a
) &&
165 net_in_net_src_ip6_sadr(&n
, a
) &&
166 (a
->src_pxlen
>= best_pxlen
))
168 best
= fib_node_to_user(&t
->fib
, fn
);
169 best_pxlen
= a
->src_pxlen
;
176 TRIE_WALK_TO_ROOT_END
;
183 net_route_ip6_sadr_fib(rtable
*t
, const net_addr_ip6_sadr
*n0
)
186 net_copy_ip6_sadr(&n
, n0
);
193 /* We need to do dst first matching. Since sadr addresses are hashed on dst
194 prefix only, find the hash table chain and go through it to find the
195 match with the longest matching src prefix. */
196 for (struct fib_node
*fn
= fib_get_chain(&t
->fib
, (net_addr
*) &n
); fn
; fn
= fn
->next
)
198 net_addr_ip6_sadr
*a
= (void *) fn
->addr
;
200 if (net_equal_dst_ip6_sadr(&n
, a
) &&
201 net_in_net_src_ip6_sadr(&n
, a
) &&
202 (a
->src_pxlen
>= best_pxlen
))
204 best
= fib_node_to_user(&t
->fib
, fn
);
205 best_pxlen
= a
->src_pxlen
;
216 ip6_clrbit(&n
.dst_prefix
, n
.dst_pxlen
);
223 net_route(rtable
*tab
, const net_addr
*n
)
225 ASSERT(tab
->addr_type
== n
->type
);
226 net_addr_union
*nu
= SKIP_BACK(net_addr_union
, n
, n
);
228 #define TW(ipv, what) \
229 TRIE_WALK_TO_ROOT_IP##ipv(tab->trie, &(nu->ip##ipv), var) \
230 { what(ipv, var); } \
231 TRIE_WALK_TO_ROOT_END; return NULL;
233 #define FW(ipv, what) do { \
234 net_addr_union nuc; net_copy(&nuc.n, n); \
236 what(ipv, nuc.ip##ipv); if (!nuc.n.pxlen) return NULL; \
237 nuc.n.pxlen--; ip##ipv##_clrbit(&nuc.ip##ipv.prefix, nuc.ip##ipv.pxlen); \
239 } while(0); return NULL;
241 #define FVR_IP(ipv, var) \
242 net *r; if (r = net_find_valid(tab, (net_addr *) &var)) return r;
244 #define FVR_VPN(ipv, var) \
245 net_addr_vpn##ipv _var0 = NET_ADDR_VPN##ipv(var.prefix, var.pxlen, nu->vpn##ipv.rd); FVR_IP(ipv, _var0);
249 case NET_IP4
: TW(4, FVR_IP
);
250 case NET_VPN4
: TW(4, FVR_VPN
);
251 case NET_IP6
: TW(6, FVR_IP
);
252 case NET_VPN6
: TW(6, FVR_VPN
);
255 return net_route_ip6_sadr_trie(tab
, (net_addr_ip6_sadr
*) n
);
261 case NET_IP4
: FW(4, FVR_IP
);
262 case NET_VPN4
: FW(4, FVR_VPN
);
263 case NET_IP6
: FW(6, FVR_IP
);
264 case NET_VPN6
: FW(6, FVR_VPN
);
267 return net_route_ip6_sadr_fib (tab
, (net_addr_ip6_sadr
*) n
);
280 * roa_check - check validity of route origination in a ROA table
282 * @n: network prefix to check
283 * @asn: AS number of network prefix
285 * Implements RFC 6483 route validation for the given network prefix. The
286 * procedure is to find all candidate ROAs - ROAs whose prefixes cover the given
287 * network prefix. If there is no candidate ROA, return ROA_UNKNOWN. If there is
288 * a candidate ROA with matching ASN and maxlen field greater than or equal to
289 * the given prefix length, return ROA_VALID. Otherwise, return ROA_INVALID. If
290 * caller cannot determine origin AS, 0 could be used (in that case ROA_VALID
291 * cannot happen). Table @tab must have type NET_ROA4 or NET_ROA6, network @n
292 * must have type NET_IP4 or NET_IP6, respectively.
295 net_roa_check(rtable
*tab
, const net_addr
*n
, u32 asn
)
297 net_addr_union
*nu
= SKIP_BACK(net_addr_union
, n
, n
);
301 #define TW(ipv) do { \
302 TRIE_WALK_TO_ROOT_IP##ipv(tab->trie, &(nu->ip##ipv), var) { \
303 net_addr_roa##ipv roa0 = NET_ADDR_ROA##ipv(var.prefix, var.pxlen, 0, 0); \
304 ROA_PARTIAL_CHECK(ipv); \
305 } TRIE_WALK_TO_ROOT_END; \
306 return anything ? ROA_INVALID : ROA_UNKNOWN; \
309 #define FW(ipv) do { \
310 net_addr_roa##ipv roa0 = NET_ADDR_ROA##ipv(nu->ip##ipv.prefix, nu->ip##ipv.pxlen, 0, 0);\
312 ROA_PARTIAL_CHECK(ipv); \
313 if (roa0.pxlen == 0) break; \
314 roa0.pxlen--; ip##ipv##_clrbit(&roa0.prefix, roa0.pxlen); \
318 #define ROA_PARTIAL_CHECK(ipv) do { \
319 for (fn = fib_get_chain(&tab->fib, (net_addr *) &roa0); fn; fn = fn->next) \
321 net_addr_roa##ipv *roa = (void *) fn->addr; \
322 net *r = fib_node_to_user(&tab->fib, fn); \
323 if (net_equal_prefix_roa##ipv(roa, &roa0) && rte_is_valid(r->routes)) \
326 if (asn && (roa->asn == asn) && (roa->max_pxlen >= nu->ip##ipv.pxlen)) \
332 if ((tab
->addr_type
== NET_ROA4
) && (n
->type
== NET_IP4
))
334 if (tab
->trie
) TW(4);
337 else if ((tab
->addr_type
== NET_ROA6
) && (n
->type
== NET_IP6
))
339 if (tab
->trie
) TW(6);
343 return anything
? ROA_INVALID
: ROA_UNKNOWN
;
344 #undef ROA_PARTIAL_CHECK
350 * aspa_check - check validity of AS Path in an ASPA table
352 * @path: AS Path to check
354 * Implements draft-ietf-sidrops-aspa-verification-16.
356 enum aspa_result
aspa_check(rtable
*tab
, const adata
*path
, bool force_upstream
)
359 lp_save(tmp_linpool
, &lps
);
361 /* No support for confed paths */
362 if (as_path_contains_confed(path
))
365 /* Check path length */
366 uint len
= as_path_getlen(path
);
370 /* Normalize the AS Path: drop stuffings */
371 u32
*asns
= alloca(sizeof(u32
) * len
);
374 while (as_path_walk(path
, &ppos
, &asns
[nsz
]))
375 if ((nsz
== 0) || (asns
[nsz
] != asns
[nsz
-1]))
378 /* Find the provider blocks for every AS on the path
379 * and check allowed directions */
380 uint max_up
= 0, min_up
= 0, max_down
= 0, min_down
= 0;
382 for (uint ap
=0; ap
<nsz
; ap
++)
384 net_addr_union nau
= { .aspa
= NET_ADDR_ASPA(asns
[ap
]), };
385 net
*n
= net_find(tab
, &nau
.n
);
387 bool found
= false, down
= false, up
= false;
389 for (rte
*e
= (n
? n
->routes
: NULL
); e
; e
= e
->next
)
391 if (!rte_is_valid(e
))
394 eattr
*ea
= ea_find(e
->attrs
->eattrs
, EA_ASPA_PROVIDERS
);
398 /* Actually found some ASPA */
401 for (uint i
=0; i
* sizeof(u32
) < ea
->u
.ptr
->length
; i
++)
403 if ((ap
> 0) && ((u32
*) ea
->u
.ptr
->data
)[i
] == asns
[ap
-1])
405 if ((ap
+ 1 < nsz
) && ((u32
*) ea
->u
.ptr
->data
)[i
] == asns
[ap
+1])
409 /* Both peers found */
415 /* Fast path for the upstream check */
419 /* Move min-upstream */
422 /* Exists but doesn't allow this upstream */
426 /* Fast path for no ASPA here */
429 /* Extend max-downstream (min-downstream is stopped by unknown) */
432 /* Move min-upstream (can't include unknown) */
436 /* ASPA exists and downstream may be extended */
439 /* Extending max-downstream always */
442 /* Extending min-downstream unless unknown seen */
446 /* Downstream only */
448 min_up
= max_up
= ap
;
451 /* No extension for downstream, force upstream only from now */
456 /* Not even upstream, move the ending here */
458 min_up
= max_up
= ap
;
462 /* Is the path surely valid? */
463 if (min_up
<= min_down
)
466 /* Is the path maybe valid? */
467 if (max_up
<= max_down
)
470 /* Now there is surely a valley there. */
475 * rte_find - find a route
479 * The rte_find() function returns a route for destination @net
480 * which is from route source @src.
483 rte_find(net
*net
, struct rte_src
*src
)
485 rte
*e
= net
->routes
;
487 while (e
&& e
->src
!= src
)
493 * rte_get_temp - get a temporary &rte
494 * @a: attributes to assign to the new route (a &rta; in case it's
495 * un-cached, rte_update() will create a cached copy automatically)
498 * Create a temporary &rte and bind it with the attributes @a.
501 rte_get_temp(rta
*a
, struct rte_src
*src
)
503 rte
*e
= sl_alloc(rte_slab
);
509 rt_lock_source(e
->src
= src
);
516 rte
*e
= sl_alloc(rte_slab
);
518 memcpy(e
, r
, sizeof(rte
));
520 rt_lock_source(e
->src
);
521 e
->attrs
= rta_clone(r
->attrs
);
527 * rte_cow_rta - get a private writable copy of &rte with writable &rta
528 * @r: a route entry to be copied
529 * @lp: a linpool from which to allocate &rta
531 * rte_cow_rta() takes a &rte and prepares it and associated &rta for
532 * modification. There are three possibilities: First, both &rte and &rta are
533 * private copies, in that case they are returned unchanged. Second, &rte is
534 * private copy, but &rta is cached, in that case &rta is duplicated using
535 * rta_do_cow(). Third, both &rte is shared and &rta is cached, in that case
536 * both structures are duplicated by rte_do_cow() and rta_do_cow().
538 * Note that in the second case, cached &rta loses one reference, while private
539 * copy created by rta_do_cow() is a shallow copy sharing indirect data (eattrs,
540 * nexthops, ...) with it. To work properly, original shared &rta should have
541 * another reference during the life of created private copy.
543 * Result: a pointer to the new writable &rte with writable &rta.
546 rte_cow_rta(rte
*r
, linpool
*lp
)
548 if (!rta_is_cached(r
->attrs
))
552 rta
*a
= rta_do_cow(r
->attrs
, lp
);
558 static int /* Actually better or at least as good as */
559 rte_better(rte
*new, rte
*old
)
561 int (*better
)(rte
*, rte
*);
563 if (!rte_is_valid(old
))
565 if (!rte_is_valid(new))
568 if (new->attrs
->pref
> old
->attrs
->pref
)
570 if (new->attrs
->pref
< old
->attrs
->pref
)
572 if (new->src
->proto
->proto
!= old
->src
->proto
->proto
)
575 * If the user has configured protocol preferences, so that two different protocols
576 * have the same preference, try to break the tie by comparing addresses. Not too
577 * useful, but keeps the ordering of routes unambiguous.
579 return new->src
->proto
->proto
> old
->src
->proto
->proto
;
581 if (better
= new->src
->proto
->rte_better
)
582 return better(new, old
);
587 rte_mergable(rte
*pri
, rte
*sec
)
589 int (*mergable
)(rte
*, rte
*);
591 if (!rte_is_valid(pri
) || !rte_is_valid(sec
))
594 if (pri
->attrs
->pref
!= sec
->attrs
->pref
)
597 if (pri
->src
->proto
->proto
!= sec
->src
->proto
->proto
)
600 if (mergable
= pri
->src
->proto
->rte_mergable
)
601 return mergable(pri
, sec
);
607 rte_trace(struct channel
*c
, rte
*e
, int dir
, char *msg
)
609 log(L_TRACE
"%s.%s %c %s %N %luL %uG %s",
610 c
->proto
->name
, c
->name
?: "?", dir
, msg
, e
->net
->n
.addr
, e
->src
->private_id
, e
->src
->global_id
,
611 rta_dest_name(e
->attrs
->dest
));
615 rte_trace_in(uint flag
, struct channel
*c
, rte
*e
, char *msg
)
617 if ((c
->debug
& flag
) || (c
->proto
->debug
& flag
))
618 rte_trace(c
, e
, '>', msg
);
622 rte_trace_out(uint flag
, struct channel
*c
, rte
*e
, char *msg
)
624 if ((c
->debug
& flag
) || (c
->proto
->debug
& flag
))
625 rte_trace(c
, e
, '<', msg
);
629 export_filter_(struct channel
*c
, rte
*rt0
, rte
**rt_free
, linpool
*pool
, int silent
)
631 struct proto
*p
= c
->proto
;
632 const struct filter
*filter
= c
->out_filter
;
633 struct proto_stats
*stats
= &c
->stats
;
640 v
= p
->preexport
? p
->preexport(c
, rt
) : 0;
646 stats
->exp_updates_rejected
++;
648 rte_trace_out(D_FILTERS
, c
, rt
, "rejected by protocol");
654 rte_trace_out(D_FILTERS
, c
, rt
, "forced accept by protocol");
658 v
= filter
&& ((filter
== FILTER_REJECT
) ||
659 (f_run(filter
, &rt
, pool
,
660 (silent
? FF_SILENT
: 0)) > F_ACCEPT
));
666 stats
->exp_updates_filtered
++;
667 rte_trace_out(D_FILTERS
, c
, rt
, "filtered out");
677 /* Discard temporary rte */
684 export_filter(struct channel
*c
, rte
*rt0
, rte
**rt_free
, int silent
)
686 return export_filter_(c
, rt0
, rt_free
, rte_update_pool
, silent
);
690 do_rt_notify(struct channel
*c
, net
*net
, rte
*new, rte
*old
, int refeed
)
692 struct proto
*p
= c
->proto
;
693 struct proto_stats
*stats
= &c
->stats
;
698 /* Apply export limit */
699 struct channel_limit
*l
= &c
->out_limit
;
700 if (l
->action
&& !old
&& new)
702 if (stats
->exp_routes
>= l
->limit
)
703 channel_notify_limit(c
, l
, PLD_OUT
, stats
->exp_routes
);
705 if (l
->state
== PLS_BLOCKED
)
707 stats
->exp_updates_rejected
++;
708 rte_trace_out(D_FILTERS
, c
, new, "rejected [limit]");
713 /* Apply export table */
714 if (c
->out_table
&& !rte_update_out(c
, net
->n
.addr
, new, old
, refeed
))
718 stats
->exp_updates_accepted
++;
720 stats
->exp_withdraws_accepted
++;
724 bmap_clear(&c
->export_map
, old
->id
);
730 bmap_set(&c
->export_map
, new->id
);
734 if (p
->debug
& D_ROUTES
)
737 rte_trace_out(D_ROUTES
, c
, new, "replaced");
739 rte_trace_out(D_ROUTES
, c
, new, "added");
741 rte_trace_out(D_ROUTES
, c
, old
, "removed");
744 p
->rt_notify(p
, c
, net
, new, old
);
748 rt_notify_basic(struct channel
*c
, net
*net
, rte
*new, rte
*old
, int refeed
)
750 // struct proto *p = c->proto;
751 rte
*new_free
= NULL
;
754 c
->stats
.exp_updates_received
++;
756 c
->stats
.exp_withdraws_received
++;
759 new = export_filter(c
, new, &new_free
, 0);
761 if (old
&& !bmap_test(&c
->export_map
, old
->id
))
767 do_rt_notify(c
, net
, new, old
, refeed
);
769 /* Discard temporary rte */
775 rt_notify_accepted(struct channel
*c
, net
*net
, rte
*new_changed
, rte
*old_changed
, int refeed
)
777 // struct proto *p = c->proto;
778 rte
*new_best
= NULL
;
779 rte
*old_best
= NULL
;
780 rte
*new_free
= NULL
;
784 * We assume that there are no changes in net route order except (added)
785 * new_changed and (removed) old_changed. Therefore, the function is not
786 * compatible with deterministic_med (where nontrivial reordering can happen
787 * as a result of a route change) and with recomputation of recursive routes
788 * due to next hop update (where many routes can be changed in one step).
790 * Note that we need this assumption just for optimizations, we could just
791 * run full new_best recomputation otherwise.
793 * There are three cases:
794 * feed or old_best is old_changed -> we need to recompute new_best
795 * old_best is before new_changed -> new_best is old_best, ignore
796 * old_best is after new_changed -> try new_changed, otherwise old_best
800 c
->stats
.exp_updates_received
++;
802 c
->stats
.exp_withdraws_received
++;
804 /* Find old_best - either old_changed, or route for net->routes */
805 if (old_changed
&& bmap_test(&c
->export_map
, old_changed
->id
))
806 old_best
= old_changed
;
809 for (rte
*r
= net
->routes
; rte_is_valid(r
); r
= r
->next
)
811 if (bmap_test(&c
->export_map
, r
->id
))
817 /* Note if new_changed found before old_best */
818 if (r
== new_changed
)
824 if ((new_changed
== old_changed
) || (old_best
== old_changed
))
826 /* Feed or old_best changed -> find first accepted by filters */
827 for (rte
*r
= net
->routes
; rte_is_valid(r
); r
= r
->next
)
828 if (new_best
= export_filter(c
, r
, &new_free
, 0))
833 /* Other cases -> either new_changed, or old_best (and nothing changed) */
834 if (new_first
&& (new_changed
= export_filter(c
, new_changed
, &new_free
, 0)))
835 new_best
= new_changed
;
840 if (!new_best
&& !old_best
)
843 do_rt_notify(c
, net
, new_best
, old_best
, refeed
);
845 /* Discard temporary rte */
851 static struct nexthop
*
852 nexthop_merge_rta(struct nexthop
*nhs
, rta
*a
, linpool
*pool
, int max
)
854 return nexthop_merge(nhs
, &(a
->nh
), 1, 0, max
, pool
);
858 rt_export_merged(struct channel
*c
, net
*net
, rte
**rt_free
, linpool
*pool
, int silent
)
860 // struct proto *p = c->proto;
861 struct nexthop
*nhs
= NULL
;
862 rte
*best0
, *best
, *rt0
, *rt
, *tmp
;
867 if (!rte_is_valid(best0
))
870 best
= export_filter_(c
, best0
, rt_free
, pool
, silent
);
872 if (!best
|| !rte_is_reachable(best
))
875 for (rt0
= best0
->next
; rt0
; rt0
= rt0
->next
)
877 if (!rte_mergable(best0
, rt0
))
880 rt
= export_filter_(c
, rt0
, &tmp
, pool
, 1);
885 if (rte_is_reachable(rt
))
886 nhs
= nexthop_merge_rta(nhs
, rt
->attrs
, pool
, c
->merge_limit
);
894 nhs
= nexthop_merge_rta(nhs
, best
->attrs
, pool
, c
->merge_limit
);
898 best
= rte_cow_rta(best
, pool
);
899 nexthop_link(best
->attrs
, nhs
);
910 rt_notify_merged(struct channel
*c
, net
*net
, rte
*new_changed
, rte
*old_changed
,
911 rte
*new_best
, rte
*old_best
, int refeed
)
913 // struct proto *p = c->proto;
914 rte
*new_free
= NULL
;
916 /* We assume that all rte arguments are either NULL or rte_is_valid() */
918 /* This check should be done by the caller */
919 if (!new_best
&& !old_best
)
922 /* Check whether the change is relevant to the merged route */
923 if ((new_best
== old_best
) &&
924 (new_changed
!= old_changed
) &&
925 !rte_mergable(new_best
, new_changed
) &&
926 !rte_mergable(old_best
, old_changed
))
930 c
->stats
.exp_updates_received
++;
932 c
->stats
.exp_withdraws_received
++;
934 /* Prepare new merged route */
936 new_best
= rt_export_merged(c
, net
, &new_free
, rte_update_pool
, 0);
938 /* Check old merged route */
939 if (old_best
&& !bmap_test(&c
->export_map
, old_best
->id
))
942 if (!new_best
&& !old_best
)
945 do_rt_notify(c
, net
, new_best
, old_best
, refeed
);
947 /* Discard temporary rte */
954 * rte_announce - announce a routing table change
955 * @tab: table the route has been added to
956 * @type: type of route announcement (RA_UNDEF or RA_ANY)
957 * @net: network in question
958 * @new: the new or changed route
959 * @old: the previous route replaced by the new one
960 * @new_best: the new best route for the same network
961 * @old_best: the previous best route for the same network
963 * This function gets a routing table update and announces it to all protocols
964 * that are connected to the same table by their channels.
966 * There are two ways of how routing table changes are announced. First, there
967 * is a change of just one route in @net (which may caused a change of the best
968 * route of the network). In this case @new and @old describes the changed route
969 * and @new_best and @old_best describes best routes. Other routes are not
970 * affected, but in sorted table the order of other routes might change.
972 * Second, There is a bulk change of multiple routes in @net, with shared best
973 * route selection. In such case separate route changes are described using
974 * @type of %RA_ANY, with @new and @old specifying the changed route, while
975 * @new_best and @old_best are NULL. After that, another notification is done
976 * where @new_best and @old_best are filled (may be the same), but @new and @old
979 * The function announces the change to all associated channels. For each
980 * channel, an appropriate preprocessing is done according to channel &ra_mode.
981 * For example, %RA_OPTIMAL channels receive just changes of best routes.
983 * In general, we first call preexport() hook of a protocol, which performs
984 * basic checks on the route (each protocol has a right to veto or force accept
985 * of the route before any filter is asked). Then we consult an export filter
986 * of the channel and verify the old route in an export map of the channel.
987 * Finally, the rt_notify() hook of the protocol gets called.
989 * Note that there are also calls of rt_notify() hooks due to feed, but that is
990 * done outside of scope of rte_announce().
993 rte_announce(rtable
*tab
, uint type
, net
*net
, rte
*new, rte
*old
,
994 rte
*new_best
, rte
*old_best
)
996 if (!rte_is_valid(new))
999 if (!rte_is_valid(old
))
1002 if (!rte_is_valid(new_best
))
1005 if (!rte_is_valid(old_best
))
1008 if (!new && !old
&& !new_best
&& !old_best
)
1011 if (new_best
!= old_best
)
1014 new_best
->sender
->stats
.pref_routes
++;
1016 old_best
->sender
->stats
.pref_routes
--;
1019 rt_notify_hostcache(tab
, net
);
1021 if (!EMPTY_LIST(tab
->flowspec_links
))
1022 rt_flowspec_notify(tab
, net
);
1025 rt_schedule_notify(tab
);
1027 struct channel
*c
; node
*n
;
1028 WALK_LIST2(c
, n
, tab
->channels
, table_node
)
1030 if (c
->export_state
== ES_DOWN
)
1033 if (type
&& (type
!= c
->ra_mode
))
1039 if (new_best
!= old_best
)
1040 rt_notify_basic(c
, net
, new_best
, old_best
, 0);
1045 rt_notify_basic(c
, net
, new, old
, 0);
1050 * The (new != old) condition is problematic here, as it would break
1051 * the second usage pattern (announcement after bulk change, used in
1052 * rt_next_hop_update_net(), which sends both new and old as NULL).
1054 * But recursive next hops do not work with sorted tables anyways,
1055 * such configuration is forbidden in BGP and not supported in
1056 * rt_notify_accepted().
1058 * The condition is needed to eliminate spurious announcements where
1059 * both old and new routes are not valid (so they are NULL).
1062 rt_notify_accepted(c
, net
, new, old
, 0);
1066 rt_notify_merged(c
, net
, new, old
, new_best
, old_best
, 0);
1073 rte_validate(rte
*e
)
1078 if (!net_validate(n
->n
.addr
))
1080 log(L_WARN
"Ignoring bogus prefix %N received via %s",
1081 n
->n
.addr
, e
->sender
->proto
->name
);
1085 /* FIXME: better handling different nettypes */
1086 c
= !net_is_flow(n
->n
.addr
) ?
1087 net_classify(n
->n
.addr
): (IADDR_HOST
| SCOPE_UNIVERSE
);
1088 if ((c
< 0) || !(c
& IADDR_HOST
) || ((c
& IADDR_SCOPE_MASK
) <= SCOPE_LINK
))
1090 log(L_WARN
"Ignoring bogus route %N received via %s",
1091 n
->n
.addr
, e
->sender
->proto
->name
);
1095 if (net_type_match(n
->n
.addr
, NB_DEST
) == !e
->attrs
->dest
)
1097 /* Exception for flowspec that failed validation */
1098 if (net_is_flow(n
->n
.addr
) && (e
->attrs
->dest
== RTD_UNREACHABLE
))
1101 log(L_WARN
"Ignoring route %N with invalid dest %d received via %s",
1102 n
->n
.addr
, e
->attrs
->dest
, e
->sender
->proto
->name
);
1106 if ((e
->attrs
->dest
== RTD_UNICAST
) && !nexthop_is_sorted(&(e
->attrs
->nh
)))
1108 log(L_WARN
"Ignoring unsorted multipath route %N received via %s",
1109 n
->n
.addr
, e
->sender
->proto
->name
);
1117 * rte_free - delete a &rte
1118 * @e: &rte to be deleted
1120 * rte_free() deletes the given &rte from the routing table it's linked to.
1125 rt_unlock_source(e
->src
);
1126 if (rta_is_cached(e
->attrs
))
1132 rte_free_quick(rte
*e
)
1134 rt_unlock_source(e
->src
);
1140 rte_same(rte
*x
, rte
*y
)
1142 /* rte.flags / rte.pflags are not checked, as they are internal to rtable */
1144 x
->attrs
== y
->attrs
&&
1146 rte_is_filtered(x
) == rte_is_filtered(y
);
1149 static inline int rte_is_ok(rte
*e
) { return e
&& !rte_is_filtered(e
); }
1152 rte_recalculate(struct channel
*c
, net
*net
, rte
*new, struct rte_src
*src
)
1154 struct proto
*p
= c
->proto
;
1155 struct rtable
*table
= c
->table
;
1156 struct proto_stats
*stats
= &c
->stats
;
1157 static struct tbf rl_pipe
= TBF_DEFAULT_LOG_LIMITS
;
1158 rte
*before_old
= NULL
;
1159 rte
*old_best
= net
->routes
;
1163 k
= &net
->routes
; /* Find and remove original route from the same protocol */
1166 if (old
->src
== src
)
1168 /* If there is the same route in the routing table but from
1169 * a different sender, then there are two paths from the
1170 * source protocol to this routing table through transparent
1171 * pipes, which is not allowed.
1173 * We log that and ignore the route. If it is withdraw, we
1174 * ignore it completely (there might be 'spurious withdraws',
1175 * see FIXME in do_rte_announce())
1177 if (old
->sender
->proto
!= p
)
1181 log_rl(&rl_pipe
, L_ERR
"Pipe collision detected when sending %N to table %s",
1182 net
->n
.addr
, table
->name
);
1183 rte_free_quick(new);
1188 if (new && rte_same(old
, new))
1190 /* No changes, ignore the new route and refresh the old one */
1192 old
->flags
&= ~(REF_STALE
| REF_DISCARD
| REF_MODIFY
);
1194 if (!rte_is_filtered(new))
1196 stats
->imp_updates_ignored
++;
1197 rte_trace_in(D_ROUTES
, c
, new, "ignored");
1200 rte_free_quick(new);
1211 /* Save the last accessed position */
1219 stats
->imp_withdraws_ignored
++;
1223 int new_ok
= rte_is_ok(new);
1224 int old_ok
= rte_is_ok(old
);
1226 struct channel_limit
*l
= &c
->rx_limit
;
1227 if (l
->action
&& !old
&& new && !c
->in_table
)
1229 u32 all_routes
= stats
->imp_routes
+ stats
->filt_routes
;
1231 if (all_routes
>= l
->limit
)
1232 channel_notify_limit(c
, l
, PLD_RX
, all_routes
);
1234 if (l
->state
== PLS_BLOCKED
)
1236 /* In receive limit the situation is simple, old is NULL so
1237 we just free new and exit like nothing happened */
1239 stats
->imp_updates_ignored
++;
1240 rte_trace_in(D_FILTERS
, c
, new, "ignored [limit]");
1241 rte_free_quick(new);
1247 if (l
->action
&& !old_ok
&& new_ok
)
1249 if (stats
->imp_routes
>= l
->limit
)
1250 channel_notify_limit(c
, l
, PLD_IN
, stats
->imp_routes
);
1252 if (l
->state
== PLS_BLOCKED
)
1254 /* In import limit the situation is more complicated. We
1255 shouldn't just drop the route, we should handle it like
1256 it was filtered. We also have to continue the route
1257 processing if old or new is non-NULL, but we should exit
1258 if both are NULL as this case is probably assumed to be
1261 stats
->imp_updates_ignored
++;
1262 rte_trace_in(D_FILTERS
, c
, new, "ignored [limit]");
1264 if (c
->in_keep_filtered
)
1265 new->flags
|= REF_FILTERED
;
1267 { rte_free_quick(new); new = NULL
; }
1269 /* Note that old && !new could be possible when
1270 c->in_keep_filtered changed in the recent past. */
1281 stats
->imp_updates_accepted
++;
1283 stats
->imp_withdraws_accepted
++;
1285 stats
->imp_withdraws_ignored
++;
1287 if (old_ok
|| new_ok
)
1288 table
->last_rt_change
= current_time();
1293 rte_is_filtered(new) ? stats
->filt_routes
++ : stats
->imp_routes
++;
1295 rte_is_filtered(old
) ? stats
->filt_routes
-- : stats
->imp_routes
--;
1297 if (table
->config
->sorted
)
1299 /* If routes are sorted, just insert new route to appropriate position */
1302 if (before_old
&& !rte_better(new, before_old
))
1303 k
= &before_old
->next
;
1307 for (; *k
; k
=&(*k
)->next
)
1308 if (rte_better(new, *k
))
1319 /* If routes are not sorted, find the best route and move it on
1320 the first position. There are several optimized cases. */
1322 if (src
->proto
->rte_recalculate
&& src
->proto
->rte_recalculate(table
, net
, new, old
, old_best
))
1323 goto do_recalculate
;
1325 if (new && rte_better(new, old_best
))
1327 /* The first case - the new route is cleary optimal,
1328 we link it at the first position */
1330 new->next
= net
->routes
;
1335 else if (old
== old_best
)
1337 /* The second case - the old best route disappeared, we add the
1338 new route (if we have any) to the list (we don't care about
1339 position) and then we elect the new optimal route and relink
1340 that route at the first position and announce it. New optimal
1341 route might be NULL if there is no more routes */
1344 /* Add the new route to the list */
1353 /* Find a new optimal route (if there is any) */
1356 rte
**bp
= &net
->routes
;
1357 for (k
=&(*bp
)->next
; *k
; k
=&(*k
)->next
)
1358 if (rte_better(*k
, *bp
))
1364 best
->next
= net
->routes
;
1370 /* The third case - the new route is not better than the old
1371 best route (therefore old_best != NULL) and the old best
1372 route was not removed (therefore old_best == net->routes).
1373 We just link the new route to the old/last position. */
1380 /* The fourth (empty) case - suboptimal route was removed, nothing to do */
1385 new->lastmod
= current_time();
1389 new->id
= hmap_first_zero(&table
->id_map
);
1390 hmap_set(&table
->id_map
, new->id
);
1396 /* Log the route change */
1397 if ((c
->debug
& D_ROUTES
) || (p
->debug
& D_ROUTES
))
1400 rte_trace(c
, new, '>', new == net
->routes
? "added [best]" : "added");
1403 if (old
!= old_best
)
1404 rte_trace(c
, old
, '>', "removed");
1405 else if (rte_is_ok(net
->routes
))
1406 rte_trace(c
, old
, '>', "removed [replaced]");
1408 rte_trace(c
, old
, '>', "removed [sole]");
1412 /* Propagate the route change */
1413 rte_announce(table
, RA_UNDEF
, net
, new, old
, net
->routes
, old_best
);
1416 (table
->gc_counter
++ >= table
->config
->gc_threshold
))
1417 rt_kick_prune_timer(table
);
1419 if (old_ok
&& p
->rte_remove
)
1420 p
->rte_remove(net
, old
);
1421 if (new_ok
&& p
->rte_insert
)
1422 p
->rte_insert(net
, new);
1427 hmap_clear(&table
->id_map
, old
->id
);
1429 rte_free_quick(old
);
1433 static int rte_update_nest_cnt
; /* Nesting counter to allow recursive updates */
1436 rte_update_lock(void)
1438 rte_update_nest_cnt
++;
1442 rte_update_unlock(void)
1444 if (!--rte_update_nest_cnt
)
1445 lp_flush(rte_update_pool
);
1449 * rte_update - enter a new update to a routing table
1450 * @table: table to be updated
1451 * @c: channel doing the update
1452 * @net: network node
1453 * @p: protocol submitting the update
1454 * @src: protocol originating the update
1455 * @new: a &rte representing the new route or %NULL for route removal.
1457 * This function is called by the routing protocols whenever they discover
1458 * a new route or wish to update/remove an existing route. The right announcement
1459 * sequence is to build route attributes first (either un-cached with @aflags set
1460 * to zero or a cached one using rta_lookup(); in this case please note that
1461 * you need to increase the use count of the attributes yourself by calling
1462 * rta_clone()), call rte_get_temp() to obtain a temporary &rte, fill in all
1463 * the appropriate data and finally submit the new &rte by calling rte_update().
1465 * @src specifies the protocol that originally created the route and the meaning
1466 * of protocol-dependent data of @new. If @new is not %NULL, @src have to be the
1467 * same value as @new->attrs->proto. @p specifies the protocol that called
1468 * rte_update(). In most cases it is the same protocol as @src. rte_update()
1469 * stores @p in @new->sender;
1471 * When rte_update() gets any route, it automatically validates it (checks,
1472 * whether the network and next hop address are valid IP addresses and also
1473 * whether a normal routing protocol doesn't try to smuggle a host or link
1474 * scope route to the table), converts all protocol dependent attributes stored
1475 * in the &rte to temporary extended attributes, consults import filters of the
1476 * protocol to see if the route should be accepted and/or its attributes modified,
1477 * stores the temporary attributes back to the &rte.
1479 * Now, having a "public" version of the route, we
1480 * automatically find any old route defined by the protocol @src
1481 * for network @n, replace it by the new one (or removing it if @new is %NULL),
1482 * recalculate the optimal route for this destination and finally broadcast
1483 * the change (if any) to all routing protocols by calling rte_announce().
1485 * All memory used for attribute lists and other temporary allocations is taken
1486 * from a special linear pool @rte_update_pool and freed when rte_update()
1491 rte_update2(struct channel
*c
, const net_addr
*n
, rte
*new, struct rte_src
*src
)
1493 struct proto
*p
= c
->proto
;
1494 struct proto_stats
*stats
= &c
->stats
;
1495 const struct filter
*filter
= c
->in_filter
;
1496 struct mpls_fec
*fec
= NULL
;
1499 ASSERT(c
->channel_state
== CS_UP
);
1504 /* Create a temporary table node */
1505 nn
= alloca(sizeof(net
) + n
->length
);
1506 memset(nn
, 0, sizeof(net
) + n
->length
);
1507 net_copy(nn
->n
.addr
, n
);
1512 stats
->imp_updates_received
++;
1513 if (!rte_validate(new))
1515 rte_trace_in(D_FILTERS
, c
, new, "invalid");
1516 stats
->imp_updates_invalid
++;
1520 if (filter
== FILTER_REJECT
)
1522 stats
->imp_updates_filtered
++;
1523 rte_trace_in(D_FILTERS
, c
, new, "filtered out");
1525 if (! c
->in_keep_filtered
)
1528 /* new is a private copy, i could modify it */
1529 new->flags
|= REF_FILTERED
;
1533 int fr
= f_run(filter
, &new, rte_update_pool
, 0);
1536 stats
->imp_updates_filtered
++;
1537 rte_trace_in(D_FILTERS
, c
, new, "filtered out");
1539 if (! c
->in_keep_filtered
)
1542 new->flags
|= REF_FILTERED
;
1548 if (mpls_handle_rte(p
->mpls_map
, n
, new, rte_update_pool
, &fec
) < 0)
1550 rte_trace_in(D_FILTERS
, c
, new, "invalid");
1551 stats
->imp_updates_invalid
++;
1556 if (!rta_is_cached(new->attrs
)) /* Need to copy attributes */
1557 new->attrs
= rta_lookup(new->attrs
);
1558 new->flags
|= REF_COW
;
1560 /* Use the actual struct network, not the dummy one */
1561 nn
= net_get(c
->table
, n
);
1566 stats
->imp_withdraws_received
++;
1568 if (!(nn
= net_find(c
->table
, n
)) || !src
)
1570 stats
->imp_withdraws_ignored
++;
1571 rte_update_unlock();
1577 /* And recalculate the best route */
1578 rte_recalculate(c
, nn
, new, src
);
1581 mpls_handle_rte_cleanup(p
->mpls_map
, &fec
);
1583 rte_update_unlock();
1589 if (nn
= net_find(c
->table
, n
))
1592 rte_update_unlock();
1595 /* Independent call to rte_announce(), used from next hop
1596 recalculation, outside of rte_update(). new must be non-NULL */
1598 rte_announce_i(rtable
*tab
, uint type
, net
*net
, rte
*new, rte
*old
,
1599 rte
*new_best
, rte
*old_best
)
1602 rte_announce(tab
, type
, net
, new, old
, new_best
, old_best
);
1603 rte_update_unlock();
1607 rte_discard(rte
*old
) /* Non-filtered route deletion, used during garbage collection */
1610 rte_recalculate(old
->sender
, old
->net
, NULL
, old
->src
);
1611 rte_update_unlock();
1614 /* Modify existing route by protocol hook, used for long-lived graceful restart */
1616 rte_modify(rte
*old
)
1620 rte
*new = old
->sender
->proto
->rte_modify(old
, rte_update_pool
);
1625 if (!rta_is_cached(new->attrs
))
1626 new->attrs
= rta_lookup(new->attrs
);
1627 new->flags
= (old
->flags
& ~REF_MODIFY
) | REF_COW
;
1630 rte_recalculate(old
->sender
, old
->net
, new, old
->src
);
1633 rte_update_unlock();
1636 /* Check rtable for best route to given net whether it would be exported do p */
1638 rt_examine(rtable
*t
, net_addr
*a
, struct channel
*c
, const struct filter
*filter
)
1640 struct proto
*p
= c
->proto
;
1641 net
*n
= net_find(t
, a
);
1642 rte
*rt
= n
? n
->routes
: NULL
;
1644 if (!rte_is_valid(rt
))
1649 /* Rest is stripped down export_filter() */
1650 int v
= p
->preexport
? p
->preexport(c
, rt
) : 0;
1651 if (v
== RIC_PROCESS
)
1652 v
= (f_run(filter
, &rt
, rte_update_pool
, FF_SILENT
) <= F_ACCEPT
);
1654 /* Discard temporary rte */
1655 if (rt
!= n
->routes
)
1658 rte_update_unlock();
1665 * rt_refresh_begin - start a refresh cycle
1666 * @t: related routing table
1667 * @c related channel
1669 * This function starts a refresh cycle for given routing table and announce
1670 * hook. The refresh cycle is a sequence where the protocol sends all its valid
1671 * routes to the routing table (by rte_update()). After that, all protocol
1672 * routes (more precisely routes with @c as @sender) not sent during the
1673 * refresh cycle but still in the table from the past are pruned. This is
1674 * implemented by marking all related routes as stale by REF_STALE flag in
1675 * rt_refresh_begin(), then marking all related stale routes with REF_DISCARD
1676 * flag in rt_refresh_end() and then removing such routes in the prune loop.
1679 rt_refresh_begin(rtable
*t
, struct channel
*c
)
1681 if (c
->debug
& D_EVENTS
)
1682 log(L_TRACE
"%s.%s: Route refresh begin", c
->proto
->name
, c
->name
);
1684 FIB_WALK(&t
->fib
, net
, n
)
1687 for (e
= n
->routes
; e
; e
= e
->next
)
1689 e
->flags
|= REF_STALE
;
1695 * rt_refresh_end - end a refresh cycle
1696 * @t: related routing table
1697 * @c: related channel
1699 * This function ends a refresh cycle for given routing table and announce
1700 * hook. See rt_refresh_begin() for description of refresh cycles.
1703 rt_refresh_end(rtable
*t
, struct channel
*c
)
1705 if (c
->debug
& D_EVENTS
)
1706 log(L_TRACE
"%s.%s: Route refresh end", c
->proto
->name
, c
->name
);
1710 FIB_WALK(&t
->fib
, net
, n
)
1713 for (e
= n
->routes
; e
; e
= e
->next
)
1714 if ((e
->sender
== c
) && (e
->flags
& REF_STALE
))
1716 e
->flags
|= REF_DISCARD
;
1723 rt_schedule_prune(t
);
1727 rt_modify_stale(rtable
*t
, struct channel
*c
)
1731 FIB_WALK(&t
->fib
, net
, n
)
1734 for (e
= n
->routes
; e
; e
= e
->next
)
1735 if ((e
->sender
== c
) && (e
->flags
& REF_STALE
) && !(e
->flags
& REF_FILTERED
))
1737 e
->flags
|= REF_MODIFY
;
1744 rt_schedule_prune(t
);
1748 * rte_dump - dump a route
1749 * @e: &rte to be dumped
1751 * This functions dumps contents of a &rte to debug output.
1754 rte_dump(struct dump_request
*dreq
, rte
*e
)
1757 RDUMP("%-1N ", n
->n
.addr
);
1758 RDUMP("PF=%02x ", e
->pflags
);
1759 rta_dump(dreq
, e
->attrs
);
1764 * rt_dump - dump a routing table
1765 * @t: routing table to be dumped
1767 * This function dumps contents of a given routing table to debug output.
1770 rt_dump(struct dump_request
*dreq
, rtable
*t
)
1772 RDUMP("Dump of routing table <%s>\n", t
->name
);
1776 FIB_WALK(&t
->fib
, net
, n
)
1779 for(e
=n
->routes
; e
; e
=e
->next
)
1787 * rt_dump_all - dump all routing tables
1789 * This function dumps contents of all routing tables to debug output.
1792 rt_dump_all(struct dump_request
*dreq
)
1797 WALK_LIST2(t
, n
, routing_tables
, n
)
1802 rt_schedule_hcu(rtable
*tab
)
1804 if (tab
->hcu_scheduled
)
1807 tab
->hcu_scheduled
= 1;
1808 ev_schedule(tab
->rt_event
);
1812 rt_schedule_nhu(rtable
*tab
)
1814 if (tab
->nhu_state
== NHU_CLEAN
)
1815 ev_schedule(tab
->rt_event
);
1818 * NHU_CLEAN -> NHU_SCHEDULED
1819 * NHU_RUNNING -> NHU_DIRTY
1821 tab
->nhu_state
|= NHU_SCHEDULED
;
1825 rt_schedule_prune(rtable
*tab
)
1827 if (tab
->prune_state
== 0)
1828 ev_schedule(tab
->rt_event
);
1830 /* state change 0->1, 2->3 */
1831 tab
->prune_state
|= 1;
1842 if (tab
->hcu_scheduled
)
1843 rt_update_hostcache(tab
);
1846 rt_next_hop_update(tab
);
1848 if (tab
->prune_state
)
1849 rt_prune_table(tab
);
1851 rt_unlock_table(tab
);
1856 rt_prune_timer(timer
*t
)
1858 rtable
*tab
= t
->data
;
1860 if (tab
->gc_counter
>= tab
->config
->gc_threshold
)
1861 rt_schedule_prune(tab
);
1865 rt_kick_prune_timer(rtable
*tab
)
1867 /* Return if prune is already scheduled */
1868 if (tm_active(tab
->prune_timer
) || (tab
->prune_state
& 1))
1871 /* Randomize GC period to +/- 50% */
1872 btime gc_period
= tab
->config
->gc_period
;
1873 gc_period
= (gc_period
/ 2) + (random_u32() % (uint
) gc_period
);
1874 tm_start(tab
->prune_timer
, gc_period
);
1879 rt_settled_time(rtable
*tab
)
1881 ASSUME(tab
->base_settle_time
!= 0);
1883 return MIN(tab
->last_rt_change
+ tab
->config
->min_settle_time
,
1884 tab
->base_settle_time
+ tab
->config
->max_settle_time
);
1888 rt_settle_timer(timer
*t
)
1890 rtable
*tab
= t
->data
;
1892 if (!tab
->base_settle_time
)
1895 btime settled_time
= rt_settled_time(tab
);
1896 if (current_time() < settled_time
)
1898 tm_set(tab
->settle_timer
, settled_time
);
1903 tab
->base_settle_time
= 0;
1905 struct rt_subscription
*s
;
1906 WALK_LIST(s
, tab
->subscribers
)
1911 rt_kick_settle_timer(rtable
*tab
)
1913 tab
->base_settle_time
= current_time();
1915 if (!tab
->settle_timer
)
1916 tab
->settle_timer
= tm_new_init(tab
->rp
, rt_settle_timer
, tab
, 0, 0);
1918 if (!tm_active(tab
->settle_timer
))
1919 tm_set(tab
->settle_timer
, rt_settled_time(tab
));
1923 rt_schedule_notify(rtable
*tab
)
1925 if (EMPTY_LIST(tab
->subscribers
))
1928 if (tab
->base_settle_time
)
1931 rt_kick_settle_timer(tab
);
1935 rt_subscribe(rtable
*tab
, struct rt_subscription
*s
)
1939 add_tail(&tab
->subscribers
, &s
->n
);
1943 rt_unsubscribe(struct rt_subscription
*s
)
1946 rt_unlock_table(s
->tab
);
1949 static struct rt_flowspec_link
*
1950 rt_flowspec_find_link(rtable
*src
, rtable
*dst
)
1952 struct rt_flowspec_link
*ln
;
1953 WALK_LIST(ln
, src
->flowspec_links
)
1954 if ((ln
->src
== src
) && (ln
->dst
== dst
))
1961 rt_flowspec_link(rtable
*src
, rtable
*dst
)
1963 ASSERT(rt_is_ip(src
));
1964 ASSERT(rt_is_flow(dst
));
1966 struct rt_flowspec_link
*ln
= rt_flowspec_find_link(src
, dst
);
1973 ln
= mb_allocz(src
->rp
, sizeof(struct rt_flowspec_link
));
1976 add_tail(&src
->flowspec_links
, &ln
->n
);
1983 rt_flowspec_unlink(rtable
*src
, rtable
*dst
)
1985 struct rt_flowspec_link
*ln
= rt_flowspec_find_link(src
, dst
);
1987 ASSERT(ln
&& (ln
->uc
> 0));
1996 rt_unlock_table(src
);
1997 rt_unlock_table(dst
);
2002 rt_flowspec_notify(rtable
*src
, net
*net
)
2004 /* Only IP tables are src links */
2005 ASSERT(rt_is_ip(src
));
2007 struct rt_flowspec_link
*ln
;
2008 WALK_LIST(ln
, src
->flowspec_links
)
2010 rtable
*dst
= ln
->dst
;
2011 ASSERT(rt_is_flow(dst
));
2013 /* No need to inspect it further if recalculation is already active */
2014 if ((dst
->nhu_state
== NHU_SCHEDULED
) || (dst
->nhu_state
== NHU_DIRTY
))
2017 if (trie_match_net(dst
->flowspec_trie
, net
->n
.addr
))
2018 rt_schedule_nhu(dst
);
2023 rt_flowspec_reset_trie(rtable
*tab
)
2025 linpool
*lp
= tab
->flowspec_trie
->lp
;
2026 int ipv4
= tab
->flowspec_trie
->ipv4
;
2029 tab
->flowspec_trie
= f_new_trie(lp
, 0);
2030 tab
->flowspec_trie
->ipv4
= ipv4
;
2034 rt_free(resource
*_r
)
2036 rtable
*r
= (rtable
*) _r
;
2038 DBG("Deleting routing table %s\n", r
->name
);
2039 ASSERT_DIE(r
->use_count
== 0);
2044 r
->config
->table
= NULL
;
2048 rt_free_hostcache(r
);
2050 /* Freed automagically by the resource pool
2052 hmap_free(&r->id_map);
2054 rfree(r->settle_timer);
2060 rt_res_dump(struct dump_request
*dreq
, resource
*_r
)
2062 rtable
*r
= (rtable
*) _r
;
2063 RDUMP("name \"%s\", addr_type=%s, rt_count=%u, use_count=%d\n",
2064 r
->name
, net_label
[r
->addr_type
], r
->rt_count
, r
->use_count
);
2067 static struct resclass rt_class
= {
2068 .name
= "Routing table",
2069 .size
= sizeof(struct rtable
),
2071 .dump
= rt_res_dump
,
2077 rt_setup(pool
*pp
, struct rtable_config
*cf
)
2079 pool
*p
= rp_newf(pp
, "Routing table %s", cf
->name
);
2081 rtable
*t
= ralloc(p
, &rt_class
);
2086 t
->addr_type
= cf
->addr_type
;
2087 t
->debug
= cf
->debug
;
2089 fib_init(&t
->fib
, p
, t
->addr_type
, sizeof(net
), OFFSETOF(net
, n
), 0, NULL
);
2093 t
->trie
= f_new_trie(lp_new_default(p
), 0);
2094 t
->trie
->ipv4
= net_val_match(t
->addr_type
, NB_IP4
| NB_VPN4
| NB_ROA4
);
2096 t
->fib
.init
= net_init_with_trie
;
2099 init_list(&t
->channels
);
2100 init_list(&t
->flowspec_links
);
2101 init_list(&t
->subscribers
);
2103 hmap_init(&t
->id_map
, p
, 1024);
2104 hmap_set(&t
->id_map
, 0);
2106 if (!(t
->internal
= cf
->internal
))
2108 t
->rt_event
= ev_new_init(p
, rt_event
, t
);
2109 t
->prune_timer
= tm_new_init(p
, rt_prune_timer
, t
, 0, 0);
2110 t
->last_rt_change
= t
->gc_time
= current_time();
2114 t
->flowspec_trie
= f_new_trie(lp_new_default(p
), 0);
2115 t
->flowspec_trie
->ipv4
= (t
->addr_type
== NET_FLOW4
);
2123 * rt_init - initialize routing tables
2125 * This function is called during BIRD startup. It initializes the
2126 * routing table module.
2132 rt_table_pool
= rp_new(&root_pool
, "Routing tables");
2133 rte_update_pool
= lp_new_default(rt_table_pool
);
2134 rte_slab
= sl_new(rt_table_pool
, sizeof(rte
));
2135 init_list(&routing_tables
);
2140 * rt_prune_table - prune a routing table
2142 * The prune loop scans routing tables and removes routes belonging to flushing
2143 * protocols, discarded routes and also stale network entries. It is called from
2144 * rt_event(). The event is rescheduled if the current iteration do not finish
2145 * the table. The pruning is directed by the prune state (@prune_state),
2146 * specifying whether the prune cycle is scheduled or running, and there
2147 * is also a persistent pruning iterator (@prune_fit).
2149 * The prune loop is used also for channel flushing. For this purpose, the
2150 * channels to flush are marked before the iteration and notified after the
2154 rt_prune_table(rtable
*tab
)
2156 struct fib_iterator
*fit
= &tab
->prune_fit
;
2162 DBG("Pruning route table %s\n", tab
->name
);
2164 fib_check(&tab
->fib
);
2167 if (tab
->prune_state
== 0)
2170 if (tab
->prune_state
== 1)
2172 /* Mark channels to flush */
2173 WALK_LIST2(c
, n
, tab
->channels
, table_node
)
2174 if (c
->channel_state
== CS_FLUSHING
)
2175 c
->flush_active
= 1;
2177 FIB_ITERATE_INIT(fit
, &tab
->fib
);
2178 tab
->prune_state
= 2;
2180 tab
->gc_counter
= 0;
2181 tab
->gc_time
= current_time();
2183 if (tab
->prune_trie
)
2185 /* Init prefix trie pruning */
2186 tab
->trie_new
= f_new_trie(lp_new_default(tab
->rp
), 0);
2187 tab
->trie_new
->ipv4
= tab
->trie
->ipv4
;
2192 FIB_ITERATE_START(&tab
->fib
, fit
, net
, n
)
2199 FIB_ITERATE_PUT(fit
);
2200 ev_schedule(tab
->rt_event
);
2204 for (e
=n
->routes
; e
; e
=e
->next
)
2206 if (e
->sender
->flush_active
|| (e
->flags
& REF_DISCARD
))
2214 if (e
->flags
& REF_MODIFY
)
2223 if (!n
->routes
) /* Orphaned FIB entry */
2225 FIB_ITERATE_PUT(fit
);
2226 fib_delete(&tab
->fib
, n
);
2232 trie_add_prefix(tab
->trie_new
, n
->n
.addr
, n
->n
.addr
->pxlen
, n
->n
.addr
->pxlen
);
2239 fib_check(&tab
->fib
);
2242 /* state change 2->0, 3->1 */
2243 tab
->prune_state
&= 1;
2247 /* Finish prefix trie pruning */
2249 if (!tab
->trie_lock_count
)
2251 rfree(tab
->trie
->lp
);
2255 ASSERT(!tab
->trie_old
);
2256 tab
->trie_old
= tab
->trie
;
2257 tab
->trie_old_lock_count
= tab
->trie_lock_count
;
2258 tab
->trie_lock_count
= 0;
2261 tab
->trie
= tab
->trie_new
;
2262 tab
->trie_new
= NULL
;
2263 tab
->prune_trie
= 0;
2267 /* Schedule prefix trie pruning */
2268 if (tab
->trie
&& !tab
->trie_old
&& (tab
->trie
->prefix_count
> (2 * tab
->fib
.entries
)))
2270 /* state change 0->1, 2->3 */
2271 tab
->prune_state
|= 1;
2272 tab
->prune_trie
= 1;
2276 if (tab
->prune_state
> 0)
2277 ev_schedule(tab
->rt_event
);
2279 /* FIXME: This should be handled in a better way */
2282 /* Close flushed channels */
2283 WALK_LIST2_DELSAFE(c
, n
, x
, tab
->channels
, table_node
)
2284 if (c
->flush_active
)
2286 c
->flush_active
= 0;
2287 channel_set_state(c
, CS_DOWN
);
2294 * rt_lock_trie - lock a prefix trie of a routing table
2295 * @tab: routing table with prefix trie to be locked
2297 * The prune loop may rebuild the prefix trie and invalidate f_trie_walk_state
2298 * structures. Therefore, asynchronous walks should lock the prefix trie using
2299 * this function. That allows the prune loop to rebuild the trie, but postpones
2300 * its freeing until all walks are done (unlocked by rt_unlock_trie()).
2302 * Return a current trie that will be locked, the value should be passed back to
2303 * rt_unlock_trie() for unlocking.
2307 rt_lock_trie(rtable
*tab
)
2311 tab
->trie_lock_count
++;
2316 * rt_unlock_trie - unlock a prefix trie of a routing table
2317 * @tab: routing table with prefix trie to be locked
2318 * @trie: value returned by matching rt_lock_trie()
2320 * Done for trie locked by rt_lock_trie() after walk over the trie is done.
2321 * It may free the trie and schedule next trie pruning.
2324 rt_unlock_trie(rtable
*tab
, struct f_trie
*trie
)
2328 if (trie
== tab
->trie
)
2330 /* Unlock the current prefix trie */
2331 ASSERT(tab
->trie_lock_count
);
2332 tab
->trie_lock_count
--;
2334 else if (trie
== tab
->trie_old
)
2336 /* Unlock the old prefix trie */
2337 ASSERT(tab
->trie_old_lock_count
);
2338 tab
->trie_old_lock_count
--;
2340 /* Free old prefix trie that is no longer needed */
2341 if (!tab
->trie_old_lock_count
)
2343 rfree(tab
->trie_old
->lp
);
2344 tab
->trie_old
= NULL
;
2346 /* Kick prefix trie pruning that was postponed */
2347 if (tab
->trie
&& (tab
->trie
->prefix_count
> (2 * tab
->fib
.entries
)))
2349 tab
->prune_trie
= 1;
2350 rt_schedule_prune(tab
);
2355 log(L_BUG
"Invalid arg to rt_unlock_trie()");
2360 rt_preconfig(struct config
*c
)
2362 init_list(&c
->tables
);
2364 rt_new_table(cf_get_symbol(c
, "master4"), NET_IP4
);
2365 rt_new_table(cf_get_symbol(c
, "master6"), NET_IP6
);
2369 rt_postconfig(struct config
*c
)
2371 uint num_tables
= list_length(&c
->tables
);
2372 btime def_gc_period
= 400 MS
* num_tables
;
2373 def_gc_period
= MAX(def_gc_period
, 10 S
);
2374 def_gc_period
= MIN(def_gc_period
, 600 S
);
2376 struct rtable_config
*rc
;
2377 WALK_LIST(rc
, c
->tables
)
2378 if (rc
->gc_period
== (uint
) -1)
2379 rc
->gc_period
= (uint
) def_gc_period
;
2384 * Some functions for handing internal next hop updates
2385 * triggered by rt_schedule_nhu().
2389 rta_apply_hostentry(rta
*a
, struct hostentry
*he
, mpls_label_stack
*mls
)
2393 a
->igp_metric
= he
->igp_metric
;
2395 if (a
->dest
!= RTD_UNICAST
)
2399 a
->nh
= (struct nexthop
) {};
2401 { /* Store the label stack for later changes */
2402 a
->nh
.labels_orig
= a
->nh
.labels
= mls
->len
;
2403 memcpy(a
->nh
.label
, mls
->stack
, mls
->len
* sizeof(u32
));
2408 if (((!mls
) || (!mls
->len
)) && he
->nexthop_linkable
)
2409 { /* Just link the nexthop chain, no label append happens. */
2410 memcpy(&(a
->nh
), &(he
->src
->nh
), nexthop_size(&(he
->src
->nh
)));
2414 struct nexthop
*nhp
= NULL
, *nhr
= NULL
;
2415 int skip_nexthop
= 0;
2417 for (struct nexthop
*nh
= &(he
->src
->nh
); nh
; nh
= nh
->next
)
2424 nhp
= (nhp
? (nhp
->next
= lp_alloc(rte_update_pool
, NEXTHOP_MAX_SIZE
)) : &(a
->nh
));
2427 memset(nhp
, 0, NEXTHOP_MAX_SIZE
);
2428 nhp
->iface
= nh
->iface
;
2429 nhp
->weight
= nh
->weight
;
2433 nhp
->labels
= nh
->labels
+ mls
->len
;
2434 nhp
->labels_orig
= mls
->len
;
2435 if (nhp
->labels
<= MPLS_MAX_LABEL_STACK
)
2437 memcpy(nhp
->label
, nh
->label
, nh
->labels
* sizeof(u32
)); /* First the hostentry labels */
2438 memcpy(&(nhp
->label
[nh
->labels
]), mls
->stack
, mls
->len
* sizeof(u32
)); /* Then the bottom labels */
2442 log(L_WARN
"Sum of label stack sizes %d + %d = %d exceedes allowed maximum (%d)",
2443 nh
->labels
, mls
->len
, nhp
->labels
, MPLS_MAX_LABEL_STACK
);
2448 else if (nh
->labels
)
2450 nhp
->labels
= nh
->labels
;
2451 nhp
->labels_orig
= 0;
2452 memcpy(nhp
->label
, nh
->label
, nh
->labels
* sizeof(u32
));
2455 if (ipa_nonzero(nh
->gw
))
2457 nhp
->gw
= nh
->gw
; /* Router nexthop */
2458 nhp
->flags
|= (nh
->flags
& RNF_ONLINK
);
2460 else if (!(nh
->iface
->flags
& IF_MULTIACCESS
) || (nh
->iface
->flags
& IF_LOOPBACK
))
2461 nhp
->gw
= IPA_NONE
; /* PtP link - no need for nexthop */
2462 else if (ipa_nonzero(he
->link
))
2463 nhp
->gw
= he
->link
; /* Device nexthop with link-local address known */
2465 nhp
->gw
= he
->addr
; /* Device nexthop with link-local address unknown */
2473 a
->dest
= RTD_UNREACHABLE
;
2474 log(L_WARN
"No valid nexthop remaining, setting route unreachable");
static inline int
rta_next_hop_outdated(rta *a)
{
  struct hostentry *he = a->hostentry;

  if (!he)
    return 0;

  if (!he->src)
    return a->dest != RTD_UNREACHABLE;

  return (a->dest != he->dest) || (a->igp_metric != he->igp_metric) ||
    (!he->nexthop_linkable) || !nexthop_same(&(a->nh), &(he->src->nh));
}
static rte *
rt_next_hop_update_rte(rtable *tab UNUSED, rte *old)
{
  if (!rta_next_hop_outdated(old->attrs))
    return NULL;

  rta *a = alloca(RTA_MAX_SIZE);
  memcpy(a, old->attrs, rta_size(old->attrs));

  mpls_label_stack mls = { .len = a->nh.labels_orig };
  memcpy(mls.stack, &a->nh.label[a->nh.labels - mls.len], mls.len * sizeof(u32));

  rta_apply_hostentry(a, old->attrs->hostentry, &mls);
  a->cached = 0;

  rte *e = sl_alloc(rte_slab);
  memcpy(e, old, sizeof(rte));
  e->attrs = rta_lookup(a);
  rt_lock_source(e->src);

  return e;
}
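
/*
 * Illustrative note with assumed numbers (not upstream code): labels_orig
 * records how many bottom-of-stack labels belong to the route itself;
 * the rest came from the hostentry. With a->nh.labels == 3 and
 * a->nh.labels_orig == 1, the route's single label sits in a->nh.label[2],
 * so the memcpy() above rebuilds mls = { .len = 1 } from it, and
 * rta_apply_hostentry() then re-applies the (possibly changed) hostentry
 * labels on top, skipping the nexthop only if the combined stack would
 * exceed MPLS_MAX_LABEL_STACK.
 */
static inline void
example_extract_orig_labels(const rta *a, mpls_label_stack *mls)
{
  mls->len = a->nh.labels_orig;
  memcpy(mls->stack, &a->nh.label[a->nh.labels - mls->len],
         mls->len * sizeof(u32));
}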
#ifdef CONFIG_BGP

static inline int
net_flow_has_dst_prefix(const net_addr *n)
{
  ASSUME(net_is_flow(n));

  if (n->pxlen)
    return 1;

  if (n->type == NET_FLOW4)
  {
    const net_addr_flow4 *n4 = (void *) n;
    return (n4->length > sizeof(net_addr_flow4)) && (n4->data[0] == FLOW_TYPE_DST_PREFIX);
  }
  else
  {
    const net_addr_flow6 *n6 = (void *) n;
    return (n6->length > sizeof(net_addr_flow6)) && (n6->data[0] == FLOW_TYPE_DST_PREFIX);
  }
}
static inline int
rta_as_path_is_empty(rta *a)
{
  eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
  return !e || (as_path_getlen(e->u.ptr) == 0);
}
static u32
rta_get_first_asn(rta *a)
{
  eattr *e = ea_find(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_AS_PATH));
  u32 asn;

  return (e && as_path_get_first_regular(e->u.ptr, &asn)) ? asn : 0;
}
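
/*
 * Illustrative sketch (an assumption about how the helper is meant to be
 * used, not upstream code): for an AS_PATH like "65001 65002 65003" the
 * helper above returns 65001, the neighboring AS that sent the route; an
 * empty path or one starting with an AS_SET yields 0 and therefore fails
 * the origin comparisons in rt_flowspec_check() below.
 */
static inline int
example_same_origin_as(rta *flow_attrs, rta *best_attrs)
{
  u32 a = rta_get_first_asn(flow_attrs);
  u32 b = rta_get_first_asn(best_attrs);
  return a && b && (a == b);	/* both known and equal */
}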
static int
rt_flowspec_check(rtable *tab_ip, rtable *tab_flow, const net_addr *n, rta *a, int interior)
{
  ASSERT(rt_is_ip(tab_ip));
  ASSERT(rt_is_flow(tab_flow));
  ASSERT(tab_ip->trie);

  /* RFC 8955 6. a) Flowspec has defined dst prefix */
  if (!net_flow_has_dst_prefix(n))
    return 0;

  /* RFC 9117 4.1. Accept when AS_PATH is empty (i.e. locally originated) */
  if (interior && rta_as_path_is_empty(a))
    return 1;

  /* RFC 8955 6. b) Flowspec and its best-match route have the same originator */

  /* Find flowspec dst prefix */
  net_addr dst;
  if (n->type == NET_FLOW4)
    net_fill_ip4(&dst, net4_prefix(n), net4_pxlen(n));
  else
    net_fill_ip6(&dst, net6_prefix(n), net6_pxlen(n));

  /* Find best-match BGP unicast route for flowspec dst prefix */
  net *nb = net_route(tab_ip, &dst);
  rte *rb = nb ? nb->routes : NULL;

  /* Register prefix to trie for tracking further changes */
  int max_pxlen = (n->type == NET_FLOW4) ? IP4_MAX_PREFIX_LENGTH : IP6_MAX_PREFIX_LENGTH;
  trie_add_prefix(tab_flow->flowspec_trie, &dst, (nb ? nb->n.addr->pxlen : 0), max_pxlen);

  /* No best-match BGP route -> no flowspec */
  if (!rb || (rb->attrs->source != RTS_BGP))
    return 0;

  /* Find ORIGINATOR_ID values */
  u32 orig_a = ea_get_int(a->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);
  u32 orig_b = ea_get_int(rb->attrs->eattrs, EA_CODE(PROTOCOL_BGP, BA_ORIGINATOR_ID), 0);

  /* Originator is either ORIGINATOR_ID (if present), or BGP neighbor address (if not) */
  if ((orig_a != orig_b) || (!orig_a && !orig_b && !ipa_equal(a->from, rb->attrs->from)))
    return 0;

  /* Find ASN of the best-match route, for use in next checks */
  u32 asn_b = rta_get_first_asn(rb->attrs);
  if (!asn_b)
    return 0;

  /* RFC 9117 4.2. For EBGP, flowspec and its best-match route are from the same AS */
  if (!interior && (rta_get_first_asn(a) != asn_b))
    return 0;

  /* RFC 8955 6. c) More-specific routes are from the same AS as the best-match route */
  TRIE_WALK(tab_ip->trie, subnet, &dst)
  {
    net *nc = net_find_valid(tab_ip, &subnet);
    if (!nc)
      continue;

    rte *rc = nc->routes;
    if (rc->attrs->source != RTS_BGP)
      return 0;

    if (rta_get_first_asn(rc->attrs) != asn_b)
      return 0;
  }
  TRIE_WALK_END;

  return 1;
}

#endif /* CONFIG_BGP */
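
/*
 * Illustrative recap (not upstream code): the checks above boil down to
 * "does the flowspec rule come from whoever legitimately originates the
 * unicast destination prefix". A hedged helper showing the call shape,
 * guarded the same way as the function itself:
 */
#ifdef CONFIG_BGP
static inline int
example_flowspec_valid(rtable *tab_ip, rtable *tab_flow,
                       const net_addr *n, rta *a, int interior)
{
  /* 1 = valid per RFC 8955 sec. 6 and RFC 9117, 0 = invalid */
  return rt_flowspec_check(tab_ip, tab_flow, n, a, interior);
}
#endif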
static rte *
rt_flowspec_update_rte(rtable *tab, rte *r)
{
#ifdef CONFIG_BGP
  if ((r->attrs->source != RTS_BGP) || (r->sender->proto != r->src->proto))
    return NULL;

  struct bgp_channel *bc = (struct bgp_channel *) r->sender;
  if (!bc->base_table)
    return NULL;

  const net_addr *n = r->net->n.addr;
  struct bgp_proto *p = (void *) r->src->proto;
  int valid = rt_flowspec_check(bc->base_table, tab, n, r->attrs, p->is_interior);
  int dest = valid ? RTD_NONE : RTD_UNREACHABLE;

  if (dest == r->attrs->dest)
    return NULL;

  rta *a = alloca(RTA_MAX_SIZE);
  memcpy(a, r->attrs, rta_size(r->attrs));
  a->dest = dest;
  a->cached = 0;

  rte *new = sl_alloc(rte_slab);
  memcpy(new, r, sizeof(rte));
  new->attrs = rta_lookup(a);
  rt_lock_source(new->src);

  return new;
#else
  return NULL;
#endif
}
static int
rt_next_hop_update_net(rtable *tab, net *n)
{
  rte **k, *e, *new, *old_best, **new_best;
  int count = 0;
  int free_old_best = 0;

  old_best = n->routes;
  if (!old_best)
    return 0;

  for (k = &n->routes; e = *k; k = &e->next)
  {
    if (!net_is_flow(n->n.addr))
      new = rt_next_hop_update_rte(tab, e);
    else
      new = rt_flowspec_update_rte(tab, e);

    if (new)
    {
      *k = new;

      rte_trace_in(D_ROUTES, new->sender, new, "updated");
      rte_announce_i(tab, RA_ANY, n, new, e, NULL, NULL);

      /* Call a pre-comparison hook */
      /* Not really an efficient way to compute this */
      if (e->src->proto->rte_recalculate)
        e->src->proto->rte_recalculate(tab, n, new, e, NULL);

      if (e != old_best)
        rte_free_quick(e);
      else /* Freeing of the old best rte is postponed */
        free_old_best = 1;

      e = new;
      count++;
    }
  }

  if (!count)
    return 0;

  /* Find the new best route */
  new_best = NULL;
  for (k = &n->routes; e = *k; k = &e->next)
  {
    if (!new_best || rte_better(e, *new_best))
      new_best = k;
  }

  /* Relink the new best route to the first position */
  new = *new_best;
  if (new != n->routes)
  {
    *new_best = new->next;
    new->next = n->routes;
    n->routes = new;
  }

  /* Announce the new best route */
  if (new != old_best)
    rte_trace_in(D_ROUTES, new->sender, new, "updated [best]");

  /* Propagate changes */
  rte_announce_i(tab, RA_UNDEF, n, NULL, NULL, n->routes, old_best);

  if (free_old_best)
    rte_free_quick(old_best);

  return count;
}
static void
rt_next_hop_update(rtable *tab)
{
  struct fib_iterator *fit = &tab->nhu_fit;
  int max_feed = 32;

  if (tab->nhu_state == NHU_CLEAN)
    return;

  if (tab->nhu_state == NHU_SCHEDULED)
  {
    FIB_ITERATE_INIT(fit, &tab->fib);
    tab->nhu_state = NHU_RUNNING;

    if (tab->flowspec_trie)
      rt_flowspec_reset_trie(tab);
  }

  FIB_ITERATE_START(&tab->fib, fit, net, n)
  {
    if (max_feed <= 0)
    {
      FIB_ITERATE_PUT(fit);
      ev_schedule(tab->rt_event);
      return;
    }
    max_feed -= rt_next_hop_update_net(tab, n);
  }
  FIB_ITERATE_END;

  /* State change:
   *   NHU_DIRTY   -> NHU_SCHEDULED
   *   NHU_RUNNING -> NHU_CLEAN
   */
  tab->nhu_state &= 1;

  if (tab->nhu_state != NHU_CLEAN)
    ev_schedule(tab->rt_event);
}
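
/*
 * Illustrative sketch of the state mask above (values as defined in
 * nest/route.h, repeated here only for the example): NHU_CLEAN = 0,
 * NHU_SCHEDULED = 1, NHU_RUNNING = 2, NHU_DIRTY = 3. Masking with 1
 * drops the "running" bit, so RUNNING (2) becomes CLEAN (0), while
 * DIRTY (3, i.e. running and re-requested) becomes SCHEDULED (1).
 */
static inline uint
example_nhu_next_state(uint state)
{
  return state & 1;	/* 2 -> 0, 3 -> 1 */
}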
struct rtable_config *
rt_new_table(struct symbol *s, uint addr_type)
{
  /* Hack that allows one to 'redefine' the master table */
  if ((s->class == SYM_TABLE) &&
      (s->table == new_config->def_tables[addr_type]) &&
      ((addr_type == NET_IP4) || (addr_type == NET_IP6)))
    return s->table;

  struct rtable_config *c = cfg_allocz(sizeof(struct rtable_config));

  cf_define_symbol(new_config, s, SYM_TABLE, table, c);
  c->name = s->name;
  c->addr_type = addr_type;
  c->gc_threshold = 1000;
  c->gc_period = (uint) -1;	/* set in rt_postconfig() */
  c->min_settle_time = 1 S;
  c->max_settle_time = 20 S;
  c->debug = new_config->table_default_debug;

  add_tail(&new_config->tables, &c->n);

  /* First table of each type is kept as default */
  if (!new_config->def_tables[addr_type])
    new_config->def_tables[addr_type] = c;

  return c;
}
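
/*
 * Usage sketch (hypothetical helper, mirroring rt_preconfig() above):
 * a `table mytab4;` statement in the configuration effectively reduces
 * to this call once the parser has resolved the symbol. The first table
 * of each network type also becomes the default in def_tables[], which
 * is what makes master4/master6 special.
 */
static inline struct rtable_config *
example_define_table(struct config *c, const char *name, uint addr_type)
{
  return rt_new_table(cf_get_symbol(c, name), addr_type);
}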
/**
 * rt_lock_table - lock a routing table
 * @r: routing table to be locked
 *
 * Lock a routing table, because it's in use by a protocol,
 * preventing it from being freed when it gets undefined in a new
 * configuration.
 */
void
rt_lock_table(rtable *r)
{
  r->use_count++;
}
/**
 * rt_unlock_table - unlock a routing table
 * @r: routing table to be unlocked
 *
 * Unlock a routing table formerly locked by rt_lock_table(),
 * that is, decrease its use count and delete it if it's scheduled
 * for deletion by configuration changes.
 */
void
rt_unlock_table(rtable *r)
{
  if (!--r->use_count && r->deleted)
  {
    struct config *conf = r->deleted;

    /* Delete the routing table by freeing its pool */
    rt_shutdown(r);
    config_del_obstacle(conf);
  }
}
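
/*
 * Usage sketch (illustrative only): every user of a table pairs these
 * calls, locking the table when it starts using it and unlocking when
 * it stops, so a table removed from the configuration survives exactly
 * until its last user disconnects.
 */
static inline void
example_table_usage(rtable *t)
{
  rt_lock_table(t);
  /* ... use the table: lookups, imports, exports ... */
  rt_unlock_table(t);	/* may free the table if it was undefined meanwhile */
}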
static int
rt_reconfigure(rtable *tab, struct rtable_config *new, struct rtable_config *old)
{
  if ((new->addr_type != old->addr_type) ||
      (new->sorted != old->sorted) ||
      (new->trie_used != old->trie_used))
    return 0;

  DBG("\t%s: same\n", new->name);
  new->table = tab;
  tab->name = new->name;
  tab->config = new;
  tab->debug = new->debug;

  return 1;
}
static struct rtable_config *
rt_find_table_config(struct config *cf, char *name)
{
  struct symbol *sym = cf_find_symbol(cf, name);
  return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL;
}
/**
 * rt_commit - commit new routing table configuration
 * @new: new configuration
 * @old: original configuration or %NULL if it's boot time config
 *
 * Scan differences between @old and @new configuration and modify
 * the routing tables according to these changes. If @new defines a
 * previously unknown table, create it; if it omits a table existing
 * in @old, schedule it for deletion (it gets deleted when all protocols
 * disconnect from it by calling rt_unlock_table()); if it exists
 * in both configurations, leave it unchanged.
 */
void
rt_commit(struct config *new, struct config *old)
{
  struct rtable_config *o, *r;

  DBG("rt_commit:\n");
  if (old)
  {
    WALK_LIST(o, old->tables)
    {
      rtable *tab = o->table;
      if (tab->deleted)
        continue;

      r = rt_find_table_config(new, o->name);
      if (r && !new->shutdown && rt_reconfigure(tab, r, o))
        continue;

      DBG("\t%s: deleted\n", o->name);
      tab->deleted = old;
      config_add_obstacle(old);
      rt_lock_table(tab);
      rt_unlock_table(tab);
    }
  }

  WALK_LIST(r, new->tables)
    if (!r->table)
    {
      r->table = rt_setup(rt_table_pool, r);
      DBG("\t%s: created\n", r->name);
      add_tail(&routing_tables, &r->table->n);
    }
  DBG("\tdone\n");
}
static inline void
do_feed_channel(struct channel *c, net *n, rte *e)
{
  rte_update_lock();
  if (c->ra_mode == RA_ACCEPTED)
    rt_notify_accepted(c, n, NULL, NULL, c->refeeding);
  else if (c->ra_mode == RA_MERGED)
    rt_notify_merged(c, n, NULL, NULL, e, e, c->refeeding);
  else
    rt_notify_basic(c, n, e, e, c->refeeding);
  rte_update_unlock();
}
/**
 * rt_feed_channel - advertise all routes to a channel
 * @c: channel to be fed
 *
 * This function performs one pass of advertisement of routes to a channel that
 * is in the ES_FEEDING state. It is called by the protocol code as long as it
 * has something to do. (We avoid transferring all the routes in a single pass
 * in order not to monopolize CPU time.)
 */
int
rt_feed_channel(struct channel *c)
{
  struct fib_iterator *fit = &c->feed_fit;
  int max_feed = 256;

  ASSERT(c->export_state == ES_FEEDING);

  if (!c->feed_active)
  {
    FIB_ITERATE_INIT(fit, &c->table->fib);
    c->feed_active = 1;
  }

  FIB_ITERATE_START(&c->table->fib, fit, net, n)
  {
    rte *e = n->routes;
    if (max_feed <= 0)
    {
      FIB_ITERATE_PUT(fit);
      return 0;
    }

    if ((c->ra_mode == RA_OPTIMAL) ||
        (c->ra_mode == RA_ACCEPTED) ||
        (c->ra_mode == RA_MERGED))
      if (rte_is_valid(e))
      {
        /* In the meantime, the protocol may have gone down */
        if (c->export_state != ES_FEEDING)
          goto done;

        do_feed_channel(c, n, e);
        max_feed--;
      }

    if (c->ra_mode == RA_ANY)
      for (e = n->routes; e; e = e->next)
      {
        /* In the meantime, the protocol may have gone down */
        if (c->export_state != ES_FEEDING)
          goto done;

        if (!rte_is_valid(e))
          continue;

        do_feed_channel(c, n, e);
        max_feed--;
      }
  }
  FIB_ITERATE_END;

done:
  c->feed_active = 0;
  return 1;
}
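
/*
 * Caller sketch (hypothetical, simplified from the protocol code): the
 * channel's feed loop just keeps invoking rt_feed_channel() from its
 * event until the function reports completion by returning 1.
 */
static inline void
example_feed_loop(struct channel *c)
{
  if (!rt_feed_channel(c))
    return;	/* pass was split; re-run from the next event */

  /* feeding finished; the real code advances the export state here */
}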
/**
 * rt_feed_channel_abort - abort protocol feeding
 * @c: channel
 *
 * This function is called by the protocol code when the protocol stops or
 * ceases to exist during the feeding.
 */
void
rt_feed_channel_abort(struct channel *c)
{
  if (c->feed_active)
  {
    /* Unlink the iterator */
    fit_get(&c->table->fib, &c->feed_fit);
    c->feed_active = 0;
  }
}
int
rte_update_in(struct channel *c, const net_addr *n, rte *new, struct rte_src *src)
{
  struct rtable *tab = c->in_table;
  rte *old, **pos;
  net *net;

  if (new)
  {
    net = net_get(tab, n);

    if (!rta_is_cached(new->attrs))
      new->attrs = rta_lookup(new->attrs);
  }
  else
  {
    net = net_find(tab, n);

    if (!net)
      goto drop_withdraw;
  }

  /* Find the old rte */
  for (pos = &net->routes; old = *pos; pos = &old->next)
    if (old->src == src)
    {
      if (new && rte_same(old, new))
      {
        /* Refresh the old rte, continue with update to main rtable */
        if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
        {
          old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
          return 1;
        }

        goto drop_update;
      }

      /* Move iterator if needed */
      if (old == c->reload_next_rte)
        c->reload_next_rte = old->next;

      /* Remove the old rte */
      *pos = old->next;
      tab->rt_count--;
      break;
    }

  if (!new && !old)
    goto drop_withdraw;

  struct channel_limit *l = &c->rx_limit;
  if (l->action && !old && new)
  {
    if (tab->rt_count >= l->limit)
      channel_notify_limit(c, l, PLD_RX, tab->rt_count);

    if (l->state == PLS_BLOCKED)
    {
      /* Required by rte_trace_in() */
      new->net = net;

      rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
      goto drop_update;
    }
  }

  if (new)
  {
    /* Insert the new rte */
    rte *e = rte_do_cow(new);
    e->flags |= REF_COW;
    e->net = net;
    e->src = src;
    e->lastmod = current_time();

    /* Allocate a new route id */
    new->id = hmap_first_zero(&tab->id_map);
    hmap_set(&tab->id_map, new->id);
    e->id = new->id;

    /* Link the new rte */
    e->next = *pos;
    *pos = e;
    tab->rt_count++;
  }

  /* Announce the change to subscribers of the import table */
  rte_announce(tab, RA_ANY, net, new, old, NULL, NULL);

  /* Free the old rte */
  if (old)
  {
    hmap_clear(&tab->id_map, old->id);
    rte_free_quick(old);
  }

  if (!net->routes)
    fib_delete(&tab->fib, net);

  return 1;

drop_update:
  c->stats.imp_updates_received++;
  c->stats.imp_updates_ignored++;

  if (!net->routes)
    fib_delete(&tab->fib, net);

  return 0;

drop_withdraw:
  c->stats.imp_withdraws_received++;
  c->stats.imp_withdraws_ignored++;
  return 0;
}
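
/*
 * Caller sketch (a paraphrased assumption about the update path, not a
 * verbatim quote): channels with an import table route every update
 * through rte_update_in() first, and only continue to the main table
 * when it reports a real change.
 */
static inline int
example_import_table_gate(struct channel *c, const net_addr *n,
                          rte *new, struct rte_src *src)
{
  if (c->in_table && !rte_update_in(c, n, new, src))
    return 0;	/* ignored or unchanged, stop here */

  return 1;	/* continue with update to the main table */
}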
int
rt_reload_channel(struct channel *c)
{
  struct rtable *tab = c->in_table;
  struct fib_iterator *fit = &c->reload_fit;
  int max_feed = 64;

  ASSERT(c->channel_state == CS_UP);

  if (!c->reload_active)
  {
    FIB_ITERATE_INIT(fit, &tab->fib);
    c->reload_active = 1;
  }

  do
  {
    for (rte *e = c->reload_next_rte; e; e = e->next)
    {
      if (max_feed-- <= 0)
      {
        c->reload_next_rte = e;
        debug("%s channel reload burst split (max_feed=%d)", c->proto->name, max_feed);
        return 0;
      }

      rte_update2(c, e->net->n.addr, rte_do_cow(e), e->src);
    }

    c->reload_next_rte = NULL;

    FIB_ITERATE_START(&tab->fib, fit, net, n)
    {
      if (c->reload_next_rte = n->routes)
      {
        FIB_ITERATE_PUT_NEXT(fit, &tab->fib);
        break;
      }
    }
    FIB_ITERATE_END;
  }
  while (c->reload_next_rte);

  c->reload_active = 0;
  return 1;
}
void
rt_reload_channel_abort(struct channel *c)
{
  if (c->reload_active)
  {
    /* Unlink the iterator */
    fit_get(&c->in_table->fib, &c->reload_fit);
    c->reload_next_rte = NULL;
    c->reload_active = 0;
  }
}
void
rt_prune_sync(rtable *t, int all)
{
  struct fib_iterator fit;

  FIB_ITERATE_INIT(&fit, &t->fib);

again:
  FIB_ITERATE_START(&t->fib, &fit, net, n)
  {
    rte *e, **ee = &n->routes;

    while (e = *ee)
    {
      if (all || (e->flags & (REF_STALE | REF_DISCARD)))
      {
        *ee = e->next;
        rte_free_quick(e);
        t->rt_count--;
      }
      else
        ee = &e->next;
    }

    if (all || !n->routes)
    {
      FIB_ITERATE_PUT(&fit);
      fib_delete(&t->fib, n);
      goto again;
    }
  }
  FIB_ITERATE_END;
}
int
rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, int refeed)
{
  struct rtable *tab = c->out_table;
  struct rte_src *src;
  rte *old, **pos;
  net *net;

  if (new)
  {
    net = net_get(tab, n);
    src = new->src;

    if (!rta_is_cached(new->attrs))
      new->attrs = rta_lookup(new->attrs);
  }
  else
  {
    net = net_find(tab, n);
    src = old0->src;

    if (!net)
      goto drop_withdraw;
  }

  /* Find the old rte */
  for (pos = &net->routes; old = *pos; pos = &old->next)
    if ((c->ra_mode != RA_ANY) || (old->src == src))
    {
      if (new && rte_same(old, new))
      {
        /* REF_STALE / REF_DISCARD not used in export table */
        /*
        if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
        {
          old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
          return 1;
        }
        */

        goto drop_update;
      }

      /* Remove the old rte */
      *pos = old->next;
      rte_free_quick(old);
      tab->rt_count--;
      break;
    }

  if (!new)
  {
    if (!old)
      goto drop_withdraw;

    if (!net->routes)
      fib_delete(&tab->fib, net);

    return 1;
  }

  /* Insert the new rte */
  rte *e = rte_do_cow(new);
  e->flags |= REF_COW;
  e->net = net;
  e->src = src;
  e->lastmod = current_time();
  e->next = *pos;
  *pos = e;
  tab->rt_count++;
  return 1;

drop_update:
  return refeed;

drop_withdraw:
  return 0;
}
static inline u32
hc_hash(ip_addr a, rtable *dep)
{
  return ipa_hash(a) ^ ptr_hash(dep);
}
static inline void
hc_insert(struct hostcache *hc, struct hostentry *he)
{
  uint k = he->hash_key >> hc->hash_shift;
  he->next = hc->hash_table[k];
  hc->hash_table[k] = he;
}
static inline void
hc_remove(struct hostcache *hc, struct hostentry *he)
{
  struct hostentry **hep;
  uint k = he->hash_key >> hc->hash_shift;

  for (hep = &hc->hash_table[k]; *hep != he; hep = &(*hep)->next);
  *hep = he->next;
}
#define HC_DEF_ORDER 10
#define HC_HI_MARK *4
#define HC_HI_STEP 2
#define HC_HI_ORDER 16			/* Must be at most 16 */
#define HC_LO_MARK /5
#define HC_LO_STEP 2
#define HC_LO_ORDER 10
static void
hc_alloc_table(struct hostcache *hc, pool *p, unsigned order)
{
  uint hsize = 1 << order;
  hc->hash_order = order;
  hc->hash_shift = 32 - order;
  hc->hash_max = (order >= HC_HI_ORDER) ? ~0U : (hsize HC_HI_MARK);
  hc->hash_min = (order <= HC_LO_ORDER) ?  0U : (hsize HC_LO_MARK);

  hc->hash_table = mb_allocz(p, hsize * sizeof(struct hostentry *));
}
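
/*
 * Worked example of the sizing rules above (illustrative only): at the
 * default order 10 the table has 1024 buckets, grows once it holds more
 * than 1024 * 4 = 4096 entries, and never shrinks (hash_min is 0 at the
 * minimum order). After one HC_HI_STEP grow to order 12, the shrink
 * threshold becomes 4096 / 5 = 819 entries. Buckets are selected by the
 * top bits of the hash key: k >> hash_shift, i.e. k >> 22 for order 10.
 */
static inline void
example_hc_sizing(void)
{
  ASSERT(((1 << 10) HC_HI_MARK) == 4096);
  ASSERT(((1 << 12) HC_LO_MARK) == 819);
}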
static void
hc_resize(struct hostcache *hc, pool *p, unsigned new_order)
{
  struct hostentry **old_table = hc->hash_table;
  struct hostentry *he, *hen;
  uint old_size = 1 << hc->hash_order;
  uint i;

  hc_alloc_table(hc, p, new_order);
  for (i = 0; i < old_size; i++)
    for (he = old_table[i]; he != NULL; he = hen)
    {
      hen = he->next;
      hc_insert(hc, he);
    }
  mb_free(old_table);
}
static struct hostentry *
hc_new_hostentry(struct hostcache *hc, pool *p, ip_addr a, ip_addr ll, rtable *dep, unsigned k)
{
  struct hostentry *he = sl_alloc(hc->slab);

  *he = (struct hostentry) {
    .addr = a,
    .link = ll,
    .tab = dep,
    .hash_key = k,
  };

  add_tail(&hc->hostentries, &he->ln);
  hc_insert(hc, he);

  hc->hash_items++;
  if (hc->hash_items > hc->hash_max)
    hc_resize(hc, p, hc->hash_order + HC_HI_STEP);

  return he;
}
static void
hc_delete_hostentry(struct hostcache *hc, pool *p, struct hostentry *he)
{
  rta_free(he->src);

  rem_node(&he->ln);
  hc_remove(hc, he);
  sl_free(hc->slab, he);

  hc->hash_items--;
  if (hc->hash_items < hc->hash_min)
    hc_resize(hc, p, hc->hash_order - HC_LO_STEP);
}
static void
rt_init_hostcache(rtable *tab)
{
  struct hostcache *hc = mb_allocz(tab->rp, sizeof(struct hostcache));
  init_list(&hc->hostentries);

  hc->hash_items = 0;
  hc_alloc_table(hc, tab->rp, HC_DEF_ORDER);
  hc->slab = sl_new(tab->rp, sizeof(struct hostentry));

  hc->lp = lp_new(tab->rp);
  hc->trie = f_new_trie(hc->lp, 0);

  tab->hostcache = hc;
}
static void
rt_free_hostcache(rtable *tab)
{
  struct hostcache *hc = tab->hostcache;

  node *n;
  WALK_LIST(n, hc->hostentries)
  {
    struct hostentry *he = SKIP_BACK(struct hostentry, ln, n);
    rta_free(he->src);

    if (he->uc)
      log(L_ERR "Hostcache is not empty in table %s", tab->name);
  }

  /* Freed automagically by the resource pool
  rfree(hc->slab);
  rfree(hc->lp);
  mb_free(hc->hash_table);
  mb_free(hc);
  */
}
static void
rt_notify_hostcache(rtable *tab, net *net)
{
  if (tab->hcu_scheduled)
    return;

  if (trie_match_net(tab->hostcache->trie, net->n.addr))
    rt_schedule_hcu(tab);
}
static int
if_local_addr(ip_addr a, struct iface *i)
{
  struct ifa *b;

  WALK_LIST(b, i->addrs)
    if (ipa_equal(a, b->ip))
      return 1;

  return 0;
}
u32
rt_get_igp_metric(rte *rt)
{
  eattr *ea = ea_find(rt->attrs->eattrs, EA_GEN_IGP_METRIC);

  if (ea)
    return ea->u.data;

  if (rt->attrs->source == RTS_DEVICE)
    return 0;

  if (rt->src->proto->rte_igp_metric)
    return rt->src->proto->rte_igp_metric(rt);

  return IGP_METRIC_UNKNOWN;
}
static int
rt_update_hostentry(rtable *tab, struct hostentry *he)
{
  rta *old_src = he->src;
  int direct = 0;
  int pxlen = 0;

  /* Reset the hostentry */
  he->src = NULL;
  he->dest = RTD_UNREACHABLE;
  he->nexthop_linkable = 0;
  he->igp_metric = 0;

  net_addr he_addr;
  net_fill_ip_host(&he_addr, he->addr);
  net *n = net_route(tab, &he_addr);
  if (n)
  {
    rte *e = n->routes;
    rta *a = e->attrs;
    word pref = a->pref;

    for (rte *ee = n->routes; ee; ee = ee->next)
      if ((ee->attrs->pref >= pref) && ee->attrs->hostentry)
      {
        /* Recursive route should not depend on another recursive route */
        log(L_WARN "Next hop address %I resolvable through recursive route for %N",
            he->addr, n->n.addr);
        goto done;
      }

    pxlen = n->n.addr->pxlen;

    if (a->dest == RTD_UNICAST)
    {
      for (struct nexthop *nh = &(a->nh); nh; nh = nh->next)
        if (ipa_zero(nh->gw))
        {
          if (if_local_addr(he->addr, nh->iface))
          {
            /* The host address is a local address, this is not valid */
            log(L_WARN "Next hop address %I is a local address of iface %s",
                he->addr, nh->iface->name);
            goto done;
          }

          direct++;
        }
    }

    he->src = rta_clone(a);
    he->dest = a->dest;
    he->nexthop_linkable = !direct;
    he->igp_metric = rt_get_igp_metric(e);
  }

done:
  /* Add a prefix range to the trie */
  trie_add_prefix(tab->hostcache->trie, &he_addr, pxlen, he_addr.pxlen);

  rta_free(old_src);
  return old_src != he->src;
}
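
/*
 * Illustrative example with assumed addresses (not upstream code): if
 * next hop 192.0.2.1 currently resolves through a /24 route, the range
 * 24..32 for 192.0.2.1/32 is put into the trie, so both a change of the
 * /24 itself and the later appearance of any covering more-specific
 * route match trie_match_net() in rt_notify_hostcache() and retrigger
 * the resolution.
 */
static inline void
example_track_next_hop(struct hostcache *hc)
{
  net_addr_ip4 host = NET_ADDR_IP4(ip4_build(192, 0, 2, 1), 32);
  trie_add_prefix(hc->trie, (net_addr *) &host, 24, 32);
}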
static void
rt_update_hostcache(rtable *tab)
{
  struct hostcache *hc = tab->hostcache;
  struct hostentry *he;
  node *n, *x;

  /* Reset the trie */
  lp_flush(hc->lp);
  hc->trie = f_new_trie(hc->lp, 0);

  WALK_LIST_DELSAFE(n, x, hc->hostentries)
  {
    he = SKIP_BACK(struct hostentry, ln, n);
    if (!he->uc)
    {
      hc_delete_hostentry(hc, tab->rp, he);
      continue;
    }

    if (rt_update_hostentry(tab, he))
      rt_schedule_nhu(he->tab);
  }

  tab->hcu_scheduled = 0;
}
struct hostentry *
rt_get_hostentry(rtable *tab, ip_addr a, ip_addr ll, rtable *dep)
{
  ip_addr link = ipa_zero(ll) ? a : ll;
  struct hostentry *he;

  if (!tab->hostcache)
    rt_init_hostcache(tab);

  u32 k = hc_hash(a, dep);
  struct hostcache *hc = tab->hostcache;
  for (he = hc->hash_table[k >> hc->hash_shift]; he != NULL; he = he->next)
    if (ipa_equal(he->addr, a) && ipa_equal(he->link, link) && (he->tab == dep))
      return he;

  he = hc_new_hostentry(hc, tab->rp, a, link, dep, k);
  rt_update_hostentry(tab, he);
  return he;
}
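
/*
 * Usage sketch (hedged: this mirrors the rta_set_recursive_next_hop()
 * helper that protocols such as BGP use; shown here only as an
 * illustration): a recursive route destined for table @dep resolves its
 * gateway @gw through table @tab by attaching a hostentry to its
 * attributes.
 */
static inline void
example_set_recursive_next_hop(rtable *dep, rta *a, rtable *tab,
                               ip_addr gw, ip_addr ll, mpls_label_stack *mls)
{
  rta_apply_hostentry(a, rt_get_hostentry(tab, gw, ll, dep), mls);
}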
/*
 *  Documentation for functions declared inline in route.h
 */
#if 0

/**
 * net_find - find a network entry
 * @tab: a routing table
 * @addr: address of the network
 *
 * net_find() looks up the given network in routing table @tab and
 * returns a pointer to its &net entry or %NULL if no such network
 * exists.
 */
static inline net *net_find(rtable *tab, net_addr *addr)
{ DUMMY; }

/**
 * net_get - obtain a network entry
 * @tab: a routing table
 * @addr: address of the network
 *
 * net_get() looks up the given network in routing table @tab and
 * returns a pointer to its &net entry. If no such entry exists, it's
 * created.
 */
static inline net *net_get(rtable *tab, net_addr *addr)
{ DUMMY; }

/**
 * rte_cow - copy a route for writing
 * @r: a route entry to be copied
 *
 * rte_cow() takes a &rte and prepares it for modification. The exact action
 * taken depends on the flags of the &rte -- if it's a temporary entry, it's
 * just returned unchanged, else a new temporary entry with the same contents
 * is created.
 *
 * The primary use of this function is inside the filter machinery -- when
 * a filter wants to modify &rte contents (to change the preference or to
 * attach another set of attributes), it must ensure that the &rte is not
 * shared with anyone else (and especially that it isn't stored in any routing
 * table).
 *
 * Result: a pointer to the new writable &rte.
 */
static inline rte * rte_cow(rte *r)
{ DUMMY; }

#endif
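
/*
 * Usage sketch (hypothetical, not part of the original file): code that
 * wants to scribble on a possibly-shared &rte first detaches it with
 * rte_cow(). Setting a bit in the protocol-specific pflags is used here
 * only as an example of a cheap mutation.
 */
static inline rte *
example_modify_route(rte *r)
{
  rte *w = rte_cow(r);	/* no-op if r is already a private temporary */
  w->pflags |= 1;	/* now safe: w is not stored in any routing table */
  return w;
}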