Merge commit '97476e00' into thread-next

author Maria Matejka <mq@ucw.cz>

Wed, 3 Aug 2022 12:07:53 +0000 (14:07 +0200)

committer Maria Matejka <mq@ucw.cz>

Wed, 3 Aug 2022 12:07:53 +0000 (14:07 +0200)
author Maria Matejka <mq@ucw.cz>
Wed, 3 Aug 2022 12:07:53 +0000 (14:07 +0200)
committer Maria Matejka <mq@ucw.cz>
Wed, 3 Aug 2022 12:07:53 +0000 (14:07 +0200)
diff --cc lib/route.h

index cf3c70ba924f36d3aeed76ca19851ebd15bad699,2d691215643f9079a6d7044d7ee12e865a85571a..97e2e0532625cf8f958e08f86623665264d97d64
--- 1/lib/route.h
--- 2/lib/route.h
+++ b/lib/route.h
@@@ -10,10 -10,7 +10,12 @@@
   #ifndef _BIRD_LIB_ROUTE_H_
   #define _BIRD_LIB_ROUTE_H_
   
++#undef RT_SOURCE_DEBUG
++
   #include "lib/type.h"
+ +#include "lib/rcu.h"
+ +#include "lib/hash.h"
+ +#include "lib/event.h"
   
   struct network;
   struct proto;
@@@ -46,101 -43,19 +48,114 @@@ static inline int rte_is_filtered(rte *
   
   struct rte_src {
     struct rte_src *next;                       /* Hash chain */
- -  struct proto *proto;                        /* Protocol the source is based on */
+ +  struct rte_owner *owner;            /* Route source owner */
     u32 private_id;                     /* Private ID, assigned by the protocol */
     u32 global_id;                      /* Globally unique ID of the source */
- -  unsigned uc;                                /* Use count */
+ +  _Atomic u64 uc;                     /* Use count */
+ +};
+ +
+ +struct rte_owner_class {
+ +  void (*get_route_info)(struct rte *, byte *buf); /* Get route information (for `show route' command) */
+ +  int (*rte_better)(struct rte *, struct rte *);
+ +  int (*rte_mergable)(struct rte *, struct rte *);
+ +  u32 (*rte_igp_metric)(const rte *);
+ +};
+ +
+ +struct rte_owner {
+ +  struct rte_owner_class *class;
+ +  int (*rte_recalculate)(struct rtable *, struct network *, struct rte *, struct rte *, struct rte *);
+ +  HASH(struct rte_src) hash;
+ +  const char *name;
+ +  u32 hash_key;
+ +  u32 uc;
+ +  event_list *list;
+ +  event *prune;
+ +  event *stop;
   };
   
+ +DEFINE_DOMAIN(attrs);
+ +extern DOMAIN(attrs) attrs_domain;
+ +
+ +#define RTA_LOCK       LOCK_DOMAIN(attrs, attrs_domain)
+ +#define RTA_UNLOCK     UNLOCK_DOMAIN(attrs, attrs_domain)
+ +
+ +#define RTE_SRC_PU_SHIFT      44
+ +#define RTE_SRC_IN_PROGRESS   (1ULL << RTE_SRC_PU_SHIFT)
+ +
+ +/* Get a route source. This also locks the source, therefore the caller has to
+ + * unlock the source after the route has been propagated. */
+ +struct rte_src *rt_get_source_o(struct rte_owner *o, u32 id);
+ +#define rt_get_source(p, id)  rt_get_source_o(&(p)->sources, (id))
   
- -struct rte_src *rt_find_source(struct proto *p, u32 id);
- -struct rte_src *rt_get_source(struct proto *p, u32 id);
   struct rte_src *rt_find_source_global(u32 id);
- -static inline void rt_lock_source(struct rte_src *src) { src->uc++; }
- -static inline void rt_unlock_source(struct rte_src *src) { src->uc--; }
- -void rt_prune_sources(void);
+ +
++#ifdef RT_SOURCE_DEBUG
++#define rt_lock_source _rt_lock_source_internal
++#define rt_unlock_source _rt_unlock_source_internal
++#endif
++
+ +static inline void rt_lock_source(struct rte_src *src)
+ +{
+ +  /* Locking a source is trivial; somebody already holds it so we just increase
+ +   * the use count. Nothing can be freed underneath our hands. */
+ +  u64 uc = atomic_fetch_add_explicit(&src->uc, 1, memory_order_acq_rel);
+ +  ASSERT_DIE(uc > 0);
+ +}
+ +
+ +static inline void rt_unlock_source(struct rte_src *src)
+ +{
+ +  /* Unlocking is tricky. We do it lockless so at the same time, the prune
+ +   * event may be running, therefore if the unlock gets us to zero, it must be
+ +   * the last thing in this routine, otherwise the prune routine may find the
+ +   * source's usecount zeroed, freeing it prematurely.
+ +   *
+ +   * The usecount is split into two parts:
+ +   * the top 20 bits are an in-progress indicator
+ +   * the bottom 44 bits keep the actual usecount.
+ +   *
+ +   * Therefore at most 1 million of writers can simultaneously unlock the same
+ +   * source, while at most ~17T different routes can reference it. Both limits
+ +   * are insanely high from the 2022 point of view. Let's suppose that when 17T
+ +   * routes or 1M writers get real, we get also 128bit atomic variables in the
+ +   * C norm. */
+ +
+ +  /* First, we push the in-progress indicator */
+ +  u64 uc = atomic_fetch_add_explicit(&src->uc, RTE_SRC_IN_PROGRESS, memory_order_acq_rel);
+ +
+ +  /* Then we split the indicator to its parts. Remember, we got the value before the operation happened. */
+ +  u64 pending = (uc >> RTE_SRC_PU_SHIFT) + 1;
+ +  uc &= RTE_SRC_IN_PROGRESS - 1;
+ +
+ +  /* We per-use the RCU critical section indicator to make the prune event wait
+ +   * until we finish here in the rare case we get preempted. */
+ +  rcu_read_lock();
+ +
+ +  /* Obviously, there can't be more pending unlocks than the usecount itself */
+ +  if (uc == pending)
+ +    /* If we're the last unlocker, schedule the owner's prune event */
+ +    ev_send(src->owner->list, src->owner->prune);
+ +  else
+ +    ASSERT_DIE(uc > pending);
+ +
+ +  /* And now, finally, simultaneously pop the in-progress indicator and the
+ +   * usecount, possibly allowing the source pruning routine to free this structure */
+ +  atomic_fetch_sub_explicit(&src->uc, RTE_SRC_IN_PROGRESS + 1, memory_order_acq_rel);
+ +
+ +  /* ... and to reduce the load a bit, the source pruning routine will better wait for
+ +   * RCU synchronization instead of a busy loop. */
+ +  rcu_read_unlock();
+ +}
+ +
++#ifdef RT_SOURCE_DEBUG
++#undef rt_lock_source
++#undef rt_unlock_source
++
++#define rt_lock_source(x) ( log(L_INFO "Lock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_lock_source_internal(x) )
++#define rt_unlock_source(x) ( log(L_INFO "Unlock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_unlock_source_internal(x) )
++#endif
++
+ +void rt_init_sources(struct rte_owner *, const char *name, event_list *list);
+ +void rt_destroy_sources(struct rte_owner *, event *);
   
   /*
    *    Route Attributes
diff --cc proto/bgp/attrs.c

index e96b175da11a56a1ff99cbc91771669b9031f05d,2ff3f3f399dbb5fd8331297bc4a60a560f39fe63..a1f5791a6ed3197cdd0ad4d3638f98587030d47f
--- 1/proto/bgp/attrs.c
--- 2/proto/bgp/attrs.c
+++ b/proto/bgp/attrs.c
@@@ -1661,20 -1650,10 +1650,11 @@@ bgp_init_prefix_table(struct bgp_channe
     c->prefix_slab = alen ? sl_new(c->pool, sizeof(struct bgp_prefix) + alen) : NULL;
   }
   
- void
- bgp_free_prefix_table(struct bgp_channel *c)
- {
-   HASH_FREE(c->prefix_hash);
- 
-   rfree(c->prefix_slab);
-   c->prefix_slab = NULL;
- }
- 
   static struct bgp_prefix *
- bgp_get_prefix(struct bgp_channel *c, const net_addr *net, struct rte_src *src)
- -bgp_get_prefix(struct bgp_pending_tx *c, const net_addr *net, u32 path_id, int add_path_tx)
++bgp_get_prefix(struct bgp_pending_tx *c, const net_addr *net, struct rte_src *src, int add_path_tx)
   {
-   u32 path_id_hash = c->add_path_tx ? path_id : 0;
+ +  u32 path_id = src->global_id;
+   u32 path_id_hash = add_path_tx ? path_id : 0;
     /* We must use a different hash function than the rtable */
     u32 hash = u32_hash(net_hash(net) ^ u32_hash(path_id_hash));
     struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id_hash, hash);
@@@ -1794,7 -1771,47 +1775,49 @@@ bgp_done_prefix(struct bgp_channel *c, 
       /* Prefixes belonging to the withdraw bucket are freed always */
     }
   
-   bgp_free_prefix(c, px);
+   bgp_free_prefix(c->ptx, px);
+ }
+ 
+ static void
+ bgp_pending_tx_rfree(resource *r)
+ {
- -  UNUSED struct bgp_pending_tx *ptx = SKIP_BACK(struct bgp_pending_tx, r, r);
++  struct bgp_pending_tx *ptx = SKIP_BACK(struct bgp_pending_tx, r, r);
+ 
- -  /* Do nothing for now. */
++  HASH_WALK(ptx->prefix_hash, next, n)
++    rt_unlock_source(rt_find_source_global(n->path_id));
++  HASH_WALK_END;
+ }
+ 
+ static void bgp_pending_tx_dump(resource *r UNUSED) { debug("\n"); }
+ 
+ static struct resclass bgp_pending_tx_class = {
+   .name = "BGP Pending TX",
+   .size = sizeof(struct bgp_pending_tx),
+   .free = bgp_pending_tx_rfree,
+   .dump = bgp_pending_tx_dump,
+ };
+ 
+ void
+ bgp_init_pending_tx(struct bgp_channel *c)
+ {
+   ASSERT_DIE(!c->ptx);
+ 
+   pool *p = rp_new(c->pool, "BGP Pending TX");
+   c->ptx = ralloc(p, &bgp_pending_tx_class);
+   c->ptx->pool = p;
+ 
+   bgp_init_bucket_table(c->ptx);
+   bgp_init_prefix_table(c);
+ }
+ 
+ void
+ bgp_free_pending_tx(struct bgp_channel *c)
+ {
+   ASSERT_DIE(c->ptx);
+   ASSERT_DIE(c->ptx->pool);
+ 
+   rfree(c->ptx->pool);
+   c->ptx = NULL;
   }
   
   
@@@ -2134,16 -2154,16 +2158,16 @@@ bgp_rt_notify(struct proto *P, struct c
         log(L_ERR "%s: Invalid route %N withdrawn", p->p.name, n);
   
       /* If attributes are invalid, we fail back to withdraw */
-     buck = attrs ? bgp_get_bucket(c, attrs) : bgp_get_withdraw_bucket(c);
+     buck = attrs ? bgp_get_bucket(c->ptx, attrs) : bgp_get_withdraw_bucket(c->ptx);
- -    path = new->src->global_id;
+ +    path = new->src;
     }
     else
     {
-     buck = bgp_get_withdraw_bucket(c);
+     buck = bgp_get_withdraw_bucket(c->ptx);
- -    path = old->src->global_id;
+ +    path = old->src;
     }
   
-   if (bgp_update_prefix(c, bgp_get_prefix(c, n, path), buck))
+   if (bgp_update_prefix(c, bgp_get_prefix(c->ptx, n, path, c->add_path_tx), buck))
       bgp_schedule_packet(p->conn, c, PKT_UPDATE);
   }
   
diff --cc proto/bgp/bgp.c

index d240112c623d15cac424ae0fd71184f6b0683449,e6d3f12548d5c20ce337df154a015783306b910a..65673e3791eacec1e0bd93a214312b009d2b890d
--- 1/proto/bgp/bgp.c
--- 2/proto/bgp/bgp.c
+++ b/proto/bgp/bgp.c
@@@ -518,6 -518,6 +518,12 @@@ bgp_stop(struct bgp_proto *p, int subco
     p->uncork_ev->data = NULL;
     bgp_graceful_close_conn(&p->outgoing_conn, subcode, data, len);
     bgp_graceful_close_conn(&p->incoming_conn, subcode, data, len);
++
++  struct bgp_channel *c;
++  WALK_LIST(c, p->p.channels)
++    if (c->ptx)
++      bgp_free_pending_tx(c);
++
     ev_schedule(p->event);
   }
   
diff --cc proto/bgp/bgp.h
Simple merge
diff --cc proto/bgp/packets.c
Simple merge
author	Maria Matejka <mq@ucw.cz>
	Wed, 3 Aug 2022 12:07:53 +0000 (14:07 +0200)
committer	Maria Matejka <mq@ucw.cz>
	Wed, 3 Aug 2022 12:07:53 +0000 (14:07 +0200)
		1	2
lib/route.h	patch \|	diff1 \|	diff2 \|	blob \| history
proto/bgp/attrs.c	patch \|	diff1 \|	diff2 \|	blob \| history
proto/bgp/bgp.c	patch \|	diff1 \|	diff2 \|	blob \| history
proto/bgp/bgp.h	patch \|	diff1 \|	diff2 \|	blob \| history
proto/bgp/packets.c	patch \|	diff1 \|	diff2 \|	blob \| history