#ifndef _BIRD_LIB_ROUTE_H_
#define _BIRD_LIB_ROUTE_H_
++#undef RT_SOURCE_DEBUG
++
#include "lib/type.h"
+#include "lib/rcu.h"
+#include "lib/hash.h"
+#include "lib/event.h"
struct network;
struct proto;
struct rte_src {
struct rte_src *next; /* Hash chain */
- struct proto *proto; /* Protocol the source is based on */
+ struct rte_owner *owner; /* Route source owner */
u32 private_id; /* Private ID, assigned by the protocol */
u32 global_id; /* Globally unique ID of the source */
- unsigned uc; /* Use count */
+ _Atomic u64 uc; /* Use count */
+};
+
+struct rte_owner_class {
+ void (*get_route_info)(struct rte *, byte *buf); /* Get route information (for `show route' command) */
+ int (*rte_better)(struct rte *, struct rte *);
+ int (*rte_mergable)(struct rte *, struct rte *);
+ u32 (*rte_igp_metric)(const rte *);
+};
+
+struct rte_owner {
+ struct rte_owner_class *class;
+ int (*rte_recalculate)(struct rtable *, struct network *, struct rte *, struct rte *, struct rte *);
+ HASH(struct rte_src) hash;
+ const char *name;
+ u32 hash_key;
+ u32 uc;
+ event_list *list;
+ event *prune;
+ event *stop;
};
+DEFINE_DOMAIN(attrs);
+extern DOMAIN(attrs) attrs_domain;
+
+#define RTA_LOCK LOCK_DOMAIN(attrs, attrs_domain)
+#define RTA_UNLOCK UNLOCK_DOMAIN(attrs, attrs_domain)
+
+#define RTE_SRC_PU_SHIFT 44
+#define RTE_SRC_IN_PROGRESS (1ULL << RTE_SRC_PU_SHIFT)
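/* For illustration (a sketch, not part of the patch): a combined counter
 * value uc splits as
 *
 *   u64 pending = uc >> RTE_SRC_PU_SHIFT;           // unlocks in progress (top 20 bits)
 *   u64 count   = uc & (RTE_SRC_IN_PROGRESS - 1);   // actual usecount (bottom 44 bits)
 */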
+
+/* Get a route source. This also locks the source; the caller has to unlock it
+ * after the route has been propagated. */
+struct rte_src *rt_get_source_o(struct rte_owner *o, u32 id);
+#define rt_get_source(p, id) rt_get_source_o(&(p)->sources, (id))
-struct rte_src *rt_find_source(struct proto *p, u32 id);
-struct rte_src *rt_get_source(struct proto *p, u32 id);
struct rte_src *rt_find_source_global(u32 id);
-static inline void rt_lock_source(struct rte_src *src) { src->uc++; }
-static inline void rt_unlock_source(struct rte_src *src) { src->uc--; }
-void rt_prune_sources(void);
+
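/* A minimal usage sketch, assuming a hypothetical protocol `p` that embeds a
 * `struct rte_owner sources`, as the rt_get_source() macro above expects:
 *
 *   struct rte_src *src = rt_get_source(p, private_id); // found or created, locked
 *   ...attach src to a route and propagate it...
 *   rt_unlock_source(src); // drop the reference taken by rt_get_source()
 */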
++#ifdef RT_SOURCE_DEBUG
++#define rt_lock_source _rt_lock_source_internal
++#define rt_unlock_source _rt_unlock_source_internal
++#endif
++
+static inline void rt_lock_source(struct rte_src *src)
+{
+  /* Locking a source is trivial; somebody already holds it, so we just
+   * increase the use count. Nothing can be freed out from under us. */
+ u64 uc = atomic_fetch_add_explicit(&src->uc, 1, memory_order_acq_rel);
+ ASSERT_DIE(uc > 0);
+}
+
+static inline void rt_unlock_source(struct rte_src *src)
+{
+  /* Unlocking is tricky. We do it locklessly, so the prune event may be
+   * running at the same time. Therefore, if this unlock takes the usecount to
+   * zero, it must be the very last thing we do in this routine; otherwise the
+   * prune routine could find the usecount zeroed and free the source
+   * prematurely.
+ *
+ * The usecount is split into two parts:
+ * the top 20 bits are an in-progress indicator
+ * the bottom 44 bits keep the actual usecount.
+ *
+   * Therefore at most ~1M writers can simultaneously unlock the same source,
+   * while at most ~17T different routes can reference it. Both limits are
+   * insanely high from the 2022 point of view. Let's suppose that by the time
+   * 17T routes or 1M writers become real, we'll also have 128-bit atomic
+   * variables in the C standard. */
+
+ /* First, we push the in-progress indicator */
+ u64 uc = atomic_fetch_add_explicit(&src->uc, RTE_SRC_IN_PROGRESS, memory_order_acq_rel);
+
+  /* Then we split the value into its parts. Remember, fetch_add returned the
+   * value from before the addition, hence the +1 to count our own unlock. */
+ u64 pending = (uc >> RTE_SRC_PU_SHIFT) + 1;
+ uc &= RTE_SRC_IN_PROGRESS - 1;
+
+  /* We enter an RCU critical section to make the prune event wait until we
+   * finish here, in the rare case we get preempted. */
+ rcu_read_lock();
+
+ /* Obviously, there can't be more pending unlocks than the usecount itself */
+ if (uc == pending)
+ /* If we're the last unlocker, schedule the owner's prune event */
+ ev_send(src->owner->list, src->owner->prune);
+ else
+ ASSERT_DIE(uc > pending);
+
+ /* And now, finally, simultaneously pop the in-progress indicator and the
+ * usecount, possibly allowing the source pruning routine to free this structure */
+ atomic_fetch_sub_explicit(&src->uc, RTE_SRC_IN_PROGRESS + 1, memory_order_acq_rel);
+
+  /* ... and to reduce the load a bit, the source pruning routine would rather
+   * wait for RCU synchronization than spin in a busy loop. */
+ rcu_read_unlock();
+}
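/* For orientation, a sketch of the matching prune side; the hash id (RSH) and
 * the src_free() helper are hypothetical, and the real prune event added by
 * this patch may differ:
 *
 *   static void example_prune_one(struct rte_owner *o, struct rte_src *src)
 *   {
 *     // Zero means no users left and no unlock in flight (both bit fields clear)
 *     if (atomic_load_explicit(&src->uc, memory_order_acquire))
 *       return;
 *
 *     HASH_REMOVE(o->hash, RSH, src); // hypothetical hash accessor id
 *     synchronize_rcu();              // wait out unlockers preempted inside
 *                                     // their rcu_read_lock() section
 *     src_free(src);                  // hypothetical deallocation
 *   }
 */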
+
++#ifdef RT_SOURCE_DEBUG
++#undef rt_lock_source
++#undef rt_unlock_source
++
++#define rt_lock_source(x) ( log(L_INFO "Lock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_lock_source_internal(x) )
++#define rt_unlock_source(x) ( log(L_INFO "Unlock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_unlock_source_internal(x) )
++#endif
++
+void rt_init_sources(struct rte_owner *, const char *name, event_list *list);
+void rt_destroy_sources(struct rte_owner *, event *);
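/* Expected owner lifecycle, sketched with illustrative names:
 *
 *   rt_init_sources(&p->sources, p->name, my_list);  // before any rt_get_source()
 *   ...
 *   rt_destroy_sources(&p->sources, my_stop_event);  // my_stop_event presumably
 *                                                    // fires once all sources are gone
 */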
/*
* Route Attributes
c->prefix_slab = alen ? sl_new(c->pool, sizeof(struct bgp_prefix) + alen) : NULL;
}
- void
- bgp_free_prefix_table(struct bgp_channel *c)
- {
- HASH_FREE(c->prefix_hash);
-
- rfree(c->prefix_slab);
- c->prefix_slab = NULL;
- }
-
static struct bgp_prefix *
- bgp_get_prefix(struct bgp_channel *c, const net_addr *net, struct rte_src *src)
-bgp_get_prefix(struct bgp_pending_tx *c, const net_addr *net, u32 path_id, int add_path_tx)
++bgp_get_prefix(struct bgp_pending_tx *c, const net_addr *net, struct rte_src *src, int add_path_tx)
{
- u32 path_id_hash = c->add_path_tx ? path_id : 0;
+ u32 path_id = src->global_id;
+ u32 path_id_hash = add_path_tx ? path_id : 0;
/* We must use a different hash function than the rtable */
u32 hash = u32_hash(net_hash(net) ^ u32_hash(path_id_hash));
struct bgp_prefix *px = HASH_FIND(c->prefix_hash, PXH, net, path_id_hash, hash);
/* Prefixes belonging to the withdraw bucket are always freed */
}
- bgp_free_prefix(c, px);
+ bgp_free_prefix(c->ptx, px);
+ }
+
+ static void
+ bgp_pending_tx_rfree(resource *r)
+ {
- UNUSED struct bgp_pending_tx *ptx = SKIP_BACK(struct bgp_pending_tx, r, r);
++ struct bgp_pending_tx *ptx = SKIP_BACK(struct bgp_pending_tx, r, r);
+
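++  /* Presumably each queued prefix holds a lock on its route source (taken
++   * when the prefix was created in bgp_get_prefix() above), so we drop
++   * exactly one source lock per remaining prefix here. */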
- /* Do nothing for now. */
++ HASH_WALK(ptx->prefix_hash, next, n)
++ rt_unlock_source(rt_find_source_global(n->path_id));
++ HASH_WALK_END;
+ }
+
+ static void bgp_pending_tx_dump(resource *r UNUSED) { debug("\n"); }
+
+ static struct resclass bgp_pending_tx_class = {
+ .name = "BGP Pending TX",
+ .size = sizeof(struct bgp_pending_tx),
+ .free = bgp_pending_tx_rfree,
+ .dump = bgp_pending_tx_dump,
+ };
+
+ void
+ bgp_init_pending_tx(struct bgp_channel *c)
+ {
+ ASSERT_DIE(!c->ptx);
+
+ pool *p = rp_new(c->pool, "BGP Pending TX");
+ c->ptx = ralloc(p, &bgp_pending_tx_class);
+ c->ptx->pool = p;
+
+ bgp_init_bucket_table(c->ptx);
+ bgp_init_prefix_table(c);
+ }
+
+ void
+ bgp_free_pending_tx(struct bgp_channel *c)
+ {
+ ASSERT_DIE(c->ptx);
+ ASSERT_DIE(c->ptx->pool);
+
+ rfree(c->ptx->pool);
+ c->ptx = NULL;
}
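/* Design note: the pending-TX structure is itself a resource allocated from
 * its own pool, so the single rfree(c->ptx->pool) above both runs the
 * bgp_pending_tx_rfree() hook (releasing the source locks held by queued
 * prefixes) and returns all bucket and prefix memory in one sweep. */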
log(L_ERR "%s: Invalid route %N withdrawn", p->p.name, n);
/* If attributes are invalid, we fall back to withdraw */
- buck = attrs ? bgp_get_bucket(c, attrs) : bgp_get_withdraw_bucket(c);
+ buck = attrs ? bgp_get_bucket(c->ptx, attrs) : bgp_get_withdraw_bucket(c->ptx);
- path = new->src->global_id;
+ path = new->src;
}
else
{
- buck = bgp_get_withdraw_bucket(c);
+ buck = bgp_get_withdraw_bucket(c->ptx);
- path = old->src->global_id;
+ path = old->src;
}
- if (bgp_update_prefix(c, bgp_get_prefix(c, n, path), buck))
+ if (bgp_update_prefix(c, bgp_get_prefix(c->ptx, n, path, c->add_path_tx), buck))
bgp_schedule_packet(p->conn, c, PKT_UPDATE);
}