Also converted all uses of _Bool to bool.
CF_DEFINES
- static _Bool this_sadr_from_hack_active;
++static bool this_sadr_from_hack_active;
+
static void
check_u16(uint val)
{
--- /dev/null
- _Bool task_still_in_limit(void);
- _Bool task_before_halftime(void);
+/*
+ * BIRD -- I/O and event loop
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#ifndef _BIRD_IO_LOOP_H_
+#define _BIRD_IO_LOOP_H_
+
+#include "nest/bird.h"
+#include "lib/lists.h"
+#include "lib/locking.h"
+#include "lib/resource.h"
+#include "lib/event.h"
+#include "lib/socket.h"
+
+extern struct birdloop main_birdloop;
+
+/* Currently running birdloop */
+extern _Thread_local struct birdloop *this_birdloop;
+
+/* Check that the task has enough time to do a bit more */
- _Bool birdloop_inside(struct birdloop *loop);
++bool task_still_in_limit(void);
++bool task_before_halftime(void);
+
+#define MAYBE_DEFER_TASK(target, event, fmt, args...) do { \
+ if (!task_still_in_limit()) { \
+ if (atomic_load_explicit(&global_runtime, memory_order_relaxed)->latency_debug & DL_SCHEDULING) \
+ log(L_TRACE "Deferring " fmt, ##args); \
+ return ev_send(target, event); \
+ } } while (0)
+
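+/* A minimal usage sketch of MAYBE_DEFER_TASK() (illustrative only; my_worker,
+ * my_target and my_event are hypothetical). A long-running event hook checks
+ * its time budget and, when it runs out, re-schedules itself instead of
+ * blocking the thread; my_event's hook is assumed to be my_worker itself.
+ *
+ *   static void my_worker(void *data)
+ *   {
+ *     while (more_work_to_do(data))
+ *     {
+ *       MAYBE_DEFER_TASK(my_target, my_event, "worker %p", data);
+ *       do_one_chunk(data);
+ *     }
+ *   }
+ */
+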
+/* Start a new birdloop owned by given pool and domain */
+struct birdloop *birdloop_new(pool *p, uint order, btime max_latency, const char *fmt, ...);
+
+/* Stop the loop. At the end, the @stopped callback is called unlocked in tail
+ * position to finish cleanup. Run birdloop_free() from that callback to free
+ * the loop itself. */
+void birdloop_stop(struct birdloop *loop, void (*stopped)(void *data), void *data);
+void birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *data);
+void birdloop_free(struct birdloop *loop);
+
+/* Run this event in this thread's priority event list */
+void ev_send_this_thread(event *e);
+
+/* Get birdloop's time heap */
+struct timeloop *birdloop_time_loop(struct birdloop *loop);
+#define birdloop_domain(l) (birdloop_time_loop((l))->domain)
+
+/* Get birdloop's pool */
+pool *birdloop_pool(struct birdloop *loop);
+
+/* Enter and exit the birdloop */
+void birdloop_enter(struct birdloop *loop);
+void birdloop_leave(struct birdloop *loop);
+
++bool birdloop_inside(struct birdloop *loop);
+
+void birdloop_mask_wakeups(struct birdloop *loop);
+void birdloop_unmask_wakeups(struct birdloop *loop);
+
+void birdloop_link(struct birdloop *loop);
+void birdloop_unlink(struct birdloop *loop);
+
+void birdloop_ping(struct birdloop *loop);
+
+/* Setup sockets */
+void birdloop_add_socket(struct birdloop *, struct birdsock *);
+void birdloop_remove_socket(struct birdloop *, struct birdsock *);
+
+void birdloop_init(void);
+
+#endif /* _BIRD_IO_LOOP_H_ */
#define EMPTY_LIST(list) (!(list).head->next)
- static inline _Bool
++static inline bool
+enlisted(node *n)
+{
+ switch ((!!n->next) + (!!n->prev))
+ {
+ case 0: return 0;
+ case 2: return 1;
+ case 1: bug("Garbled event list node");
+ }
+
+ bug("Maths is broken. And you should see a new heaven and a new earth: for the first heaven and the first earth had been passed away.");
+}
#ifndef _BIRD_LISTS_C_
#define LIST_INLINE static inline
--- /dev/null
- _Bool f = 0;
+/*
+ * BIRD Library -- Generic lock-free structures
+ *
+ * (c) 2023--2024 Maria Matejka <mq@jmq.cz>
+ * (c) 2023--2024 CZ.NIC, z.s.p.o.
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#include "lib/birdlib.h"
+#include "lib/lockfree.h"
+
+#define LOCAL_DEBUG
+
+void lfuc_unlock_deferred(struct deferred_call *dc)
+{
+ SKIP_BACK_DECLARE(struct lfuc_unlock_queue_item, luqi, dc, dc);
+ lfuc_unlock_immediately(luqi->c, luqi->el, luqi->ev);
+}
+
+#if 0
+#define lfjour_debug(...) log(L_TRACE __VA_ARGS__)
+#define lfjour_debug_detailed(...) log(L_TRACE __VA_ARGS__)
+#elif 0
+#define lfjour_debug(...) log(L_TRACE __VA_ARGS__)
+#define lfjour_debug_detailed(...)
+#else
+#define lfjour_debug(...)
+#define lfjour_debug_detailed(...)
+#endif
+
+#define LBI(j, b, p) ((struct lfjour_item *)(((void *) (b)->_block) + ((j)->item_size * (p))))
+#define LBP(j, b, i) ({ \
+ off_t off = ((void *) (i)) - ((void *) (b)->_block); \
+ u32 s = (j)->item_size; \
+ ASSERT_DIE(off < page_size); \
+ ASSERT_DIE((off % s) == 0); \
+ off / s; \
+ })
+
+struct lfjour_item *
+lfjour_push_prepare(struct lfjour *j)
+{
+ ASSERT_DIE(!j->domain || DG_IS_LOCKED(j->domain));
+ ASSERT_DIE(!j->open);
+
+ if (EMPTY_TLIST(lfjour_block, &j->pending) &&
+ EMPTY_TLIST(lfjour_recipient, &j->recipients))
+ return NULL;
+
+ struct lfjour_block *block = NULL;
+ u32 end = 0;
+
+ if (!EMPTY_TLIST(lfjour_block, &j->pending))
+ {
+ block = j->pending.last;
+ end = atomic_load_explicit(&block->end, memory_order_relaxed);
+ if (end >= j->item_count)
+ {
+ ASSERT_DIE(end == j->item_count);
+ block = NULL;
+ end = 0;
+ }
+ }
+
+ if (!block)
+ {
+ block = alloc_page();
+ lfjour_debug("lfjour(%p)_push_prepare: allocating block %p", j, block);
+ *block = (struct lfjour_block) {};
+ lfjour_block_add_tail(&j->pending, block);
+ }
+
+ struct lfjour_item *i = LBI(j, block, end);
+ *i = (struct lfjour_item) {
+ .seq = j->next_seq++,
+ };
+
+ return j->open = i;
+}
+
+void
+lfjour_push_commit(struct lfjour *j)
+{
+ ASSERT_DIE(!j->domain || DG_IS_LOCKED(j->domain));
+ ASSERT_DIE(j->open);
+ struct lfjour_block *b = PAGE_HEAD(j->open);
+ ASSERT_DIE(b == j->pending.last);
+
+ lfjour_debug("lfjour(%p)_push_commit of %p, seq=%lu", j, j->open, j->open->seq);
+
+ u32 end = atomic_fetch_add_explicit(&b->end, 1, memory_order_release);
+ ASSERT_DIE(j->open == LBI(j, b, end));
+
+ if (end == 0)
+ {
+ struct lfjour_block *prev = b->n.prev;
++ bool f = 0;
+ if (prev)
+ ASSERT_DIE(atomic_compare_exchange_strong_explicit(&prev->not_last, &f, 1,
+ memory_order_release, memory_order_relaxed));
+ }
+
+ /* Store the first item to announce (only if this is actually the first one). */
+ struct lfjour_item *null_item = NULL;
+ if (atomic_compare_exchange_strong_explicit(
+ &j->first, &null_item, j->open,
+ memory_order_acq_rel, memory_order_relaxed))
+ {
+ lfjour_debug("lfjour(%p) first set", j);
+ }
+
+ j->open = NULL;
+
+ if (!ev_active(&j->announce_kick_event))
+ ev_send_loop(j->loop, &j->announce_kick_event);
+}
+
+static struct lfjour_item *
+lfjour_get_next(struct lfjour *j, const struct lfjour_item *last)
+{
+ /* This is lockless, no domain checks. */
+ if (!last)
+ {
+ struct lfjour_item *first = atomic_load_explicit(&j->first, memory_order_acquire);
+ return first;
+ }
+
+ struct lfjour_block *block = PAGE_HEAD(last);
+ ASSERT_DIE(block);
+ u32 end = atomic_load_explicit(&block->end, memory_order_acquire);
+ u32 pos = LBP(j, block, last);
+ ASSERT_DIE(pos < end);
+
+ /* Next is in the same block. */
+ if (++pos < end)
+ return LBI(j, block, pos);
+
+ /* There is another block. */
+ if (atomic_load_explicit(&block->not_last, memory_order_acquire))
+ {
+ /* To avoid rare race conditions, we shall check the current block end once again */
+ u32 new_end = atomic_load_explicit(&block->end, memory_order_acquire);
+ ASSERT_DIE(new_end >= end);
+ if (new_end > end)
+ return LBI(j, block, pos);
+
+    /* Nothing new appeared in this block, so let's move on to the next one.
+     * This is OK to do non-atomically because of the not_last flag. */
+ block = block->n.next;
+ return LBI(j, block, 0);
+ }
+
+ /* There is nothing more. */
+ return NULL;
+}
+
+struct lfjour_item *
+lfjour_get(struct lfjour_recipient *r)
+{
+ struct lfjour *j = lfjour_of_recipient(r);
+
+ const struct lfjour_item *last = r->cur;
+ struct lfjour_item *next = NULL;
+
+ if (last)
+ next = lfjour_get_next(j, r->cur);
+ else
+ {
+    /* The last pointer may get cleaned up under our hands,
+     * so we indicate that we're using it by holding the RCU read lock. */
+
+ rcu_read_lock();
+ last = atomic_load_explicit(&r->last, memory_order_acquire);
+ next = lfjour_get_next(j, last);
+ rcu_read_unlock();
+ }
+
+ if (last)
+ {
+ lfjour_debug_detailed("lfjour(%p)_get(recipient=%p) returns %p, seq=%lu, last %p",
+ j, r, next, next ? next->seq : 0ULL, last);
+ }
+ else
+ {
+ lfjour_debug("lfjour(%p)_get(recipient=%p) returns %p, seq=%lu, clean",
+ j, r, next, next ? next->seq : 0ULL);
+ }
+
+ if (!next)
+ return NULL;
+
+ if (!r->first_holding_seq)
+ r->first_holding_seq = next->seq;
+
+ return r->cur = next;
+}
+
+void lfjour_release(struct lfjour_recipient *r, const struct lfjour_item *it)
+{
+ /* Find out what we actually released last */
+ rcu_read_lock();
+ const struct lfjour_item *last = atomic_load_explicit(&r->last, memory_order_acquire);
+ struct lfjour_block *last_block = last ? PAGE_HEAD(last) : NULL;
+ rcu_read_unlock();
+
+ /* This is lockless, no domain checks. */
+  ASSERT_DIE(r->cur);
+
+  struct lfjour *j = lfjour_of_recipient(r);
+
+  /* Partial or full release? */
+  ASSERT_DIE(r->first_holding_seq);
+  ASSERT_DIE(it->seq >= r->first_holding_seq);
+ if (it->seq < r->cur->seq)
+ {
+    lfjour_debug("lfjour(%p)_release(recipient=%p) of %p, partial up to seq=%lu",
+	j, r, it, it->seq);
+ r->first_holding_seq = it->seq + 1;
+ atomic_store_explicit(&r->last, it, memory_order_release);
+ return;
+ }
+
+ struct lfjour_block *block = PAGE_HEAD(r->cur);
+ u32 end = atomic_load_explicit(&block->end, memory_order_acquire);
+
+ u32 pos = LBP(j, block, r->cur);
+ ASSERT_DIE(pos < end);
+
+ /* Releasing this export for cleanup routine */
+ if (pos + 1 == end)
+ {
+ lfjour_debug("lfjour(%p)_release(recipient=%p) of %p, seq=%lu (end)",
+ j, r, r->cur, r->cur->seq);
+ }
+ else
+ {
+ lfjour_debug_detailed("lfjour(%p)_release(recipient=%p) of %p, seq=%lu (mid)",
+ j, r, r->cur, r->cur->seq);
+ }
+
+ atomic_store_explicit(&r->last, r->cur, memory_order_release);
+
+ /* The last block may be available to free */
+  if ((pos + 1 == end) || (last && (last_block != block)))
+ lfjour_schedule_cleanup(j);
+
+ r->first_holding_seq = 0;
+ r->cur = NULL;
+}
+
+void
+lfjour_announce_now(struct lfjour *j)
+{
+ ASSERT_DIE(birdloop_inside(j->loop));
+ settle_cancel(&j->announce_timer);
+ ev_postpone(&j->announce_kick_event);
+
+ if (EMPTY_TLIST(lfjour_recipient, &j->recipients))
+ return lfjour_schedule_cleanup(j);
+
+ WALK_TLIST(lfjour_recipient, r, &j->recipients)
+ if (r->event)
+ ev_send(r->target, r->event);
+}
+
+static void
+lfjour_announce_settle_hook(struct settle *s)
+{
+ return lfjour_announce_now(SKIP_BACK(struct lfjour, announce_timer, s));
+}
+
+static void
+lfjour_announce_kick_hook(void *_j)
+{
+ struct lfjour *j = _j;
+ settle_kick(&j->announce_timer, j->loop);
+}
+
+u64
+lfjour_pending_items(struct lfjour *j)
+{
+ ASSERT_DIE(!j->domain || DG_IS_LOCKED(j->domain));
+
+ struct lfjour_item *first = atomic_load_explicit(&j->first, memory_order_relaxed);
+ if (!first)
+ return 0;
+
+ ASSERT_DIE(j->next_seq > first->seq);
+ return j->next_seq - first->seq;
+}
+
+void
+lfjour_register(struct lfjour *j, struct lfjour_recipient *r)
+{
+ ASSERT_DIE(!j->domain || DG_IS_LOCKED(j->domain));
+ ASSERT_DIE(!r->event == !r->target);
+
+ atomic_store_explicit(&r->last, NULL, memory_order_relaxed);
+ ASSERT_DIE(!r->cur);
+
+ lfjour_recipient_add_tail(&j->recipients, r);
+}
+
+void
+lfjour_unregister(struct lfjour_recipient *r)
+{
+ struct lfjour *j = lfjour_of_recipient(r);
+ ASSERT_DIE(!j->domain || DG_IS_LOCKED(j->domain));
+
+ if (r->cur)
+ lfjour_release(r, r->cur);
+
+ lfjour_recipient_rem_node(&j->recipients, r);
+ lfjour_schedule_cleanup(j);
+}
+
+static inline void lfjour_cleanup_unlock_helper(struct domain_generic **dg)
+{
+ if (!*dg) return;
+ DG_UNLOCK(*dg);
+}
+
+static void
+lfjour_cleanup_hook(void *_j)
+{
+ struct lfjour *j = _j;
+
+ CLEANUP(lfjour_cleanup_unlock_helper) struct domain_generic *_locked = j->domain;
+ if (_locked) DG_LOCK(_locked);
+
+ u64 min_seq = ~((u64) 0);
+ const struct lfjour_item *last_item_to_free = NULL;
+ struct lfjour_item *first = atomic_load_explicit(&j->first, memory_order_acquire);
+
+ if (!first)
+ {
+ /* Nothing to cleanup, actually, just call the done callback */
+ ASSERT_DIE(EMPTY_TLIST(lfjour_block, &j->pending));
+ CALL(j->cleanup_done, j, 0, ~((u64) 0));
+ return;
+ }
+
+ WALK_TLIST(lfjour_recipient, r, &j->recipients)
+ {
+ const struct lfjour_item *last = atomic_load_explicit(&r->last, memory_order_acquire);
+
+ if (!last)
+ /* No last export means that the channel has exported nothing since last cleanup */
+ return;
+
+ else if (min_seq > last->seq)
+ {
+ min_seq = last->seq;
+ last_item_to_free = last;
+ }
+ }
+
+ /* Here we're sure that no receiver is going to use the first pointer soon.
+ * It is only used when the receiver's last pointer is NULL, which is avoided by the code above.
+ * Thus, we can just move the journal's first pointer forward. */
+ struct lfjour_item *next = last_item_to_free ? lfjour_get_next(j, last_item_to_free) : NULL;
+ atomic_store_explicit(&j->first, next, memory_order_release);
+
+ lfjour_debug("lfjour(%p) set first=%p (was %p)", j, next, first);
+
+ WALK_TLIST(lfjour_recipient, r, &j->recipients)
+ {
+ const struct lfjour_item *last = last_item_to_free;
+ /* This either succeeds if this item is the most-behind-one,
+ * or fails and gives us the actual last for debug output. */
+ if (atomic_compare_exchange_strong_explicit(
+ &r->last, &last, NULL,
+ memory_order_acq_rel, memory_order_acquire))
+ {
+ lfjour_debug("lfjour(%p)_cleanup(recipient=%p): store last=NULL", j, r);
+ }
+ else
+ {
+ lfjour_debug("lfjour(%p)_cleanup(recipient=%p): keep last=%p", j, r, last);
+ }
+ }
+
+ /* Now some recipients may have old last-pointers. We have to wait
+ * until they finish their routine, before we start cleaning up. */
+ synchronize_rcu();
+
+ u64 orig_first_seq = first->seq;
+
+ /* Now we do the actual cleanup */
+ while (first && (first->seq <= min_seq))
+ {
+ j->item_done(j, first);
+
+ /* Find next journal item */
+ struct lfjour_item *next = lfjour_get_next(j, first);
+ if (PAGE_HEAD(next) != PAGE_HEAD(first))
+ {
+ /* This was the last one in its block */
+ struct lfjour_block *block = PAGE_HEAD(first);
+ lfjour_debug("lfjour(%p)_cleanup: freeing block %p", j, block);
+ ASSERT_DIE(block == j->pending.first);
+
+ /* Free this block */
+ lfjour_block_rem_node(&j->pending, block);
+
+ /* Wait for possible pending readers of the block */
+ synchronize_rcu();
+
+ /* Now we can finally drop the block */
+#ifdef LOCAL_DEBUG
+ memset(block, 0xbe, page_size);
+#endif
+ free_page(block);
+
+ /* If no more blocks are remaining, we shall reset
+ * the sequence numbers */
+
+ if (EMPTY_TLIST(lfjour_block, &j->pending))
+ {
+ lfjour_debug("lfjour(%p)_cleanup: seq reset", j);
+ WALK_TLIST(lfjour_recipient, r, &j->recipients)
+ atomic_fetch_or_explicit(&r->recipient_flags, LFJOUR_R_SEQ_RESET, memory_order_acq_rel);
+
+ j->next_seq = 1;
+ }
+ }
+
+ /* And now move on to the next item */
+ first = next;
+ }
+
+ CALL(j->cleanup_done, j, orig_first_seq, first ? first->seq : ~((u64) 0));
+}
+
+void
+lfjour_init(struct lfjour *j, struct settle_config *scf)
+{
+ /* Expecting all other fields to be initialized to zeroes by the caller */
+ ASSERT_DIE(j->loop);
+ ASSERT_DIE(j->item_size >= sizeof(struct lfjour_item));
+
+ j->item_size = BIRD_CPU_ALIGN(j->item_size);
+ j->item_count = (page_size - sizeof(struct lfjour_block)) / j->item_size;
+
+ j->next_seq = 1;
+ j->announce_kick_event = (event) {
+ .hook = lfjour_announce_kick_hook,
+ .data = j,
+ };
+ j->announce_timer = SETTLE_INIT(scf, lfjour_announce_settle_hook, j);
+ j->cleanup_event = (event) {
+ .hook = lfjour_cleanup_hook,
+ .data = j,
+ };
+}
--- /dev/null
- static inline _Bool
+/*
+ * BIRD Library -- Generic lock-free structures
+ *
+ * (c) 2023--2024 Maria Matejka <mq@jmq.cz>
+ * (c) 2023--2024 CZ.NIC, z.s.p.o.
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#ifndef _BIRD_LOCKFREE_H_
+#define _BIRD_LOCKFREE_H_
+
+#include "lib/defer.h"
+#include "lib/event.h"
+#include "lib/rcu.h"
+#include "lib/settle.h"
+#include "lib/tlists.h"
+#include "lib/io-loop.h"
+
+#include <stdatomic.h>
+
+/**
+ * Lock-free usecounts.
+ */
+
+struct lfuc {
+ _Atomic u64 uc;
+};
+
+#define LFUC_PU_SHIFT 44
+#define LFUC_IN_PROGRESS (1ULL << LFUC_PU_SHIFT)
+
+/**
+ * lfuc_lock - increase an atomic usecount
+ * @c: the usecount structure
+ */
+static inline u64 lfuc_lock(struct lfuc *c)
+{
+ /* Locking is trivial; somebody already holds the underlying data structure
+ * so we just increase the use count. Nothing can be freed underneath our hands. */
+ u64 uc = atomic_fetch_add_explicit(&c->uc, 1, memory_order_acq_rel);
+ ASSERT_DIE(uc > 0);
+ return uc & (LFUC_IN_PROGRESS - 1);
+}
+
+/**
+ * lfuc_lock_revive - increase an atomic usecount even if it's zero
+ * @c: the usecount structure
+ *
+ * If the caller is sure that they can't collide with the prune routine,
+ * they can call this even on structures with an already zeroed usecount.
+ * Handy for situations with flapping routes. Use only from the same
+ * loop that runs the prune routine.
+ */
+static inline u64 lfuc_lock_revive(struct lfuc *c)
+{
+ u64 uc = atomic_fetch_add_explicit(&c->uc, 1, memory_order_acq_rel);
+ return uc & (LFUC_IN_PROGRESS - 1);
+}
+
+/**
+ * lfuc_unlock_immediately - decrease an atomic usecount
+ * @c: the usecount structure
+ * @el: prune event list
+ * @ev: prune event itself
+ *
+ * If the usecount reaches zero, a prune event is run to possibly free the object.
+ * The prune event MUST use lfuc_finished() to check the object state.
+ */
+static inline void lfuc_unlock_immediately(struct lfuc *c, event_list *el, event *ev)
+{
+  /* Unlocking is tricky. We do it locklessly, so the prune event may be
+   * running at the same time; therefore, if the unlock gets us to zero, it must
+   * be the last thing in this routine, otherwise the prune routine may find
+   * the usecount zeroed and free the structure prematurely.
+ *
+ * The usecount is split into two parts:
+ * the top 20 bits are an in-progress indicator
+ * the bottom 44 bits keep the actual usecount.
+ *
+   * Therefore at most about a million writers can simultaneously unlock the same
+   * structure, while at most ~17T different places can reference it. Both limits
+   * are insanely high from the 2022 point of view. Let's suppose that by the time
+   * 17T routes or 1M peers/tables become real, we'll also have 128-bit atomic
+   * variables in the C standard. */
+
+ /* First, we push the in-progress indicator */
+ u64 uc = atomic_fetch_add_explicit(&c->uc, LFUC_IN_PROGRESS, memory_order_acq_rel);
+
+  /* Then we split the value into its parts. Remember, we got the value from
+   * before the operation happened, so we redo the operation locally to see
+   * how the counter _would_ look if nobody else were interacting.
+ */
+ u64 pending = (uc >> LFUC_PU_SHIFT) + 1;
+ uc &= LFUC_IN_PROGRESS - 1;
+
+ /* Obviously, there can't be more pending unlocks than the usecount itself */
+ if (uc == pending)
+ /* If we're the last unlocker (every owner is already unlocking), schedule
+ * the owner's prune event */
+ ev_send(el, ev);
+ else
+ ASSERT_DIE(uc > pending);
+
+ /* And now, finally, simultaneously pop the in-progress indicator and the
+ * usecount, possibly allowing the pruning routine to free this structure */
+ uc = atomic_fetch_sub_explicit(&c->uc, LFUC_IN_PROGRESS + 1, memory_order_acq_rel);
+
+// return uc - LFUC_IN_PROGRESS - 1;
+}
+
+struct lfuc_unlock_queue_item {
+ struct deferred_call dc;
+ struct lfuc *c;
+ event_list *el;
+ event *ev;
+};
+
+void lfuc_unlock_deferred(struct deferred_call *dc);
+
+static inline void lfuc_unlock(struct lfuc *c, event_list *el, event *ev)
+{
+ struct lfuc_unlock_queue_item luqi = {
+ .dc.hook = lfuc_unlock_deferred,
+ .c = c,
+ .el = el,
+ .ev = ev,
+ };
+
+ defer_call(&luqi.dc, sizeof luqi);
+}
+
+/**
+ * lfuc_finished - auxiliary routine for prune event
+ * @c: usecount structure
+ *
+ * This routine simply waits until all unlockers finish their job and leave
+ * the critical section of lfuc_unlock(). Then we decide whether the usecount
+ * is indeed zero or not, and therefore whether the structure is free to be freed.
+ */
- _Atomic _Bool not_last;
++static inline bool
+lfuc_finished(struct lfuc *c)
+{
+ u64 uc;
+ /* Wait until all unlockers finish */
+ while ((uc = atomic_load_explicit(&c->uc, memory_order_acquire)) >> LFUC_PU_SHIFT)
+ birdloop_yield();
+
+ /* All of them are now done and if the usecount is now zero, then we're
+ * the last place to reference the object and we can call it finished. */
+ return (uc == 0);
+}
+
+/**
+ * lfuc_init - auxiliary routine for usecount initialization
+ * @c: usecount structure
+ *
+ * Called on object initialization, sets the usecount to one to make sure
+ * that the prune routine doesn't free the object before somebody else references it.
+ */
+static inline void
+lfuc_init(struct lfuc *c)
+{
+ atomic_store_explicit(&c->uc, 1, memory_order_release);
+}
+
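+/* A minimal usage sketch of the usecount protocol (illustrative only; the
+ * object and helper names are hypothetical). The owner initializes the
+ * counter, readers lock and unlock it, and the prune event hook frees the
+ * object only after lfuc_finished() confirms that nobody holds it and no
+ * unlocker is still inside lfuc_unlock().
+ *
+ *   struct my_obj { struct lfuc uc; ... };
+ *
+ *   lfuc_init(&obj->uc);                             // owner's initial reference
+ *   ...
+ *   lfuc_lock(&obj->uc);                             // reader takes a reference
+ *   ... use obj ...
+ *   lfuc_unlock(&obj->uc, prune_list, &prune_event); // reader drops it
+ *
+ *   // prune event hook
+ *   if (lfuc_finished(&obj->uc))
+ *     free_the_object(obj);
+ */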
+
+/**
+ * Lock-free journal.
+ */
+
+/* Journal item. Put LFJOUR_ITEM_INHERIT(name) into your structure
+ * to inherit lfjour_item */
+#define LFJOUR_ITEM \
+ u64 seq; \
+
+struct lfjour_item {
+ LFJOUR_ITEM;
+};
+
+#define LFJOUR_ITEM_INHERIT(name) union { \
+ struct lfjour_item name; \
+ struct { LFJOUR_ITEM; }; \
+}
+
+/* Journal item block. Internal structure, no need to check out. */
+#define TLIST_PREFIX lfjour_block
+#define TLIST_TYPE struct lfjour_block
+#define TLIST_ITEM n
+#define TLIST_WANT_ADD_TAIL
+
+struct lfjour_block {
+ TLIST_DEFAULT_NODE;
+ _Atomic u32 end;
- static inline _Bool lfjour_reset_seqno(struct lfjour_recipient *r)
++ _Atomic bool not_last;
+
+ struct lfjour_item _block[0];
+};
+
+/* Defines lfjour_block_list */
+#include "lib/tlists.h"
+
+/* Journal recipient. Inherit this in your implementation. */
+#define TLIST_PREFIX lfjour_recipient
+#define TLIST_TYPE struct lfjour_recipient
+#define TLIST_ITEM n
+#define TLIST_WANT_ADD_TAIL
+#define TLIST_WANT_WALK
+
+struct lfjour_recipient {
+ TLIST_DEFAULT_NODE;
+ event *event; /* Event running when something is in the journal */
+ event_list *target; /* Event target */
+ const struct lfjour_item * _Atomic last; /* Last item processed */
+ u64 first_holding_seq; /* First item not released yet */
+ struct lfjour_item *cur; /* Processing this now */
+ _Atomic u64 recipient_flags; /* LFJOUR_R_* */
+};
+
+enum lfjour_recipient_flags {
+ LFJOUR_R_SEQ_RESET = 1, /* Signalling of sequence number reset */
+};
+
+/* Defines lfjour_recipient_list */
+#include "lib/tlists.h"
+
+/* Journal base structure. Include this. */
+struct lfjour {
+ struct domain_generic *domain; /* The journal itself belongs to this domain (if different from the loop) */
+ struct birdloop *loop; /* Cleanup loop */
+ u32 item_size, item_count; /* Allocation parameters */
+ struct lfjour_block_list pending; /* List of packed journal blocks */
+ struct lfjour_item * _Atomic first; /* First journal item to announce */
+ struct lfjour_item *open; /* Journal item in progress */
+ u64 next_seq; /* Next export to push has this ID */
+ struct lfjour_recipient_list recipients; /* Announce updates to these */
+ event announce_kick_event; /* Kicks announce_timer */
+ struct settle announce_timer; /* Announces changes to recipients */
+ event cleanup_event; /* Runs the journal cleanup routine */
+
+ /* Callback on item removal from journal */
+ void (*item_done)(struct lfjour *, struct lfjour_item *);
+
+ /* Callback when the cleanup routine is ending */
+ void (*cleanup_done)(struct lfjour *, u64 begin_seq, u64 end_seq);
+};
+
+struct lfjour_item *lfjour_push_prepare(struct lfjour *);
+void lfjour_push_commit(struct lfjour *);
+
+struct lfjour_item *lfjour_get(struct lfjour_recipient *);
+void lfjour_release(struct lfjour_recipient *, const struct lfjour_item *);
++static inline bool lfjour_reset_seqno(struct lfjour_recipient *r)
+{
+ return atomic_fetch_and_explicit(&r->recipient_flags, ~LFJOUR_R_SEQ_RESET, memory_order_acq_rel) & LFJOUR_R_SEQ_RESET;
+}
+
+void lfjour_announce_now(struct lfjour *);
+u64 lfjour_pending_items(struct lfjour *);
+
+static inline void lfjour_schedule_cleanup(struct lfjour *j)
+{ ev_send_loop(j->loop, &j->cleanup_event); }
+
+static inline void lfjour_do_cleanup_now(struct lfjour *j)
+{
+ /* This requires the caller to own the cleanup event loop */
+ ev_postpone(&j->cleanup_event);
+ j->cleanup_event.hook(j->cleanup_event.data);
+}
+
+void lfjour_register(struct lfjour *, struct lfjour_recipient *);
+void lfjour_unregister(struct lfjour_recipient *);
+static inline uint lfjour_count_recipients(struct lfjour *j)
+{ return TLIST_LENGTH(lfjour_recipient, &j->recipients); }
+
+void lfjour_init(struct lfjour *, struct settle_config *);
+
+
+static inline struct lfjour *lfjour_of_recipient(struct lfjour_recipient *r)
+{
+ struct lfjour_recipient_list *list = lfjour_recipient_enlisted(r);
+ return list ? SKIP_BACK(struct lfjour, recipients, list) : NULL;
+}
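+
+/* A minimal usage sketch (illustrative only; j and my_recipient are
+ * hypothetical). The producer runs with the journal's domain locked;
+ * the recipient processes updates from its own event, typically in
+ * another loop.
+ *
+ *   // producer
+ *   struct lfjour_item *it = lfjour_push_prepare(j);
+ *   if (it)    // NULL when there is nothing pending and nobody listens
+ *   {
+ *     ... fill the item (your structure inherits struct lfjour_item) ...
+ *     lfjour_push_commit(j);
+ *   }
+ *
+ *   // recipient event hook
+ *   struct lfjour_item *u;
+ *   while ((u = lfjour_get(&my_recipient)))
+ *   {
+ *     ... process the update ...
+ *     lfjour_release(&my_recipient, u);
+ *   }
+ */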
+#endif
--- /dev/null
- struct domain_generic *domain_new(uint order, _Bool allow_rcu);
+/*
+ * BIRD Library -- Locking
+ *
+ * (c) 2020--2021 Maria Matejka <mq@jmq.cz>
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#ifndef _BIRD_LOCKING_H_
+#define _BIRD_LOCKING_H_
+
+#include "lib/birdlib.h"
+#include "lib/macro.h"
+#include "lib/rcu.h"
+
+struct domain_generic;
+struct pool;
+
+#define LOCK_ORDER \
+ the_bird, \
+ meta, \
+ control, \
+ proto, \
+ service, \
+ rtable, \
+ attrs, \
+ logging, \
+ resource, \
+
+/* Here define the global lock order; first to last. */
+struct lock_order {
+#define LOCK_ORDER_EXPAND(p) struct domain_generic *p;
+ MACRO_FOREACH(LOCK_ORDER_EXPAND, LOCK_ORDER)
+#undef LOCK_ORDER_EXPAND
+};
+
+#define LOCK_ORDER_EXPAND(p) struct domain__##p { struct domain_generic *p; };
+ MACRO_FOREACH(LOCK_ORDER_EXPAND, LOCK_ORDER)
+#undef LOCK_ORDER_EXPAND
+
+extern _Thread_local struct lock_order locking_stack;
+extern _Thread_local struct domain_generic **last_locked;
+
+#define DOMAIN(type) struct domain__##type
+#define DOMAIN_ORDER(type) OFFSETOF(struct lock_order, type)
+
+#define DOMAIN_NEW(type) (DOMAIN(type)) { .type = domain_new(DOMAIN_ORDER(type), 1) }
+#define DOMAIN_NEW_RCU_SYNC(type) (DOMAIN(type)) { .type = domain_new(DOMAIN_ORDER(type), 0) }
- static inline void rws_mark(rw_spinlock *p, _Bool write, _Bool lock)
++struct domain_generic *domain_new(uint order, bool allow_rcu);
+
+#define DOMAIN_FREE(type, d) domain_free((d).type)
+void domain_free(struct domain_generic *);
+
+#define DOMAIN_NAME(type, d) domain_name((d).type)
+const char *domain_name(struct domain_generic *);
+
+#define DOMAIN_SETUP(type, d, n, p) domain_setup((d).type, n, p)
+void domain_setup(struct domain_generic *, const char *name, struct pool *);
+
+#define DOMAIN_NULL(type) (DOMAIN(type)) {}
+
+#define LOCK_DOMAIN(type, d) do_lock(((d).type), &(locking_stack.type))
+#define UNLOCK_DOMAIN(type, d) do_unlock(((d).type), &(locking_stack.type))
+
+#define DOMAIN_IS_LOCKED(type, d) (((d).type) == (locking_stack.type))
+#define DG_IS_LOCKED(d) ((d) == *(DG_LSP(d)))
+
+/* Internal for locking */
+void do_lock(struct domain_generic *dg, struct domain_generic **lsp);
+void do_unlock(struct domain_generic *dg, struct domain_generic **lsp);
+
+uint dg_order(struct domain_generic *dg);
+
+#define DG_LSP(d) ((struct domain_generic **) (((void *) &locking_stack) + dg_order(d)))
+#define DG_LOCK(d) do_lock(d, DG_LSP(d))
+#define DG_UNLOCK(d) do_unlock(d, DG_LSP(d))
+
+/* Use with care. To be removed in near future. */
+extern DOMAIN(the_bird) the_bird_domain;
+
+#define the_bird_lock() LOCK_DOMAIN(the_bird, the_bird_domain)
+#define the_bird_unlock() UNLOCK_DOMAIN(the_bird, the_bird_domain)
+#define the_bird_locked() DOMAIN_IS_LOCKED(the_bird, the_bird_domain)
+
+#define ASSERT_THE_BIRD_LOCKED ({ if (!the_bird_locked()) bug("The BIRD lock must be locked here: %s:%d", __FILE__, __LINE__); })
+
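+/* A minimal usage sketch (illustrative only): create a domain of the wanted
+ * lock-order class once, then lock and unlock it with the type-checked macros.
+ *
+ *   DOMAIN(rtable) dom = DOMAIN_NEW(rtable);
+ *
+ *   LOCK_DOMAIN(rtable, dom);
+ *   ... access data guarded by dom ...
+ *   UNLOCK_DOMAIN(rtable, dom);
+ *
+ *   DOMAIN_FREE(rtable, dom);
+ */
+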
+/*
+ * RW spinlocks
+ */
+
+#define RWS_READ_PENDING_POS 0
+#define RWS_READ_ACTIVE_POS 20
+#define RWS_WRITE_PENDING_POS 40
+#define RWS_WRITE_ACTIVE_POS 56
+
+#define RWS_READ_PENDING (1ULL << RWS_READ_PENDING_POS)
+#define RWS_READ_ACTIVE (1ULL << RWS_READ_ACTIVE_POS)
+#define RWS_WRITE_PENDING (1ULL << RWS_WRITE_PENDING_POS)
+#define RWS_WRITE_ACTIVE (1ULL << RWS_WRITE_ACTIVE_POS)
+
+#define RWS_READ_PENDING_MASK (RWS_READ_ACTIVE - 1)
+#define RWS_READ_ACTIVE_MASK ((RWS_WRITE_PENDING - 1) & ~(RWS_READ_ACTIVE - 1))
+#define RWS_WRITE_PENDING_MASK ((RWS_WRITE_ACTIVE - 1) & ~(RWS_WRITE_PENDING - 1))
+#define RWS_WRITE_ACTIVE_MASK (~(RWS_WRITE_ACTIVE - 1))
+
+typedef struct {
+ u64 _Atomic spin;
+} rw_spinlock;
+
+#ifdef DEBUGGING
+#define MAX_RWS_AT_ONCE 32
+extern _Thread_local rw_spinlock *rw_spinlocks_taken[MAX_RWS_AT_ONCE];
+extern _Thread_local btime rw_spinlocks_time[MAX_RWS_AT_ONCE];
+extern _Thread_local u32 rw_spinlocks_taken_cnt;
+extern _Thread_local u32 rw_spinlocks_taken_write;
+
+/* Borrowed from lib/timer.h */
+btime current_time_now(void);
+
++static inline void rws_mark(rw_spinlock *p, bool write, bool lock)
+{
+ if (lock) {
+ ASSERT_DIE(rw_spinlocks_taken_cnt < MAX_RWS_AT_ONCE);
+ if (write)
+ rw_spinlocks_taken_write |= (1 << rw_spinlocks_taken_cnt);
+ else
+ rw_spinlocks_taken_write &= ~(1 << rw_spinlocks_taken_cnt);
+ rw_spinlocks_time[rw_spinlocks_taken_cnt] = current_time_now();
+ rw_spinlocks_taken[rw_spinlocks_taken_cnt++] = p;
+
+ }
+ else {
+ ASSERT_DIE(rw_spinlocks_taken_cnt > 0);
+ ASSERT_DIE(rw_spinlocks_taken[--rw_spinlocks_taken_cnt] == p);
+ ASSERT_DIE(!(rw_spinlocks_taken_write & (1 << rw_spinlocks_taken_cnt)) == !write);
+ btime tdif = current_time_now() - rw_spinlocks_time[rw_spinlocks_taken_cnt];
+ if (tdif > 1 S_)
+ log(L_WARN "Spent an alarming time %t s in spinlock %p (%s); "
+ "if this happens often to you, please contact the developers.",
+ tdif, p, write ? "write" : "read");
+ }
+}
+#else
+#define rws_mark(...)
+#endif
+
+static inline void rws_init(rw_spinlock *p)
+{
+ atomic_store_explicit(&p->spin, 0, memory_order_relaxed);
+}
+
+static inline void rws_read_lock(rw_spinlock *p)
+{
+ u64 old = atomic_fetch_add_explicit(&p->spin, RWS_READ_PENDING, memory_order_acquire);
+
+ while (1)
+ {
+ /* Wait until all writers end */
+ while (old & (RWS_WRITE_PENDING_MASK | RWS_WRITE_ACTIVE_MASK))
+ {
+ birdloop_yield();
+ old = atomic_load_explicit(&p->spin, memory_order_acquire);
+ }
+
+ /* Convert to active */
+ old = atomic_fetch_add_explicit(&p->spin, RWS_READ_ACTIVE - RWS_READ_PENDING, memory_order_acq_rel);
+
+ if (old & RWS_WRITE_ACTIVE_MASK)
+ /* Oh but some writer was faster */
+ old = atomic_fetch_sub_explicit(&p->spin, RWS_READ_ACTIVE - RWS_READ_PENDING, memory_order_acq_rel);
+ else
+ /* No writers, approved */
+ break;
+ }
+
+ rws_mark(p, 0, 1);
+}
+
+static inline void rws_read_unlock(rw_spinlock *p)
+{
+ rws_mark(p, 0, 0);
+ u64 old = atomic_fetch_sub_explicit(&p->spin, RWS_READ_ACTIVE, memory_order_release);
+ ASSERT_DIE(old & RWS_READ_ACTIVE_MASK);
+}
+
+static inline void rws_write_lock(rw_spinlock *p)
+{
+ u64 old = atomic_fetch_add_explicit(&p->spin, RWS_WRITE_PENDING, memory_order_acquire);
+
+ /* Wait until all active readers end */
+ while (1)
+ {
+ while (old & (RWS_READ_ACTIVE_MASK | RWS_WRITE_ACTIVE_MASK))
+ {
+ birdloop_yield();
+ old = atomic_load_explicit(&p->spin, memory_order_acquire);
+ }
+
+ /* Mark self as active */
+ u64 updated = atomic_fetch_or_explicit(&p->spin, RWS_WRITE_ACTIVE, memory_order_acquire);
+
+ /* And it's us */
+ if (!(updated & RWS_WRITE_ACTIVE))
+ {
+ if (updated & RWS_READ_ACTIVE_MASK)
+ /* But some reader was faster */
+ atomic_fetch_and_explicit(&p->spin, ~RWS_WRITE_ACTIVE, memory_order_release);
+ else
+ /* No readers, approved */
+ break;
+ }
+ }
+
+ /* It's us, then we aren't actually pending */
+ u64 updated = atomic_fetch_sub_explicit(&p->spin, RWS_WRITE_PENDING, memory_order_acquire);
+ ASSERT_DIE(updated & RWS_WRITE_PENDING_MASK);
+ rws_mark(p, 1, 1);
+}
+
+static inline void rws_write_unlock(rw_spinlock *p)
+{
+ rws_mark(p, 1, 0);
+ u64 old = atomic_fetch_and_explicit(&p->spin, ~RWS_WRITE_ACTIVE, memory_order_release);
+ ASSERT_DIE(old & RWS_WRITE_ACTIVE);
+}
+
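+/* A minimal usage sketch (illustrative only): keep the critical sections
+ * short, as contended callers spin by yielding.
+ *
+ *   rw_spinlock lock;
+ *   rws_init(&lock);
+ *
+ *   rws_read_lock(&lock);     // many concurrent readers are fine
+ *   ... read the protected data ...
+ *   rws_read_unlock(&lock);
+ *
+ *   rws_write_lock(&lock);    // exclusive access
+ *   ... modify the protected data ...
+ *   rws_write_unlock(&lock);
+ */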
+
+/*
+ * Unwind stored lock state helpers
+ */
+struct locking_unwind_status {
+ struct lock_order *desired;
+ enum {
+ LOCKING_UNWIND_SAME,
+ LOCKING_UNWIND_UNLOCK,
+ } state;
+};
+
+static inline struct locking_unwind_status locking_unwind_helper(struct locking_unwind_status status, uint order)
+{
+ struct domain_generic **lsp = ((void *) &locking_stack) + order;
+ struct domain_generic **dp = ((void *) status.desired) + order;
+
+ if (!status.state)
+ {
+ /* Just checking that the rest of the stack is consistent */
+ if (*lsp != *dp)
+ bug("Mangled lock unwind state at order %d", order);
+ }
+ else if (*dp)
+ /* Stored state expects locked */
+ if (*lsp == *dp)
+ /* Indeed is locked, switch to check mode */
+ status.state = 0;
+ else
+ /* Not locked or locked elsewhere */
+ bug("Mangled lock unwind state at order %d", order);
+ else if (*lsp)
+ /* Stored state expects unlocked but we're locked */
+ DG_UNLOCK(*lsp);
+
+ return status;
+}
+
+static inline void locking_unwind(struct lock_order *desired)
+{
+ struct locking_unwind_status status = {
+ .desired = desired,
+ .state = LOCKING_UNWIND_UNLOCK,
+ };
+
+#define LOCK_ORDER_POS_HELPER(x) DOMAIN_ORDER(x),
+#define LOCK_ORDER_POS MACRO_FOREACH(LOCK_ORDER_POS_HELPER, LOCK_ORDER)
+ MACRO_RPACK(locking_unwind_helper, status, LOCK_ORDER_POS);
+#undef LOCK_ORDER_POS_HELPER
+}
+
+/**
+ * Objects bound with domains
+ *
+ * First, we need some object to have its locked and unlocked part.
+ * This is accomplished typically by the following pattern:
+ *
+ * struct foo_public {
+ * ... // Public fields
+ * DOMAIN(bar) lock; // The assigned domain
+ * };
+ *
+ * struct foo_private {
+ * struct foo_public; // Importing public fields
+ * struct foo_private **locked_at; // Auxiliary field for locking routines
+ * ... // Private fields
+ * };
+ *
+ * typedef union foo {
+ * struct foo_public;
+ * struct foo_private priv;
+ * } foo;
+ *
+ * All persistently stored object pointers MUST point to the public parts.
+ * If accessing the locked object from embedded objects, great care must
+ * be applied to always SKIP_BACK to the public object version, not the
+ * private one.
+ *
+ * To access the private object parts, either the private object pointer
+ * is explicitly given to us, in which case we assume that the domain has
+ * already been locked somewhere else, or we have to lock the domain
+ * ourselves. For that, there are some handy macros.
+ */
+
+#define LOBJ_LOCK_SIMPLE(_obj, _level) \
+ ({ LOCK_DOMAIN(_level, (_obj)->lock); &(_obj)->priv; })
+
+#define LOBJ_UNLOCK_SIMPLE(_obj, _level) \
+ UNLOCK_DOMAIN(_level, (_obj)->lock)
+
+/*
+ * These macros can be used to define specific macros for given class.
+ *
+ * #define FOO_LOCK_SIMPLE(foo) LOBJ_LOCK_SIMPLE(foo, bar)
+ * #define FOO_UNLOCK_SIMPLE(foo) LOBJ_UNLOCK_SIMPLE(foo, bar)
+ *
+ * Then these can be used like this:
+ *
+ * void foo_frobnicate(foo *f)
+ * {
+ * // Unlocked context
+ * ...
+ * struct foo_private *fp = FOO_LOCK_SIMPLE(f);
+ * // Locked context
+ * ...
+ * FOO_UNLOCK_SIMPLE(f);
+ * // Unlocked context
+ * ...
+ * }
+ *
+ * These simple calls have two major drawbacks. First, if you return
+ * from locked context, you don't unlock, which may lock you dead.
+ * And second, the foo_private pointer is still syntactically valid
+ * even after unlocking.
+ *
+ * To fight this, we need more magic and the switch should stay in that
+ * position.
+ *
+ * First, we need an auxiliary _function_ for unlocking. This function
+ * is intended to be called in a local variable cleanup context.
+ */
+
+#define LOBJ_UNLOCK_CLEANUP_NAME(_stem) _lobj__##_stem##_unlock_cleanup
+
+#define LOBJ_UNLOCK_CLEANUP(_stem, _level) \
+ static inline void LOBJ_UNLOCK_CLEANUP_NAME(_stem)(struct _stem##_private **obj) { \
+ if (!*obj) return; \
+ ASSERT_DIE(LOBJ_IS_LOCKED((*obj), _level)); \
+ ASSERT_DIE((*obj)->locked_at == obj); \
+ (*obj)->locked_at = NULL; \
+ UNLOCK_DOMAIN(_level, (*obj)->lock); \
+ }
+
+#define LOBJ_LOCK(_obj, _pobj, _stem, _level) \
+ CLEANUP(LOBJ_UNLOCK_CLEANUP_NAME(_stem)) struct _stem##_private *_pobj = LOBJ_LOCK_SIMPLE(_obj, _level); _pobj->locked_at = &_pobj;
+
+/*
+ * And now the usage of these macros. You first need to declare the auxiliary
+ * cleanup function.
+ *
+ * LOBJ_UNLOCK_CLEANUP(foo, bar);
+ *
+ * And then declare the lock-local macro:
+ *
+ * #define FOO_LOCK(foo, fpp) LOBJ_LOCK(foo, fpp, foo, bar)
+ *
+ * This construction then allows you to lock much more safely:
+ *
+ * void foo_frobnicate_safer(foo *f)
+ * {
+ * // Unlocked context
+ * ...
+ * do {
+ * FOO_LOCK(foo, fpp);
+ * // Locked context, fpp is valid here
+ *
+ * if (something) return; // This implicitly unlocks
+ * if (whatever) break; // This unlocks too
+ *
+ * // Finishing context with no unlock at all
+ * } while (0);
+ *
+ * // Here is fpp invalid and the object is back unlocked.
+ * ...
+ * }
+ *
+ * There is no explicit unlock statement. To unlock, simply leave the block
+ * with locked context.
+ *
+ * This may be made even nicer to use by employing a for-cycle.
+ */
+
+#define LOBJ_LOCKED(_obj, _pobj, _stem, _level) \
+ for (CLEANUP(LOBJ_UNLOCK_CLEANUP_NAME(_stem)) struct _stem##_private *_pobj = LOBJ_LOCK_SIMPLE(_obj, _level); \
+ _pobj ? (_pobj->locked_at = &_pobj) : NULL; \
+ LOBJ_UNLOCK_CLEANUP_NAME(_stem)(&_pobj), _pobj = NULL)
+
+/*
+ * This for-cycle employs heavy magic to hide as much of the boilerplate
+ * from the user as possible. Here is how it works.
+ *
+ * First, the for-1 clause is executed, setting up _pobj, to the private
+ * object pointer. It has a cleanup hook set.
+ *
+ * Then, the for-2 clause is checked. As _pobj is non-NULL, _pobj->locked_at
+ * is initialized to the _pobj address to ensure that the cleanup hook unlocks
+ * the right object.
+ *
+ * Now the user block is executed. If it ends by break or return, the cleanup
+ * hook fires for _pobj, triggering object unlock.
+ *
+ * If the user block executed completely, the for-3 clause is run, executing
+ * the cleanup hook directly and then deactivating it by setting _pobj to NULL.
+ *
+ * Finally, the for-2 clause is checked again but now with _pobj being NULL,
+ * causing the loop to end. As the object has already been unlocked, nothing
+ * happens after leaving the context.
+ *
+ * #define FOO_LOCKED(foo, fpp) LOBJ_LOCKED(foo, fpp, foo, bar)
+ *
+ * Then the previous code can be modified like this:
+ *
+ * void foo_frobnicate_safer(foo *f)
+ * {
+ * // Unlocked context
+ * ...
+ * FOO_LOCKED(foo, fpp)
+ * {
+ * // Locked context, fpp is valid here
+ *
+ * if (something) return; // This implicitly unlocks
+ * if (whatever) break; // This unlocks too
+ *
+ * // Finishing context with no unlock at all
+ * }
+ *
+ * // Unlocked context
+ * ...
+ *
+ * // Locking once again without an explicit block
+ * FOO_LOCKED(foo, fpp)
+ * do_something(fpp);
+ *
+ * // Here is fpp invalid and the object is back unlocked.
+ * ...
+ * }
+ *
+ *
+ * For many reasons, a lock-check macro is handy.
+ *
+ * #define FOO_IS_LOCKED(foo) LOBJ_IS_LOCKED(foo, bar)
+ */
+
+#define LOBJ_IS_LOCKED(_obj, _level) DOMAIN_IS_LOCKED(_level, (_obj)->lock)
+
+/*
+ * An example implementation is available in lib/locking_test.c
+ */
+
+
+/*
+ * Please don't use this macro unless you at least try to prove that
+ * it's completely safe. It's a can of worms.
+ *
+ * NEVER RETURN OR BREAK FROM THIS MACRO, it will crash.
+ */
+
+#define LOBJ_UNLOCKED_TEMPORARILY(_obj, _pobj, _stem, _level) \
+ for (union _stem *_obj = SKIP_BACK(union _stem, priv, _pobj), **_lataux = (union _stem **) _pobj->locked_at; \
+ _obj ? (_pobj->locked_at = NULL, LOBJ_UNLOCK_SIMPLE(_obj, _level), _obj) : NULL; \
+ LOBJ_LOCK_SIMPLE(_obj, _level), _pobj->locked_at = (struct _stem##_private **) _lataux, _obj = NULL)
+
+/*
+ * Get the locked object when the lock is already taken
+ */
+
+#define LOBJ_PRIV(_obj, _level) \
+ ({ ASSERT_DIE(DOMAIN_IS_LOCKED(_level, (_obj)->lock)); &(_obj)->priv; })
+
+
+/*
+ * RCU retry unwinder
+ *
+ * Start a retriable operation with RCU_ANCHOR() and pass the _i object along
+ * with the code which may then call RCU_RETRY() to return back to RCU_ANCHOR
+ * and try again.
+ */
+
+struct rcu_unwinder {
+ struct lock_order locking_stack;
+ u32 retry;
+ u8 fast;
+ jmp_buf buf;
+};
+
+static inline void _rcu_unwinder_unlock_(struct rcu_unwinder *o UNUSED)
+{
+ rcu_read_unlock();
+}
+
+#define RCU_UNWIND_WARN 4096
+
+#define RCU_ANCHOR(_i) \
+ CLEANUP(_rcu_unwinder_unlock_) struct rcu_unwinder _s##_i = {}; \
+ struct rcu_unwinder *_i = &_s##_i; \
+ if (setjmp(_i->buf)) { \
+ rcu_read_unlock(); \
+ locking_unwind(&_i->locking_stack); \
+ if (_i->fast) _i->fast = 0; \
+ else { \
+ birdloop_yield(); \
+ if (!(++_i->retry % RCU_UNWIND_WARN)) \
+ log(L_WARN "Suspiciously many RCU_ANCHORs retried (%lu)" \
+ " at %s:%d", _i->retry, __FILE__, __LINE__); \
+ } \
+ } \
+ _i->locking_stack = locking_stack; \
+ rcu_read_lock(); \
+
+#define RCU_RETRY(_i) do { if (_i) longjmp(_i->buf, 1); else bug("No rcu retry allowed here"); } while (0)
+
+#define RCU_RETRY_FAST(_i) do { (_i)->fast++; RCU_RETRY(_i); } while (0)
+
+#define RCU_WONT_RETRY ((struct rcu_unwinder *) NULL)
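+
+/* A minimal usage sketch (illustrative only; fragile_find(), still_valid()
+ * and obj_ref() are hypothetical helpers). The lookup runs inside the RCU
+ * read-side section opened by RCU_ANCHOR() and simply restarts from the
+ * anchor whenever it observes an inconsistent state.
+ *
+ *   static struct thing *lookup(u32 key)
+ *   {
+ *     RCU_ANCHOR(u);
+ *     struct thing *t = fragile_find(key);   // lockless lookup
+ *     if (t && !still_valid(t))
+ *       RCU_RETRY(u);                        // jump back to RCU_ANCHOR(u) and retry
+ *     return t ? obj_ref(t) : NULL;          // take a reference before the section ends
+ *   }
+ */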
+#endif
--- /dev/null
- for (_Bool sorted = 0; !sorted++; )
+#include "test/birdtest.h"
+#include "test/bt-utils.h"
+
+#include "lib/locking.h"
+#include <stdatomic.h>
+#include <pthread.h>
+
+#define FOO_PUBLIC \
+ const char *name; \
+ _Atomic uint counter; \
+ DOMAIN(proto) lock; \
+
+struct foo_private {
+ struct { FOO_PUBLIC; };
+ struct foo_private **locked_at;
+ uint private_counter;
+};
+
+typedef union foo {
+ struct { FOO_PUBLIC; };
+ struct foo_private priv;
+} foo;
+
+LOBJ_UNLOCK_CLEANUP(foo, proto);
+#define FOO_LOCK(_foo, _fpp) LOBJ_LOCK(_foo, _fpp, foo, proto)
+#define FOO_LOCKED(_foo, _fpp) LOBJ_LOCKED(_foo, _fpp, foo, proto)
+#define FOO_IS_LOCKED(_foo) LOBJ_IS_LOCKED(_foo, proto)
+
+static uint
+inc_public(foo *f)
+{
+ return atomic_fetch_add_explicit(&f->counter, 1, memory_order_relaxed) + 1;
+}
+
+static uint
+inc_private(foo *f)
+{
+ FOO_LOCKED(f, fp) return ++fp->private_counter;
+ bug("Returning always");
+}
+
+#define BLOCKCOUNT 4096
+#define THREADS 16
+#define REPEATS 128
+
+static void *
+thread_run(void *_foo)
+{
+ foo *f = _foo;
+
+ for (int i=0; i<REPEATS; i++)
+ if (i % 2)
+ for (int j=0; j<BLOCKCOUNT; j++)
+ inc_public(f);
+ else
+ for (int j=0; j<BLOCKCOUNT; j++)
+ inc_private(f);
+
+ return NULL;
+}
+
+static int
+t_locking(void)
+{
+ pthread_t thr[THREADS];
+ foo f = { .lock = DOMAIN_NEW(proto), };
+
+ for (int i=0; i<THREADS; i++)
+ bt_assert(pthread_create(&thr[i], NULL, thread_run, &f) == 0);
+
+ for (int i=0; i<THREADS; i++)
+ bt_assert(pthread_join(thr[i], NULL) == 0);
+
+ bt_assert(f.priv.private_counter == atomic_load_explicit(&f.counter, memory_order_relaxed));
+ bt_assert(f.priv.private_counter == THREADS * BLOCKCOUNT * REPEATS / 2);
+
+ return 1;
+}
+
+#define RWS_DATASIZE 333
+#define RWS_THREADS 128
+
+struct rws_test_data {
+ int data[RWS_DATASIZE];
+ rw_spinlock rws[RWS_DATASIZE];
+};
+
+static void *
+rwspin_thread_run(void *_rtd)
+{
+ struct rws_test_data *d = _rtd;
+
++ for (bool sorted = 0; !sorted++; )
+ {
+ for (int i=0; (i<RWS_DATASIZE-1) && sorted; i++)
+ {
+ rws_read_lock(&d->rws[i]);
+ rws_read_lock(&d->rws[i+1]);
+
+ ASSERT_DIE(d->data[i] >= 0);
+ ASSERT_DIE(d->data[i+1] >= 0);
+ if (d->data[i] > d->data[i+1])
+ sorted = 0;
+
+ rws_read_unlock(&d->rws[i+1]);
+ rws_read_unlock(&d->rws[i]);
+ }
+
+ for (int i=0; (i<RWS_DATASIZE-1); i++)
+ {
+ rws_write_lock(&d->rws[i]);
+ rws_write_lock(&d->rws[i+1]);
+
+ int first = d->data[i];
+ int second = d->data[i+1];
+
+ ASSERT_DIE(first >= 0);
+ ASSERT_DIE(second >= 0);
+
+ d->data[i] = d->data[i+1] = -1;
+
+ if (first > second)
+ {
+ d->data[i] = second;
+ d->data[i+1] = first;
+ }
+ else
+ {
+ d->data[i] = first;
+ d->data[i+1] = second;
+ }
+
+ rws_write_unlock(&d->rws[i+1]);
+ rws_write_unlock(&d->rws[i]);
+ }
+ }
+
+ return NULL;
+}
+
+static int
+t_rwspin(void)
+{
+ struct rws_test_data d;
+
+ /* Setup an array to sort */
+ for (int i=0; i<RWS_DATASIZE; i++)
+ d.data[i] = RWS_DATASIZE-i-1;
+
+ /* Spinlock for every place */
+ for (int i=0; i<RWS_DATASIZE; i++)
+ rws_init(&d.rws[i]);
+
+ /* Start the threads */
+ pthread_t thr[RWS_THREADS];
+ for (int i=0; i<RWS_THREADS; i++)
+ bt_assert(pthread_create(&thr[i], NULL, rwspin_thread_run, &d) == 0);
+
+ /* Wait for the threads */
+ for (int i=0; i<RWS_THREADS; i++)
+ bt_assert(pthread_join(thr[i], NULL) == 0);
+
+ for (int i=0; i<RWS_DATASIZE; i++)
+ bt_assert(d.data[i] == i);
+
+ return 1;
+}
+
+
+int
+main(int argc, char **argv)
+{
+ bt_init(argc, argv);
+ bt_bird_init();
+
+ bt_test_suite(t_locking, "Testing locks");
+ bt_test_suite(t_rwspin, "Testing rw spinlock");
+
+ return bt_exit_value();
+}
--- /dev/null
- static _Bool
+/*
+ * BIRD Internet Routing Daemon -- Semi-global index of nets
+ *
+ * (c) 2023 Maria Matejka <mq@jmq.cz>
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#include "lib/birdlib.h"
+#include "lib/netindex_private.h"
+
+#define NETINDEX_INIT_BLOCK_SIZE 128
+
+#define NETINDEX_KEY(n) (n)->hash, (n)->addr
+#define NETINDEX_NEXT(n) (n)->next
+#define NETINDEX_EQ(h,n,i,o) ((h == i) && net_equal(n,o))
+#define NETINDEX_FN(h,n) (h)
+#define NETINDEX_ORDER 12 /* Initial */
+
+#define NETINDEX_REHASH netindex_rehash
+#define NETINDEX_PARAMS /8, *2, 2, 2, 12, 28
+
+static void NETINDEX_REHASH(void *_v) {
+ log(L_TRACE "Netindex rehash: begin");
+ netindex_spinhash *v = _v;
+ int step;
+ SPINHASH_REHASH_PREPARE(v,NETINDEX,struct netindex,step);
+
+ log(L_TRACE "Netindex rehash: step=%d", step);
+ if (!step) return;
+
+ if (step > 0) SPINHASH_REHASH_UP(v,NETINDEX,struct netindex,step);
+ if (step < 0) SPINHASH_REHASH_DOWN(v,NETINDEX,struct netindex,-step);
+
+ log(L_TRACE "Netindex rehash: time to finish");
+ SPINHASH_REHASH_FINISH(v,NETINDEX);
+ log(L_TRACE "Netindex rehash: done");
+}
+
+static void netindex_hash_cleanup(void *netindex_hash);
+
+static struct netindex *
+net_lock_revive_unlock(netindex_hash *h, struct netindex *i)
+{
+ if (!i)
+ return NULL;
+
+ lfuc_lock_revive(&i->uc);
+ lfuc_unlock(&i->uc, h->cleanup_list, &h->cleanup_event);
+ return i;
+}
+
+/*
+ * Index initialization
+ */
+netindex_hash *
+netindex_hash_new(pool *sp, event_list *cleanup_target, u8 type)
+{
+ DOMAIN(attrs) dom = DOMAIN_NEW_RCU_SYNC(attrs);
+ LOCK_DOMAIN(attrs, dom);
+
+ pool *p = rp_new(sp, dom.attrs, "Network index");
+
+ struct netindex_hash_private *nh = mb_allocz(p, sizeof *nh);
+ nh->lock = dom;
+ nh->pool = p;
+ nh->net_type = type;
+
+ nh->slab = net_addr_length[type] ? sl_new(nh->pool, sizeof (struct netindex) + net_addr_length[type]) : NULL;
+
+ SPINHASH_INIT(nh->hash, NETINDEX, nh->pool, cleanup_target);
+ atomic_store_explicit(&nh->block_size, NETINDEX_INIT_BLOCK_SIZE, memory_order_release);
+ atomic_store_explicit(&nh->block,
+ mb_allocz(nh->pool, NETINDEX_INIT_BLOCK_SIZE * sizeof *nh->block),
+ memory_order_release);
+
+ hmap_init(&nh->id_map, nh->pool, 128);
+
+ nh->cleanup_list = cleanup_target;
+  nh->cleanup_event = (event) { .hook = netindex_hash_cleanup, .data = nh };
+
+ UNLOCK_DOMAIN(attrs, dom);
+ return SKIP_BACK(netindex_hash, priv, nh);
+}
+
+static uint
+netindex_hash_cleanup_removed(struct netindex_hash_private *nh, struct netindex * _Atomic *block, struct netindex **removed, uint cnt)
+{
+ synchronize_rcu();
+
+ uint kept = 0;
+ for (uint q = 0; q < cnt; q++)
+ {
+ struct netindex *ni = removed[q];
+
+    /* Now no reader can possibly still have the old pointer,
+     * unless somebody found it in the meantime and ref'd it. */
+ if (!lfuc_finished(&ni->uc))
+ {
+ /* Collision, return the netindex back. */
+ ASSERT_DIE(NULL == atomic_exchange_explicit(&block[ni->index], ni, memory_order_acq_rel));
+ SPINHASH_INSERT(nh->hash, NETINDEX, ni);
+ kept++;
+ continue;
+ }
+
+ /* Now the netindex is definitely obsolete, we can free it */
+ hmap_clear(&nh->id_map, ni->index);
+
+ if (nh->slab)
+ sl_free(ni);
+ else
+ mb_free(ni);
+ }
+
+ return kept;
+}
+
+static void
+netindex_hash_cleanup(void *_nh)
+{
+ struct netindex_hash_private *nh = _nh;
+
+ DOMAIN(attrs) dom = nh->lock;
+ LOCK_DOMAIN(attrs, dom);
+
+ uint kept = 0;
+
+ uint bs = atomic_load_explicit(&nh->block_size, memory_order_relaxed);
+ struct netindex * _Atomic *block = atomic_load_explicit(&nh->block, memory_order_relaxed);
+
+#define REMOVED_MAX 256
+ struct netindex *removed[REMOVED_MAX];
+ uint removed_cnt = 0;
+
+ for (uint i = 0; i < bs; i++)
+ {
+ struct netindex *ni = atomic_load_explicit(&block[i], memory_order_acquire);
+ if (!ni)
+ continue;
+
+ /* We may use the acquired netindex pointer as we are
+ * the only process which deletes them */
+ ASSERT_DIE(i == ni->index);
+
+ /* Check finished */
+ if (!lfuc_finished(&ni->uc))
+ {
+ kept++;
+ continue;
+ }
+
+ /* Looks finished, try dropping */
+ ASSERT_DIE(ni == atomic_exchange_explicit(&block[i], NULL, memory_order_acq_rel));
+ SPINHASH_REMOVE(nh->hash, NETINDEX, ni);
+
+ /* Store into the removed-block */
+ removed[removed_cnt++] = ni;
+
+ /* If removed-block is full, flush it */
+ if (removed_cnt == REMOVED_MAX)
+ {
+ kept += netindex_hash_cleanup_removed(nh, block, removed, removed_cnt);
+ removed_cnt = 0;
+ }
+ }
+
+ /* Flush remaining netindexes */
+ if (removed_cnt)
+ kept += netindex_hash_cleanup_removed(nh, block, removed, removed_cnt);
+
+ /* Return now unless we're deleted */
+ if (kept || !nh->deleted_event)
+ {
+ UNLOCK_DOMAIN(attrs, dom);
+ return;
+ }
+
+ ev_postpone(&nh->cleanup_event);
+
+ event *e = nh->deleted_event;
+ event_list *t = nh->deleted_target;
+
+ /* Check cleanliness */
+ SPINHASH_WALK(nh->hash, NETINDEX, i)
+ bug("Stray netindex in deleted hash");
+ SPINHASH_WALK_END;
+
+ /* Cleanup the spinhash itself */
+ SPINHASH_FREE(nh->hash);
+
+ /* Pool free is enough to drop everything else */
+ rp_free(nh->pool);
+
+ /* And only the lock remains */
+ UNLOCK_DOMAIN(attrs, dom);
+ DOMAIN_FREE(attrs, dom);
+
+ /* Notify the requestor */
+ ev_send(t, e);
+}
+
+void
+netindex_hash_delete(netindex_hash *h, event *e, event_list *t)
+{
+ NH_LOCK(h, hp);
+
+ hp->deleted_event = e;
+ hp->deleted_target = t;
+
+ ev_send(hp->cleanup_list, &hp->cleanup_event);
+}
+
+/*
+ * Private index manipulation
+ */
+static struct netindex *
+net_find_index_fragile(netindex_hash *nh, const net_addr *n)
+{
+ ASSERT_DIE(n->type == nh->net_type);
+
+ u32 h = net_hash(n);
+ return SPINHASH_FIND(nh->hash, NETINDEX, h, n);
+}
+
++static bool
+net_validate_index(netindex_hash *h, struct netindex *ni)
+{
+ struct netindex * _Atomic *block = atomic_load_explicit(&h->block, memory_order_relaxed);
+ u32 bs = atomic_load_explicit(&h->block_size, memory_order_relaxed);
+
+ ASSERT_DIE(ni->index < bs);
+ struct netindex *bni = atomic_load_explicit(&block[ni->index], memory_order_acquire);
+ return (bni == ni);
+}
+
+static struct netindex *
+net_new_index_locked(struct netindex_hash_private *hp, const net_addr *n)
+{
+ ASSERT_DIE(!hp->deleted_event);
+
+ u32 i = hmap_first_zero(&hp->id_map);
+ hmap_set(&hp->id_map, i);
+
+ struct netindex *ni = hp->slab ?
+ sl_alloc(hp->slab) :
+ mb_alloc(hp->pool, n->length + sizeof *ni);
+
+ *ni = (struct netindex) {
+ .hash = net_hash(n),
+ .index = i,
+ };
+ net_copy(ni->addr, n);
+
+ SPINHASH_INSERT(hp->hash, NETINDEX, ni);
+
+ struct netindex * _Atomic *block = atomic_load_explicit(&hp->block, memory_order_relaxed);
+ u32 bs = atomic_load_explicit(&hp->block_size, memory_order_relaxed);
+ u32 nbs = bs;
+ while (nbs <= i)
+ nbs *= 2;
+
+ if (nbs > bs)
+ {
+ struct netindex * _Atomic *nb = mb_alloc(hp->pool, bs * 2 * sizeof *nb);
+ memcpy(nb, block, bs * sizeof *nb);
+ memset(&nb[bs], 0, (nbs - bs) * sizeof *nb);
+
+ ASSERT_DIE(block == atomic_exchange_explicit(&hp->block, nb, memory_order_acq_rel));
+ ASSERT_DIE(bs == atomic_exchange_explicit(&hp->block_size, nbs, memory_order_acq_rel));
+ synchronize_rcu();
+
+ mb_free(block);
+ block = nb;
+
+ hp->block_epoch++;
+ }
+
+ ASSERT_DIE(i < nbs);
+ atomic_store_explicit(&block[i], ni, memory_order_release);
+
+ return ni;
+}
+
+
+/*
+ * Public entry points
+ */
+
+void net_lock_index(netindex_hash *h UNUSED, struct netindex *i)
+{
+// log(L_TRACE "Lock index %p", i);
+ lfuc_lock(&i->uc);
+}
+
+void net_unlock_index(netindex_hash *h, struct netindex *i)
+{
+// log(L_TRACE "Unlock index %p", i);
+ lfuc_unlock(&i->uc, h->cleanup_list, &h->cleanup_event);
+}
+
+struct netindex *
+net_find_index(netindex_hash *h, const net_addr *n)
+{
+ RCU_ANCHOR(u);
+ struct netindex *ni = net_find_index_fragile(h, n);
+ return (ni && net_validate_index(h, ni)) ? net_lock_revive_unlock(h, ni) : NULL;
+}
+
+struct netindex *
+net_get_index(netindex_hash *h, const net_addr *n)
+{
+ struct netindex *ni = net_find_index(h, n);
+ if (ni) return ni;
+
+ NH_LOCK(h, hp);
+
+  /* Somebody may have added one in the meantime */
+ return net_lock_revive_unlock(h,
+ (net_find_index_fragile(h, n) ?:
+ net_new_index_locked(hp, n)));
+}
+
+struct netindex net_index_out_of_range;
+
+struct netindex *
+net_resolve_index(netindex_hash *h, u32 i)
+{
+ RCU_ANCHOR(u);
+
+ struct netindex * _Atomic *block = atomic_load_explicit(&h->block, memory_order_relaxed);
+ u32 bs = atomic_load_explicit(&h->block_size, memory_order_relaxed);
+
+ if (i >= bs)
+ return &net_index_out_of_range;
+
+ struct netindex *ni = atomic_load_explicit(&block[i], memory_order_acquire);
+ if (ni == NULL)
+ return NULL;
+
+ return net_lock_revive_unlock(h, ni);
+}
--- /dev/null
- _Bool critical = 0;
+/*
+ * BIRD Library -- Read-Copy-Update Basic Operations
+ *
+ * (c) 2021 Maria Matejka <mq@jmq.cz>
+ * (c) 2021 CZ.NIC z.s.p.o.
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ * Note: all the relevant patents shall be expired.
+ *
+ * Using the Supplementary Material for User-Level Implementations of Read-Copy-Update
+ * by Matthieu Desnoyers, Paul E. McKenney, Alan S. Stern, Michel R. Dagenais and Jonathan Walpole
+ * obtained from https://www.efficios.com/pub/rcu/urcu-supp-accepted.pdf
+ */
+
+#include "lib/rcu.h"
+#include "lib/io-loop.h"
+#include "lib/locking.h"
+
+_Atomic u64 rcu_global_phase = RCU_GP_PHASE;
+_Thread_local struct rcu_thread this_rcu_thread;
+_Thread_local uint rcu_blocked;
+
+static struct rcu_thread * _Atomic rcu_thread_list = NULL;
+
+static _Atomic uint rcu_thread_spinlock = 0;
+
+static int
+rcu_critical(struct rcu_thread *t, u64 phase)
+{
+  u64 val = atomic_load_explicit(&t->ctl, memory_order_acquire);
+ return
+ (val & RCU_NEST_MASK) /* Active */
+ && ((val & ~RCU_NEST_MASK) <= phase); /* In an older phase */
+}
+
+void
+synchronize_rcu(void)
+{
+ if (!rcu_blocked && (last_locked > &locking_stack.meta))
+ bug("Forbidden to synchronize RCU unless an appropriate lock is taken");
+
+ /* Increment phase */
+ u64 phase = atomic_fetch_add_explicit(&rcu_global_phase, RCU_GP_PHASE, memory_order_acq_rel);
+
+ while (1) {
+ /* Spinlock */
+ while (atomic_exchange_explicit(&rcu_thread_spinlock, 1, memory_order_acq_rel))
+ birdloop_yield();
+
+ /* Check all threads */
++ bool critical = 0;
+ for (struct rcu_thread * _Atomic *tp = &rcu_thread_list, *t;
+ t = atomic_load_explicit(tp, memory_order_acquire);
+ tp = &t->next)
+ /* Found a critical */
+ if (critical = rcu_critical(t, phase))
+ break;
+
+ /* Unlock */
+ ASSERT_DIE(atomic_exchange_explicit(&rcu_thread_spinlock, 0, memory_order_acq_rel));
+
+ /* Done if no critical */
+ if (!critical)
+ return;
+
+ /* Wait and retry if critical */
+ birdloop_yield();
+ }
+}
+
+void
+rcu_thread_start(void)
+{
+  /* Insert this thread into the thread list; no spinlock is needed */
+ struct rcu_thread *next = atomic_load_explicit(&rcu_thread_list, memory_order_acquire);
+ do atomic_store_explicit(&this_rcu_thread.next, next, memory_order_relaxed);
+ while (!atomic_compare_exchange_strong_explicit(
+ &rcu_thread_list, &next, &this_rcu_thread,
+ memory_order_acq_rel, memory_order_acquire));
+}
+
+void
+rcu_thread_stop(void)
+{
+ /* Spinlock */
+ while (atomic_exchange_explicit(&rcu_thread_spinlock, 1, memory_order_acq_rel))
+ birdloop_yield();
+
+ /* Find this thread */
+ for (struct rcu_thread * _Atomic *tp = &rcu_thread_list, *t;
+ t = atomic_load_explicit(tp, memory_order_acquire);
+ tp = &t->next)
+ if (t == &this_rcu_thread)
+ {
+ /* Remove this thread */
+ atomic_store_explicit(tp, atomic_load_explicit(&t->next, memory_order_acquire), memory_order_release);
+
+ /* Unlock and go */
+ ASSERT_DIE(atomic_exchange_explicit(&rcu_thread_spinlock, 0, memory_order_acq_rel));
+ return;
+ }
+
+ bug("Failed to find a stopped rcu thread");
+}
+
+void
+rcu_init(void)
+{
+ rcu_thread_start();
+}
--- /dev/null
- static inline _Bool rcu_read_active(void)
+/*
+ * BIRD Library -- Read-Copy-Update Basic Operations
+ *
+ * (c) 2021 Maria Matejka <mq@jmq.cz>
+ * (c) 2021 CZ.NIC z.s.p.o.
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ * Note: all the relevant patents shall be expired.
+ */
+
+#ifndef _BIRD_RCU_H_
+#define _BIRD_RCU_H_
+
+#include "lib/birdlib.h"
+#include "lib/lists.h"
+#include <stdatomic.h>
+
+#define RCU_GP_PHASE 0x100
+#define RCU_NEST_MASK (RCU_GP_PHASE-1)
+#define RCU_NEST_CNT 1
+
+extern _Atomic u64 rcu_global_phase;
+
+struct rcu_thread {
+ struct rcu_thread * _Atomic next;
+ u64 local_ctl;
+ _Atomic u64 ctl;
+};
+
+extern _Thread_local struct rcu_thread this_rcu_thread;
+extern _Thread_local uint rcu_blocked;
+
+static inline void rcu_read_lock(void)
+{
+ /* Increment the nesting counter */
+ atomic_store_explicit(&this_rcu_thread.ctl, (this_rcu_thread.local_ctl += RCU_NEST_CNT), memory_order_release);
+
+ /* Just nested */
+ u64 local_nest = this_rcu_thread.local_ctl & RCU_NEST_MASK;
+ if (local_nest > RCU_NEST_CNT)
+ return;
+
+ ASSUME(local_nest == RCU_NEST_CNT);
+
+ /* Update the phase */
+ u64 new = atomic_load_explicit(&rcu_global_phase, memory_order_acquire) + RCU_NEST_CNT;
+ atomic_store_explicit(&this_rcu_thread.ctl, new, memory_order_release);
+ this_rcu_thread.local_ctl = new;
+}
+
+static inline void rcu_read_unlock(void)
+{
+ /* Just decrement the nesting counter; when unlocked, nobody cares */
+ atomic_fetch_sub_explicit(&this_rcu_thread.ctl, RCU_NEST_CNT, memory_order_acq_rel);
+ this_rcu_thread.local_ctl--;
+}
+
++static inline bool rcu_read_active(void)
+{
+ return !!(this_rcu_thread.local_ctl & RCU_NEST_MASK);
+}
+
+void synchronize_rcu(void);
+
+/* Registering and unregistering a birdloop. To be called from birdloop implementation */
+void rcu_thread_start(void);
+void rcu_thread_stop(void);
+
+/* Run this from resource init */
+void rcu_init(void);
+
+#endif
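
For orientation, a minimal usage sketch of the API above, assuming both threads have already called rcu_thread_start() and that the writer holds a suitable lock (synchronize_rcu() asserts this); struct config, shared_cfg, reader_get_value(), writer_replace() and free_cfg are illustrative names, not part of the patch:

#include "lib/rcu.h"

struct config { int value; };
static struct config * _Atomic shared_cfg;

static int
reader_get_value(void)
{
  rcu_read_lock();                      /* enter a read-side critical section */
  struct config *c = atomic_load_explicit(&shared_cfg, memory_order_acquire);
  int v = c ? c->value : 0;
  rcu_read_unlock();                    /* leave it; the writer may now reclaim */
  return v;
}

static void
writer_replace(struct config *new_cfg, void (*free_cfg)(struct config *))
{
  struct config *old = atomic_exchange_explicit(&shared_cfg, new_cfg, memory_order_acq_rel);
  synchronize_rcu();                    /* wait until no reader can still see old */
  if (old)
    free_cfg(old);
}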
--- /dev/null
- _Bool seen = 0;
+/*
+ * BIRD Library -- Auto storage attribute cleanup test
+ *
+ * (c) 2023 Maria Matejka <mq@jmq.cz>
+ * (c) 2023 CZ.NIC z.s.p.o.
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#include "test/birdtest.h"
+
+#include "lib/rcu.h"
+#include "lib/io-loop.h"
+
+#include <pthread.h>
+
+#define WRITERS 3
+#define READERS 28
+
+#define WRITER_ROUNDS 20
+
+static struct block {
+ struct block * _Atomic next;
+ u64 value;
+} ball[WRITERS][WRITER_ROUNDS];
+
+static struct block *_Atomic bin;
+static _Atomic uint seen = 0;
+
+static void *
+t_rcu_basic_reader(void *_ UNUSED)
+{
+ rcu_thread_start();
+
+ while (atomic_load_explicit(&bin, memory_order_acquire) == NULL)
+ birdloop_yield();
+
+ atomic_fetch_add_explicit(&seen, 1, memory_order_release);
+
+ while (atomic_load_explicit(&bin, memory_order_acquire))
+ {
+ rcu_read_lock();
+
+ uint mod = 0;
+ for (struct block * _Atomic *bp = &bin, *b;
+ b = atomic_load_explicit(bp, memory_order_acquire);
+ bp = &b->next)
+ {
+ uint val = b->value % WRITERS + 1;
+ ASSERT_DIE(val > mod);
+ mod = val;
+ }
+
+ ASSERT_DIE(mod <= WRITERS);
+
+ rcu_read_unlock();
+ }
+
+ rcu_thread_stop();
+ return NULL;
+}
+
+static _Atomic uint spinlock = 0;
+
+static inline void
+spin_lock(void)
+{
+ while (atomic_exchange_explicit(&spinlock, 1, memory_order_acq_rel))
+ birdloop_yield();
+}
+
+static inline void
+spin_unlock(void)
+{
+ ASSERT_DIE(atomic_exchange_explicit(&spinlock, 0, memory_order_acq_rel));
+}
+
+static void *
+t_rcu_basic_writer(void *order_ptr)
+{
+ rcu_thread_start();
+
+ uint order = (uintptr_t) order_ptr;
+ struct block *cur = &ball[order][0];
+
+ /* Insert the object */
+ spin_lock();
+ for (struct block * _Atomic *bp = &bin; bp; )
+ {
+ struct block *b = atomic_load_explicit(bp, memory_order_acquire);
+ if (b && ((b->value % WRITERS) < order))
+ bp = &b->next;
+ else
+ {
+ ASSERT_DIE(cur->value == 0xbabababababababa);
+ cur->value = order;
+ atomic_store_explicit(&cur->next, b, memory_order_relaxed);
+ atomic_store_explicit(bp, cur, memory_order_release);
+ break;
+ }
+ }
+ spin_unlock();
+
+ /* Wait for readers */
+ while (atomic_load_explicit(&seen, memory_order_acquire) != READERS)
+ birdloop_yield();
+
+ /* Update the object */
+ for (uint i=1; i<WRITER_ROUNDS; i++)
+ {
+ struct block *next = &ball[order][i];
+ ASSERT_DIE(next->value == 0xbabababababababa);
+ next->value = order + i*WRITERS;
+
+ spin_lock();
- _Bool seen = 0;
++ bool seen = 0;
+ for (struct block * _Atomic *bp = &bin, *b;
+ b = atomic_load_explicit(bp, memory_order_acquire);
+ bp = &b->next)
+ if (b == cur)
+ {
+ struct block *link = atomic_load_explicit(&b->next, memory_order_relaxed);
+ atomic_store_explicit(&next->next, link, memory_order_relaxed);
+ atomic_store_explicit(bp, next, memory_order_release);
+ seen = 1;
+ break;
+ }
+ ASSERT_DIE(seen);
+ spin_unlock();
+
+ synchronize_rcu();
+
+ ASSERT_DIE(cur->value + WRITERS == next->value);
+ cur->value = 0xd4d4d4d4d4d4d4d4;
+ atomic_store_explicit(&cur->next, ((void *) 0xd8d8d8d8d8d8d8d8), memory_order_relaxed);
+
+ cur = next;
+ }
+
+ /* Remove the object */
+ spin_lock();
++ bool seen = 0;
+ for (struct block * _Atomic *bp = &bin, *b;
+ b = atomic_load_explicit(bp, memory_order_acquire);
+ bp = &b->next)
+ if (b == cur)
+ {
+ struct block *link = atomic_load_explicit(&b->next, memory_order_relaxed);
+ atomic_store_explicit(bp, link, memory_order_relaxed);
+ seen = 1;
+ break;
+ }
+ ASSERT_DIE(seen);
+ spin_unlock();
+
+ synchronize_rcu();
+
+ cur->value = 0xd4d4d4d4d4d4d4d4;
+ atomic_store_explicit(&cur->next, ((void *) 0xd8d8d8d8d8d8d8d8), memory_order_relaxed);
+
+ rcu_thread_stop();
+ return NULL;
+}
+
+static int
+t_rcu_basic(void)
+{
+ memset(ball, 0xba, sizeof ball);
+
+ pthread_t readers[READERS];
+ pthread_t writers[WRITERS];
+
+ for (uint i=0; i<READERS; i++)
+ pthread_create(&readers[i], NULL, t_rcu_basic_reader, NULL);
+
+ for (uintptr_t i=0; i<WRITERS; i++)
+ pthread_create(&writers[i], NULL, t_rcu_basic_writer, (void *) i);
+
+ for (uintptr_t i=0; i<WRITERS; i++)
+ pthread_join(writers[i], NULL);
+
+ for (uintptr_t i=0; i<READERS; i++)
+ pthread_join(readers[i], NULL);
+
+ for (uint w = 0; w < WRITERS; w++)
+ for (uint r = 0; r < WRITER_ROUNDS; r++)
+ {
+ ASSERT_DIE(ball[w][r].value == 0xd4d4d4d4d4d4d4d4);
+ ASSERT_DIE(atomic_load_explicit(&ball[w][r].next, memory_order_relaxed) == (void *) 0xd8d8d8d8d8d8d8d8);
+ }
+
+ return 1;
+}
+
+int main(int argc, char **argv)
+{
+ bt_init(argc, argv);
+
+ bt_test_suite(t_rcu_basic, "Basic RCU check");
+
+ return bt_exit_value();
+}
--- /dev/null
- ea_unset_attr(ea_list **to, _Bool local, const struct ea_class *def)
+/*
+ * BIRD Internet Routing Daemon -- Routing data structures
+ *
+ * (c) 1998--2000 Martin Mares <mj@ucw.cz>
+ * (c) 2022 Maria Matejka <mq@jmq.cz>
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#ifndef _BIRD_LIB_ROUTE_H_
+#define _BIRD_LIB_ROUTE_H_
+
+#undef RT_SOURCE_DEBUG
+
+#include "lib/type.h"
+#include "lib/rcu.h"
+#include "lib/hash.h"
+#include "lib/event.h"
+#include "lib/lockfree.h"
+
+struct network;
+struct proto;
+struct cli;
+struct rtable_private;
+struct rte_storage;
+
+#define RTE_IN_TABLE_WRITABLE \
+ byte pflags; /* Protocol-specific flags; may change in-table (!) */ \
+ byte flags; /* Table-specific flags */ \
+ u8 stale_cycle; /* Auxiliary value for route refresh; may change in-table (!) */ \
+
+typedef struct rte {
+ RTE_IN_TABLE_WRITABLE;
+ u8 generation; /* If this route import is based on another previously exported route,
+ this value should be 1 + MAX(generation of the parent routes).
+ Otherwise the route is independent and this value is zero. */
+ u32 id; /* Table specific route id */
+ struct ea_list *attrs; /* Attributes of this route */
+ const net_addr *net; /* Network this RTE belongs to */
+ struct rte_src *src; /* Route source that created the route */
+ struct rt_import_hook *sender; /* Import hook used to send the route to the routing table */
+ btime lastmod; /* Last modified (set by table) */
+} rte;
+
+#define REF_FILTERED 2 /* Route is rejected by import filter */
+#define REF_OBSOLETE 16 /* Route is obsolete, pending propagation */
+#define REF_PENDING 32 /* Route has not propagated completely yet */
+
+/* Route is valid for propagation (may depend on other flags in the future), accepts NULL */
+static inline int rte_is_valid(const rte *r) { return r && !(r->flags & REF_FILTERED); }
+
+/* Route just has REF_FILTERED flag */
+static inline int rte_is_filtered(const rte *r) { return !!(r->flags & REF_FILTERED); }
+
+/* Strip the route of the table-specific values */
+static inline rte rte_init_from(const rte *r)
+{
+ return (rte) {
+ .attrs = r->attrs,
+ .net = r->net,
+ .src = r->src,
+ };
+}
+
+int rte_same(const rte *, const rte *);
+
+struct rte_src {
+ struct rte_src *next; /* Hash chain */
+ struct rte_owner *owner; /* Route source owner */
+ u64 private_id; /* Private ID, assigned by the protocol */
+ u32 global_id; /* Globally unique ID of the source */
+ struct lfuc uc; /* Use count */
+};
+
+struct rte_owner_class {
+ void (*get_route_info)(const rte *, byte *buf); /* Get route information (for `show route' command) */
+ int (*rte_better)(const rte *, const rte *);
+ int (*rte_mergable)(const rte *, const rte *);
+ u32 (*rte_igp_metric)(const rte *);
+};
+
+struct rte_owner {
+ struct rte_owner_class *class;
+ int (*rte_recalculate)(struct rtable_private *, struct network *, struct rte_storage *new, struct rte_storage *, struct rte_storage *);
+ HASH(struct rte_src) hash;
+ const char *name;
+ u32 hash_key;
+ u32 uc;
+ u32 debug;
+ event_list *list;
+ event *prune;
+ event *stop;
+};
+
+extern DOMAIN(attrs) attrs_domain;
+
+#define RTA_LOCK LOCK_DOMAIN(attrs, attrs_domain)
+#define RTA_UNLOCK UNLOCK_DOMAIN(attrs, attrs_domain)
+
+#define RTE_SRC_PU_SHIFT 44
+#define RTE_SRC_IN_PROGRESS (1ULL << RTE_SRC_PU_SHIFT)
+
+/* Get a route source. This also locks the source, so the caller has to
+ * unlock it after the route has been propagated. */
+struct rte_src *rt_get_source_o(struct rte_owner *o, u32 id);
+#define rt_get_source(p, id) rt_get_source_o(&(p)->sources, (id))
+
+struct rte_src *rt_find_source_global(u32 id);
+
+#ifdef RT_SOURCE_DEBUG
+#define rt_lock_source _rt_lock_source_internal
+#define rt_unlock_source _rt_unlock_source_internal
+#endif
+
+static inline void rt_lock_source(struct rte_src *src)
+{
+ lfuc_lock(&src->uc);
+}
+
+static inline void rt_unlock_source(struct rte_src *src)
+{
+ lfuc_unlock(&src->uc, src->owner->list, src->owner->prune);
+}
+
+#ifdef RT_SOURCE_DEBUG
+#undef rt_lock_source
+#undef rt_unlock_source
+
+#define rt_lock_source(x) ( log(L_INFO "Lock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_lock_source_internal(x) )
+#define rt_unlock_source(x) ( log(L_INFO "Unlock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_unlock_source_internal(x) )
+#endif
+
+void rt_init_sources(struct rte_owner *, const char *name, event_list *list);
+void rt_destroy_sources(struct rte_owner *, event *);
+
+void rt_dump_sources(struct rte_owner *);
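
A minimal sketch of the lock/unlock contract described above; struct my_proto (assumed to embed a struct rte_owner named sources, as the rt_get_source() wrapper expects), announce_one() and its arguments are illustrative, and rte_update() is the same call used by channel_do_reload() further below:

static void
announce_one(struct my_proto *p, struct channel *c,
             const net_addr *n, struct ea_list *attrs, u32 private_id)
{
  struct rte_src *src = rt_get_source(p, private_id);   /* comes back locked */

  rte e = { .attrs = attrs, .net = n, .src = src };
  rte_update(c, n, &e, src);                            /* hand the route over to the table */

  rt_unlock_source(src);                                /* drop the reference after propagation */
}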
+
+/*
+ * Route Attributes
+ *
+ * Beware: All standard BGP attributes must be represented here instead
+ * of making them local to the route. This is needed to ensure proper
+ * construction of BGP route attribute lists.
+ */
+
+/* Nexthop structure */
+struct nexthop {
+ ip_addr gw; /* Next hop */
+ struct iface *iface; /* Outgoing interface */
+ byte flags;
+ byte weight;
+ byte labels; /* Number of all labels */
+ u32 label[0];
+};
+
+/* For packing one into eattrs */
+struct nexthop_adata {
+ struct adata ad;
+ /* There is either a set of nexthops or a special destination (RTD_*) */
+ union {
+ struct nexthop nh;
+ uint dest;
+ };
+};
+
+/* For MPLS label stack generation */
+struct nexthop_adata_mpls {
+ struct nexthop_adata nhad;
+ u32 label_space[MPLS_MAX_LABEL_STACK];
+};
+
+#define NEXTHOP_DEST_SIZE (OFFSETOF(struct nexthop_adata, dest) + sizeof(uint) - OFFSETOF(struct adata, data))
+#define NEXTHOP_DEST_LITERAL(x) ((struct nexthop_adata) { \
+ .ad.length = NEXTHOP_DEST_SIZE, .dest = (x), })
+
+#define RNF_ONLINK 0x1 /* Gateway is onlink regardless of IP ranges */
+
+
+#define RTS_STATIC 1 /* Normal static route */
+#define RTS_INHERIT 2 /* Route inherited from kernel */
+#define RTS_DEVICE 3 /* Device route */
+#define RTS_STATIC_DEVICE 4 /* Static device route */
+#define RTS_REDIRECT 5 /* Learned via redirect */
+#define RTS_RIP 6 /* RIP route */
+#define RTS_OSPF 7 /* OSPF route */
+#define RTS_OSPF_IA 8 /* OSPF inter-area route */
+#define RTS_OSPF_EXT1 9 /* OSPF external route type 1 */
+#define RTS_OSPF_EXT2 10 /* OSPF external route type 2 */
+#define RTS_BGP 11 /* BGP route */
+#define RTS_PIPE 12 /* Inter-table wormhole */
+#define RTS_BABEL 13 /* Babel route */
+#define RTS_RPKI 14 /* Route Origin Authorization */
+#define RTS_PERF 15 /* Perf checker */
+#define RTS_L3VPN 16 /* MPLS L3VPN */
+#define RTS_AGGREGATED 17 /* Aggregated route */
+#define RTS_MAX 18
+
+#define RTD_NONE 0 /* Undefined next hop */
+#define RTD_UNICAST 1 /* A standard next hop */
+#define RTD_BLACKHOLE 2 /* Silently drop packets */
+#define RTD_UNREACHABLE 3 /* Reject as unreachable */
+#define RTD_PROHIBIT 4 /* Administratively prohibited */
+#define RTD_MAX 5
+
+extern const char * rta_dest_names[RTD_MAX];
+
+static inline const char *rta_dest_name(uint n)
+{ return (n < RTD_MAX) ? rta_dest_names[n] : "???"; }
+
+
+/*
+ * Extended Route Attributes
+ */
+
+typedef struct eattr {
+ word id; /* EA_CODE(PROTOCOL_..., protocol-dependent ID) */
+ byte flags; /* Protocol-dependent flags */
+ byte type; /* Attribute type */
+ byte rfu:5;
+ byte originated:1; /* The attribute has originated locally */
+ byte fresh:1; /* An uncached attribute (e.g. modified in export filter) */
+ byte undef:1; /* Explicitly undefined */
+
+ PADDING(unused, 3, 3);
+
+ union bval u;
+} eattr;
+
+
+#define EA_CODE_MASK 0xffff
+#define EA_ALLOW_UNDEF 0x10000 /* ea_find: allow EAF_TYPE_UNDEF */
+#define EA_BIT(n) ((n) << 24) /* Used in bitfield accessors */
+#define EA_BIT_GET(ea) ((ea) >> 24)
+
+typedef struct ea_list {
+ struct ea_list *next; /* In case we have an override list */
+ byte flags; /* Flags: EALF_... */
+ byte stored:5; /* enum ea_stored */
+ byte rfu:3;
+ word count; /* Number of attributes */
+ eattr attrs[0]; /* Attribute definitions themselves */
+} ea_list;
+
+enum ea_stored {
+ EALS_NONE = 0, /* This is a temporary ea_list */
+ EALS_PREIMPORT = 1, /* State when route entered rte_update() */
+ EALS_FILTERED = 2, /* State after filters */
+ EALS_IN_TABLE = 3, /* State in table */
+ EALS_KEY = 4, /* EA list used as key */
+ EALS_CUSTOM = 0x10, /* OR this with custom values */
+ EALS_MAX = 0x20,
+};
+
+struct ea_storage {
+ struct ea_storage *next_hash; /* Next in hash chain */
+ _Atomic u64 uc; /* Use count */
+ u32 hash_key; /* List hash */
+ PADDING(unused, 0, 4); /* Sorry, we need u64 for the usecount */
+ ea_list l[0]; /* The list itself */
+};
+
+#define EALF_SORTED 1 /* Attributes are sorted by code */
+#define EALF_BISECT 2 /* Use interval bisection for searching */
+#define EALF_HUGE 8 /* List is too big to fit into slab */
+
+struct ea_class {
+#define EA_CLASS_INSIDE \
+ const char *name; /* Name (both print and filter) */ \
+ struct symbol *sym; /* Symbol to export to configs */ \
+ uint id; /* Autoassigned attribute ID */ \
+ uint uc; /* Reference count */ \
+ btype type; /* Data type ID */ \
+ u16 flags; /* Protocol-dependent flags */ \
+ uint readonly:1; /* This attribute can't be changed by filters */ \
+ uint conf:1; /* Requested by config */ \
+ uint hidden:1; /* Technical attribute, do not show, do not expose to filters */ \
+ void (*format)(const eattr *ea, byte *buf, uint size); \
+ void (*stored)(const eattr *ea); /* When stored into global hash */ \
+ void (*freed)(const eattr *ea); /* When released from global hash */ \
+
+ EA_CLASS_INSIDE;
+};
+
+struct ea_class_ref {
+ resource r;
+ struct ea_class *class;
+};
+
+void ea_register_init(struct ea_class *);
+struct ea_class_ref *ea_register_alloc(pool *, struct ea_class);
+struct ea_class_ref *ea_ref_class(pool *, struct ea_class *); /* Reference for an attribute alias */
+
+#define EA_REGISTER_ALL_HELPER(x) ea_register_init(x);
+#define EA_REGISTER_ALL(...) MACRO_FOREACH(EA_REGISTER_ALL_HELPER, __VA_ARGS__)
+
+struct ea_class *ea_class_find_by_id(uint id);
+struct ea_class *ea_class_find_by_name(const char *name);
+static inline struct ea_class *ea_class_self(struct ea_class *self) { return self; }
+#define ea_class_find(_arg) _Generic((_arg), \
+ uint: ea_class_find_by_id, \
+ word: ea_class_find_by_id, \
+ char *: ea_class_find_by_name, \
+ const char *: ea_class_find_by_name, \
+ struct ea_class *: ea_class_self)(_arg)
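
A sketch of how a protocol might declare and register its own attribute class; the class name and the T_INT btype are assumptions for illustration only:

static struct ea_class ea_myproto_metric = {
  .name = "myproto_metric",
  .type = T_INT,                /* an embedded btype from lib/type.h (assumed here) */
};

static void
myproto_register_attrs(void)
{
  ea_register_init(&ea_myproto_metric);   /* fills in the autoassigned .id */

  /* Later lookups may go by id, by name or by the class pointer itself */
  ASSERT_DIE(ea_class_find_by_name("myproto_metric") == &ea_myproto_metric);
}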
+
+struct ea_walk_state {
+ ea_list *eattrs; /* Current ea_list, initially set by caller */
+ eattr *ea; /* Current eattr, initially NULL */
+ u32 visited[4]; /* Bitfield, limiting max to 128 */
+};
+
+#define ea_find(_l, _arg) _Generic((_arg), uint: ea_find_by_id, struct ea_class *: ea_find_by_class, char *: ea_find_by_name)(_l, _arg)
+eattr *ea_find_by_id(ea_list *, unsigned ea);
+static inline eattr *ea_find_by_class(ea_list *l, const struct ea_class *def)
+{ return ea_find_by_id(l, def->id); }
+static inline eattr *ea_find_by_name(ea_list *l, const char *name)
+{
+ const struct ea_class *def = ea_class_find_by_name(name);
+ return def ? ea_find_by_class(l, def) : NULL;
+}
+
+#define ea_get_int(_l, _ident, _def) ({ \
+ struct ea_class *cls = ea_class_find((_ident)); \
+ ASSERT_DIE(cls->type & EAF_EMBEDDED); \
+ const eattr *ea = ea_find((_l), cls->id); \
+ (ea ? ea->u.data : (_def)); \
+ })
+
+#define ea_get_ip(_l, _ident, _def) ({ \
+ struct ea_class *cls = ea_class_find((_ident)); \
+ ASSERT_DIE(cls->type == T_IP); \
+ const eattr *ea = ea_find((_l), cls->id); \
+ (ea ? *((const ip_addr *) ea->u.ptr->data) : (_def)); \
+ })
+
+#define ea_get_adata(_l, _ident) ({ \
+ struct ea_class *cls = ea_class_find((_ident)); \
+ ASSERT_DIE(!(cls->type & EAF_EMBEDDED)); \
+ const eattr *ea = ea_find((_l), cls->id); \
+ (ea ? ea->u.ptr : &null_adata); \
+ })
+
+eattr *ea_walk(struct ea_walk_state *s, uint id, uint max);
+void ea_dump(ea_list *);
+int ea_same(ea_list *x, ea_list *y); /* Test whether two ea_lists are identical */
+uint ea_hash(ea_list *e); /* Calculate attributes hash value */
+ea_list *ea_append(ea_list *to, ea_list *what);
+void ea_format_bitfield(const struct eattr *a, byte *buf, int bufsize, const char **names, int min, int max);
+
+/* Normalize ea_list; allocates the result from tmp_linpool */
+ea_list *ea_normalize(ea_list *e, u32 upto);
+
+uint ea_list_size(ea_list *);
+void ea_list_copy(ea_list *dest, ea_list *src, uint size);
+
+#define EA_LOCAL_LIST(N) struct { ea_list l; eattr a[N]; }
+
+#define EA_LITERAL_EMBEDDED(_class, _flags, _val) ({ \
+ btype _type = (_class)->type; \
+ ASSERT_DIE(_type & EAF_EMBEDDED); \
+ EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.i = _val); \
+ })
+
+#define EA_LITERAL_STORE_ADATA(_class, _flags, _buf, _len) ({ \
+ btype _type = (_class)->type; \
+ ASSERT_DIE(!(_type & EAF_EMBEDDED)); \
+ EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.ad = tmp_store_adata((_buf), (_len))); \
+ })
+
+#define EA_LITERAL_DIRECT_ADATA(_class, _flags, _adata) ({ \
+ btype _type = (_class)->type; \
+ ASSERT_DIE(!(_type & EAF_EMBEDDED)); \
+ EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.ad = _adata); \
+ })
+
+#define EA_LITERAL_GENERIC(_id, _type, _flags, ...) \
+ ((eattr) { .id = _id, .type = _type, .flags = _flags, __VA_ARGS__ })
+
+static inline eattr *
+ea_set_attr(ea_list **to, eattr a)
+{
+ if (!a.id)
+ bug("You have forgotten to register your EA class");
+
+ EA_LOCAL_LIST(1) *ea = tmp_alloc(sizeof(*ea));
+ *ea = (typeof(*ea)) {
+ .l.flags = EALF_SORTED,
+ .l.count = 1,
+ .l.next = *to,
+ .a[0] = a,
+ };
+
+ *to = &ea->l;
+ return &ea->a[0];
+}
+
+static inline void
++ea_unset_attr(ea_list **to, bool local, const struct ea_class *def)
+{
+ ea_set_attr(to, EA_LITERAL_GENERIC(def->id, 0, 0,
+ .fresh = local, .originated = local, .undef = 1));
+}
+
+static inline void
+ea_set_attr_u32(ea_list **to, const struct ea_class *def, uint flags, u64 data)
+{ ea_set_attr(to, EA_LITERAL_EMBEDDED(def, flags, data)); }
+
+static inline void
+ea_set_attr_data(ea_list **to, const struct ea_class *def, uint flags, const void *data, uint len)
+{ ea_set_attr(to, EA_LITERAL_STORE_ADATA(def, flags, data, len)); }
+
+static inline void
+ea_copy_attr(ea_list **to, ea_list *from, const struct ea_class *def)
+{
+ eattr *e = ea_find_by_class(from, def);
+ if (e)
+ if (e->type & EAF_EMBEDDED)
+ ea_set_attr_u32(to, def, e->flags, e->u.data);
+ else
+ ea_set_attr_data(to, def, e->flags, e->u.ptr->data, e->u.ptr->length);
+ else
+ ea_unset_attr(to, 0, def);
+}
+
+/*
+ * Common route attributes
+ */
+
+/* Preference: first-order comparison */
+extern struct ea_class ea_gen_preference;
+static inline u32 rt_get_preference(const rte *rt)
+{ return ea_get_int(rt->attrs, &ea_gen_preference, 0); }
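
A minimal round trip over the setters and getters above, assumed to run in a context where the tmp_linpool is available (ea_set_attr() allocates the new layer from it); example_preference_roundtrip() is an illustrative name:

static u32
example_preference_roundtrip(void)
{
  ea_list *attrs = NULL;

  ea_set_attr_u32(&attrs, &ea_gen_preference, 0, 100);  /* prepend a one-attribute layer */

  return ea_get_int(attrs, &ea_gen_preference, 0);      /* read the embedded value back: 100 */
}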
+
+/* IGP metric: second-order comparison */
+extern struct ea_class ea_gen_igp_metric;
+u32 rt_get_igp_metric(const rte *rt);
+#define IGP_METRIC_UNKNOWN 0x80000000 /* Default igp_metric used when no other
+ protocol-specific metric is available */
+
+/* From: Advertising router */
+extern struct ea_class ea_gen_from;
+
+
+/* MPLS Label, Policy and Class */
+extern struct ea_class ea_gen_mpls_label,
+ ea_gen_mpls_policy, ea_gen_mpls_class;
+
+
+/* Source: An old way to determine the route source protocol and kind.
+ * To be superseded in the near future by something more informative. */
+extern struct ea_class ea_gen_source;
+static inline u32 rt_get_source_attr(const rte *rt)
+{ return ea_get_int(rt->attrs, &ea_gen_source, 0); }
+
+/* Flowspec validation result */
+enum flowspec_valid {
+ FLOWSPEC_UNKNOWN = 0,
+ FLOWSPEC_VALID = 1,
+ FLOWSPEC_INVALID = 2,
+ FLOWSPEC__MAX,
+};
+
+extern const char * flowspec_valid_names[FLOWSPEC__MAX];
+static inline const char *flowspec_valid_name(enum flowspec_valid v)
+{ return (v < FLOWSPEC__MAX) ? flowspec_valid_names[v] : "???"; }
+
+extern struct ea_class ea_gen_flowspec_valid;
+static inline enum flowspec_valid rt_get_flowspec_valid(const rte *rt)
+{ return ea_get_int(rt->attrs, &ea_gen_flowspec_valid, FLOWSPEC_UNKNOWN); }
+
+/* Next hop: For now, stored as adata */
+extern struct ea_class ea_gen_nexthop;
+
+static inline void ea_set_dest(struct ea_list **to, uint flags, uint dest)
+{
+ struct nexthop_adata nhad = NEXTHOP_DEST_LITERAL(dest);
+ ea_set_attr_data(to, &ea_gen_nexthop, flags, &nhad.ad.data, nhad.ad.length);
+}
+
+/* Next hop structures */
+
+#define NEXTHOP_ALIGNMENT (_Alignof(struct nexthop))
+#define NEXTHOP_MAX_SIZE (sizeof(struct nexthop) + sizeof(u32)*MPLS_MAX_LABEL_STACK)
+#define NEXTHOP_SIZE(_nh) NEXTHOP_SIZE_CNT(((_nh)->labels))
+#define NEXTHOP_SIZE_CNT(cnt) BIRD_ALIGN((sizeof(struct nexthop) + sizeof(u32) * (cnt)), NEXTHOP_ALIGNMENT)
+#define nexthop_size(nh) NEXTHOP_SIZE((nh))
+
+#define NEXTHOP_NEXT(_nh) ((void *) (_nh) + NEXTHOP_SIZE(_nh))
+#define NEXTHOP_END(_nhad) ((_nhad)->ad.data + (_nhad)->ad.length)
+#define NEXTHOP_VALID(_nh, _nhad) ((void *) (_nh) < (void *) NEXTHOP_END(_nhad))
+#define NEXTHOP_ONE(_nhad) (NEXTHOP_NEXT(&(_nhad)->nh) == NEXTHOP_END(_nhad))
+
+#define NEXTHOP_WALK(_iter, _nhad) for ( \
+ struct nexthop *_iter = &(_nhad)->nh; \
+ (void *) _iter < (void *) NEXTHOP_END(_nhad); \
+ _iter = NEXTHOP_NEXT(_iter))
+
+
+static inline int nexthop_same(struct nexthop_adata *x, struct nexthop_adata *y)
+{ return adata_same(&x->ad, &y->ad); }
+struct nexthop_adata *nexthop_merge(struct nexthop_adata *x, struct nexthop_adata *y, int max, linpool *lp);
+struct nexthop_adata *nexthop_sort(struct nexthop_adata *x, linpool *lp);
+int nexthop_is_sorted(struct nexthop_adata *x);
+
+#define NEXTHOP_IS_REACHABLE(nhad) ((nhad)->ad.length > NEXTHOP_DEST_SIZE)
+
+static inline struct nexthop_adata *
+rte_get_nexthops(rte *r)
+{
+ eattr *nhea = ea_find(r->attrs, &ea_gen_nexthop);
+ return nhea ? SKIP_BACK(struct nexthop_adata, ad, nhea->u.ptr) : NULL;
+}
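
A small sketch combining rte_get_nexthops() with the NEXTHOP_WALK() iterator above; count_nexthops() is illustrative only:

static uint
count_nexthops(rte *r)
{
  struct nexthop_adata *nhad = rte_get_nexthops(r);
  if (!nhad || !NEXTHOP_IS_REACHABLE(nhad))
    return 0;                  /* no nexthop attribute, or a special destination (RTD_*) */

  uint cnt = 0;
  NEXTHOP_WALK(nh, nhad)       /* iterate the variable-length nexthop entries in the adata */
    cnt++;

  return cnt;
}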
+
+/* Route has a regular, reachable nexthop (i.e. not RTD_UNREACHABLE and the like) */
+static inline int rte_is_reachable(rte *r)
+{
+ struct nexthop_adata *nhad = rte_get_nexthops(r);
+ return nhad && NEXTHOP_IS_REACHABLE(nhad);
+}
+
+static inline int nhea_dest(eattr *nhea)
+{
+ if (!nhea)
+ return RTD_NONE;
+
+ struct nexthop_adata *nhad = nhea ? (struct nexthop_adata *) nhea->u.ptr : NULL;
+ if (NEXTHOP_IS_REACHABLE(nhad))
+ return RTD_UNICAST;
+ else
+ return nhad->dest;
+}
+
+static inline int rte_dest(const rte *r)
+{
+ return nhea_dest(ea_find(r->attrs, &ea_gen_nexthop));
+}
+
+void rta_init(void);
+
+ea_list *ea_lookup_slow(ea_list *r, u32 squash_upto, enum ea_stored oid);
+
+static inline struct ea_storage *ea_get_storage(ea_list *r)
+{
+ ASSERT_DIE(r->stored);
+ return SKIP_BACK(struct ea_storage, l[0], r);
+}
+
+static inline ea_list *ea_ref(ea_list *r)
+{
+ ASSERT_DIE(0 < atomic_fetch_add_explicit(&ea_get_storage(r)->uc, 1, memory_order_acq_rel));
+ return r;
+}
+
+static inline ea_list *ea_lookup(ea_list *r, u32 squash_upto, enum ea_stored oid)
+{
+ ASSERT_DIE(oid);
+ if ((r->stored == oid) || BIT32_TEST(&squash_upto, r->stored))
+ return ea_ref(r);
+ else
+ return ea_lookup_slow(r, squash_upto, oid);
+}
+
+struct ea_free_deferred {
+ struct deferred_call dc;
+ ea_list *attrs;
+};
+
+void ea_free_deferred(struct deferred_call *dc);
+
+static inline ea_list *ea_free_later(ea_list *r)
+{
+ if (!r)
+ return NULL;
+
+ struct ea_free_deferred efd = {
+ .dc.hook = ea_free_deferred,
+ .attrs = r,
+ };
+
+ defer_call(&efd.dc, sizeof efd);
+ return r;
+}
+
+#define ea_free ea_free_later
+
+static inline ea_list *ea_lookup_tmp(ea_list *r, u32 squash_upto, enum ea_stored oid)
+{
+ return ea_free_later(ea_lookup(r, squash_upto, oid));
+}
+
+static inline ea_list *ea_ref_tmp(ea_list *r)
+{
+ ASSERT_DIE(r->stored);
+ return ea_free_later(ea_ref(r));
+}
+
+static inline ea_list *ea_strip_to(ea_list *r, u32 strip_to)
+{
+ ASSERT_DIE(strip_to);
+ while (r && !BIT32_TEST(&strip_to, r->stored))
+ r = r->next;
+
+ return r;
+}
+
+void ea_dump(ea_list *);
+void ea_dump_all(void);
+void ea_show_list(struct cli *, ea_list *);
+
+#endif
sock *sock_new(pool *); /* Allocate new socket */
#define sk_new(X) sock_new(X) /* Wrapper to avoid name collision with OpenSSL */
-int sk_open(sock *); /* Open socket */
+int sk_open(sock *, struct birdloop *); /* Open socket */
+void sk_reloop(sock *, struct birdloop *); /* Move socket to another loop. Both loops must be locked. */
+static inline void sk_close(sock *s) { rfree(&s->r); } /* Explicitly close socket */
+
int sk_rx_ready(sock *s);
- _Bool sk_tx_pending(sock *s);
++bool sk_tx_pending(sock *s);
int sk_send(sock *, uint len); /* Send data, <0=err, >0=ok, 0=sleep */
int sk_send_to(sock *, uint len, ip_addr to, uint port); /* sk_send to given destination */
void sk_reallocate(sock *); /* Free and allocate tbuf & rbuf */
/* Remove old unused handles */
if (old_d && !old_d->label_count)
- mpls_free_handle(m->domain, old_d);
+ mpls_free_handle(old_d);
if (old_s && !old_s->label_count)
- mpls_free_handle(m->domain, old_s);
+ mpls_free_handle(old_s);
+}
+
+static void
+mpls_fec_map_cleanup(void *_m)
+{
+ struct mpls_fec_map *m = _m;
- _Bool finished = (m->channel->channel_state == CS_STOP);
++ bool finished = (m->channel->channel_state == CS_STOP);
+ HASH_WALK_DELSAFE(m->label_hash, next_l, fec)
+ if (lfuc_finished(&fec->uc))
+ mpls_free_fec(m, fec);
+ else
+ finished = 0;
+ HASH_WALK_DELSAFE_END;
+
+ DBGL("FEC Map %p Cleanup: %sfinished", m, finished ? "" : "not ");
+
+ if (finished)
+ {
+ ev_postpone(m->cleanup_event);
+ channel_del_obstacle(m->channel);
+ }
}
void
mpls_show_ranges_dom(cmd, cmd->domain->domain);
else
{
- struct mpls_domain *m;
- WALK_LIST(m, mpls_domains)
+ struct mpls_domain_pub *m;
- _Bool first = 1;
++ bool first = 1;
+ WALK_LIST(m, MPLS_GLOBAL->domains)
+ {
+ if (first)
+ first = 0;
+ else
+ cli_msg(-1026, "");
+
mpls_show_ranges_dom(cmd, m);
+ }
}
cli_msg(0, "");
}
static void
-channel_reload_loop(void *ptr)
+channel_check_stopped(struct channel *c)
{
- struct channel *c = ptr;
+ switch (c->channel_state)
+ {
+ case CS_STOP:
+ if (c->obstacles || !EMPTY_LIST(c->roa_subscriptions) || c->in_req.hook)
+ return;
- /* Start reload */
- if (!c->reload_active)
- c->reload_pending = 0;
+ ASSERT_DIE(rt_export_get_state(&c->out_req) == TES_DOWN);
+ ASSERT_DIE(!rt_export_feed_active(&c->reimporter));
- if (!rt_reload_channel(c))
- {
- ev_schedule_work(c->reload_event);
- return;
+ channel_set_state(c, CS_DOWN);
+ proto_send_event(c->proto, c->proto->event);
+
+ break;
+ case CS_PAUSE:
+ if (c->obstacles || !EMPTY_LIST(c->roa_subscriptions))
+ return;
+
+ ASSERT_DIE(rt_export_get_state(&c->out_req) == TES_DOWN);
+ ASSERT_DIE(!rt_export_feed_active(&c->reimporter));
+
+ channel_set_state(c, CS_START);
+ break;
}
- /* Restart reload */
- if (c->reload_pending)
- channel_request_reload(c);
+ DBG("%s.%s: Channel requests/hooks stopped (in state %s)\n", c->proto->name, c->name, c_states[c->channel_state]);
}
-static void
-channel_reset_import(struct channel *c)
+void
+channel_add_obstacle(struct channel *c)
{
- /* Need to abort feeding */
- ev_postpone(c->reload_event);
- rt_reload_channel_abort(c);
-
- rt_prune_sync(c->in_table, 1);
+ c->obstacles++;
}
-static void
-channel_reset_export(struct channel *c)
+void
+channel_del_obstacle(struct channel *c)
{
- /* Just free the routes */
- rt_prune_sync(c->out_table, 1);
+ if (!--c->obstacles)
+ channel_check_stopped(c);
}
-/* Called by protocol to activate in_table */
void
-channel_setup_in_table(struct channel *c)
+channel_import_stopped(struct rt_import_request *req)
{
- struct rtable_config *cf = mb_allocz(c->proto->pool, sizeof(struct rtable_config));
+ SKIP_BACK_DECLARE(struct channel, c, in_req, req);
- cf->name = "import";
- cf->addr_type = c->net_type;
- cf->internal = 1;
+ mb_free(c->in_req.name);
+ c->in_req.name = NULL;
- c->in_table = cf->table = rt_setup(c->proto->pool, cf);
+ bmap_free(&c->imported_map);
- c->reload_event = ev_new_init(c->proto->pool, channel_reload_loop, c);
+ channel_check_stopped(c);
}
-/* Called by protocol to activate out_table */
-void
-channel_setup_out_table(struct channel *c)
+static u32
+channel_reimport_next_feed_index(struct rt_export_feeder *f, u32 try_this)
{
- struct rtable_config *cf = mb_allocz(c->proto->pool, sizeof(struct rtable_config));
- cf->name = "export";
- cf->addr_type = c->net_type;
- cf->internal = 1;
+ SKIP_BACK_DECLARE(struct channel, c, reimporter, f);
+ while (!bmap_test(&c->imported_map, try_this))
+ if (!(try_this & (try_this - 1))) /* return every power of two to check for maximum */
+ return try_this;
+ else
+ try_this++;
+
+ return try_this;
+}
+
+static void
+channel_do_reload(void *_c)
+{
+ struct channel *c = _c;
+
+ RT_FEED_WALK(&c->reimporter, f)
+ {
- _Bool seen = 0;
++ bool seen = 0;
+ for (uint i = 0; i < f->count_routes; i++)
+ {
+ rte *r = &f->block[i];
+
+ if (r->flags & REF_OBSOLETE)
+ break;
+
+ if (r->sender == c->in_req.hook)
+ {
+ /* Strip the table-specific information */
+ rte new = rte_init_from(r);
+
+ /* Strip the later attribute layers */
+ new.attrs = ea_strip_to(new.attrs, BIT32_ALL(EALS_PREIMPORT));
+
+ /* And reload the route */
+ rte_update(c, r->net, &new, new.src);
- c->out_table = rt_setup(c->proto->pool, cf);
+ seen = 1;
+ }
+ }
+
+ if (!seen)
+ bmap_clear(&c->imported_map, f->ni->index);
+
+ /* Local data needed no more */
+ tmp_flush();
+
+ MAYBE_DEFER_TASK(proto_work_list(c->proto), &c->reimport_event,
+ "%s.%s reimport", c->proto->name, c->name);
+ }
+}
+
+/* Called by protocol to activate in_table */
+static void
+channel_setup_in_table(struct channel *c)
+{
+ c->reimporter = (struct rt_export_feeder) {
+ .name = mb_sprintf(c->proto->pool, "%s.%s.reimport", c->proto->name, c->name),
+ .trace_routes = c->debug,
+ .next_feed_index = channel_reimport_next_feed_index,
+ };
+ c->reimport_event = (event) {
+ .hook = channel_do_reload,
+ .data = c,
+ };
+ rt_feeder_subscribe(&c->table->export_all, &c->reimporter);
}
struct cli;
/*
- * Generic data structure for storing network prefixes. Also used
- * for the master routing table. Currently implemented as a hash
- * table.
+ * Master Routing Tables. Generally speaking, each of them contains a FIB
+ * with each entry pointing to a list of route entries representing routes
+ * to a given network (with the selected one at the head).
+ *
+ * Each RTE contains variable data (the preference and protocol-dependent
+ * metrics) and a pointer to a route attribute block shared by many routes.
*
- * Available operations:
- * - insertion of new entry
- * - deletion of entry
- * - searching for entry by network prefix
- * - asynchronous retrieval of fib contents
+ * It's guaranteed that there is at most one RTE for every (prefix,proto) pair.
*/
-struct fib_node {
- struct fib_node *next; /* Next in hash chain */
- struct fib_iterator *readers; /* List of readers of this node */
- net_addr addr[0];
+struct rtable_config {
+ node n;
+ char *name;
+ union rtable *table;
+ struct proto_config *krt_attached; /* Kernel syncer attached to this table */
+ uint addr_type; /* Type of address data stored in table (NET_*) */
+ uint gc_threshold; /* Maximum number of operations before GC is run */
+ uint gc_period; /* Approximate time between two consecutive GC runs */
+ u32 debug; /* Debugging flags (D_*) */
+ byte sorted; /* Routes of network are sorted according to rte_better() */
+ byte trie_used; /* Rtable has attached trie */
+ struct rt_cork_threshold {
+ u64 low, high;
+ } cork_threshold; /* Cork threshold values */
+ struct settle_config export_settle; /* Export announcement settler */
+ struct settle_config export_rr_settle;/* Export announcement settler config valid when any
+ route refresh is running */
+ struct settle_config digest_settle; /* Settle times for digests */
+ struct rtable_config *roa_aux_table; /* Auxiliary table config for ROA connections */
+ struct rt_stream_config {
+ struct rtable_config *src;
+ void (*setup)(union rtable *);
+ void (*stop)(union rtable *);
+ } master; /* Data source (this table is aux) */
};
-struct fib_iterator { /* See lib/slists.h for an explanation */
- struct fib_iterator *prev, *next; /* Must be synced with struct fib_node! */
- byte efef; /* 0xff to distinguish between iterator and node */
- byte pad[3];
- struct fib_node *node; /* Or NULL if freshly merged */
- uint hash;
+/*
+ * Route export journal
+ *
+ * The journal itself is held in struct rt_exporter.
+ * Workflow:
+ * (1) Initialize by rt_exporter_init()
+ * (2) Push data by rt_exporter_push() (the export item is copied)
+ * (3) Shutdown by rt_exporter_shutdown(), event is called after cleanup
+ *
+ * Subscribers:
+ * (1) Initialize by rt_export_subscribe()
+ * (2a) Get data by rt_export_get();
+ * (2b) Release data after processing by rt_export_release()
+ * (3) Request refeed by rt_export_refeed()
+ * (4) Unsubscribe by rt_export_unsubscribe()
+ */
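
A sketch of the subscriber side of this workflow, using the export_all stream and the rt_export_subscribe()/rt_export_unsubscribe() wrappers defined further below; consume_exports() is illustrative, and a real request would also carry a pool, trace flags and the dump/fed hooks:

static void
consume_exports(rtable *tab, struct rt_export_request *req)
{
  rt_export_subscribe(tab, all, req);        /* (1) lock the table and hook the request in */

  const struct rt_export_union *u;
  while ((u = rt_export_get(req)))           /* (2a) next feed or update, NULL when caught up */
  {
    /* ... process u->feed or u->update according to u->kind ... */
    rt_export_release(u);                    /* (2b) mark it processed */
  }

  rt_export_unsubscribe(all, req);           /* (4) detach and unlock the table */
}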
+
+struct rt_export_request {
+ /* Formal name */
+ char *name;
+
+ /* Memory */
+ pool *pool;
+
+ /* State information */
+ enum rt_export_state {
+#define RT_EXPORT_STATES \
+ DOWN, \
+ FEEDING, \
+ PARTIAL, \
+ READY, \
+ STOP, \
+
+#define RT_EXPORT_STATES_ENUM_HELPER(p) TES_##p,
+ MACRO_FOREACH(RT_EXPORT_STATES_ENUM_HELPER, RT_EXPORT_STATES)
+ TES_MAX
+#undef RT_EXPORT_STATES_ENUM_HELPER
+ } _Atomic export_state;
+ btime last_state_change;
+
+ /* Table feeding contraption */
+ struct rt_export_feeder {
+ /* Formal name */
+ char *name;
+
+ /* Enlisting */
+ struct rt_exporter * _Atomic exporter;
+ DOMAIN(rtable) domain; /* Lock this instead of RCU */
+
+ /* Prefiltering, useful for more scenarios */
+ struct rt_prefilter {
+ /* Prefiltering, useful in several scenarios */
+ enum {
+ TE_ADDR_NONE = 0, /* No address matching */
+ TE_ADDR_EQUAL, /* Exact query - show route <addr> */
+ TE_ADDR_FOR, /* Longest prefix match - show route for <addr> */
+ TE_ADDR_IN, /* Interval query - show route in <addr> */
+ TE_ADDR_TRIE, /* Query defined by trie */
+ TE_ADDR_HOOK, /* Query processed by supplied custom hook */
+ } mode;
+
+ union {
+ const struct f_trie *trie;
+ const net_addr *addr;
+ int (*hook)(const struct rt_prefilter *, const net_addr *);
+ };
+ } prefilter;
+
+#define TLIST_PREFIX rt_export_feeder
+#define TLIST_TYPE struct rt_export_feeder
+#define TLIST_ITEM n
+#define TLIST_WANT_WALK
+#define TLIST_WANT_ADD_TAIL
+
+ /* Feeding itself */
+ u32 feed_index; /* Index of the feed in progress */
+ u32 (*next_feed_index)(struct rt_export_feeder *, u32 try_this);
+ struct rt_feeding_request {
+ struct rt_feeding_request *next; /* Next in request chain */
+ void (*done)(struct rt_feeding_request *);/* Called when this refeed finishes */
+ struct rt_prefilter prefilter; /* Reload only matching nets */
+ PACKED enum {
+ RFRS_INACTIVE = 0, /* Inactive request */
+ RFRS_PENDING, /* Request enqueued, do not touch */
+ RFRS_RUNNING, /* Request active, do not touch */
+ } state;
+ } *feeding, *feed_pending;
+ TLIST_DEFAULT_NODE;
+ u8 trace_routes;
+ } feeder;
+
+ /* Regular updates */
+ struct bmap seq_map; /* Which lfjour items are already processed */
+ struct bmap feed_map; /* Which nets were already fed (for initial feeding) */
+ struct lfjour_recipient r;
+ struct rt_export_union *cur;
+
+ /* Statistics */
+ struct rt_export_stats {
+ u32 updates_received; /* Number of route updates received */
+ u32 withdraws_received; /* Number of route withdraws received */
+ } stats;
+
+ /* Tracing */
+ u8 trace_routes;
+ void (*dump)(struct rt_export_request *req);
+ void (*fed)(struct rt_export_request *req);
};
-typedef void (*fib_init_fn)(struct fib *, void *);
-
-struct fib {
- pool *fib_pool; /* Pool holding all our data */
- slab *fib_slab; /* Slab holding all fib nodes */
- struct fib_node **hash_table; /* Node hash table */
- uint hash_size; /* Number of hash table entries (a power of two) */
- uint hash_order; /* Binary logarithm of hash_size */
- uint hash_shift; /* 32 - hash_order */
- uint addr_type; /* Type of address data stored in fib (NET_*) */
- uint node_size; /* FIB node size, 0 for nonuniform */
- uint node_offset; /* Offset of fib_node struct inside of user data */
- uint entries; /* Number of entries */
- uint entries_min, entries_max; /* Entry count limits (else start rehashing) */
- fib_init_fn init; /* Constructor */
+#include "lib/tlists.h"
+
+struct rt_export_union {
+ enum rt_export_kind {
+ RT_EXPORT_STOP = 1,
+ RT_EXPORT_FEED,
+ RT_EXPORT_UPDATE,
+ } kind;
+ const struct rt_export_item {
+ LFJOUR_ITEM_INHERIT(li); /* Member of lockfree journal */
+ char data[0]; /* Memcpy helper */
+ const rte *new, *old; /* Route update */
+ } *update;
+ const struct rt_export_feed {
+ uint count_routes, count_exports;
+ struct netindex *ni;
+ rte *block;
+ u64 *exports;
+ char data[0];
+ } *feed;
+ struct rt_export_request *req;
};
-static inline void * fib_node_to_user(struct fib *f, struct fib_node *e)
-{ return e ? (void *) ((char *) e - f->node_offset) : NULL; }
+struct rt_exporter {
+ struct lfjour journal; /* Journal for update keeping */
+ TLIST_LIST(rt_export_feeder) feeders; /* List of active feeder structures */
- _Bool _Atomic feeders_lock; /* Spinlock for the above list */
++ bool _Atomic feeders_lock; /* Spinlock for the above list */
+ u8 trace_routes; /* Debugging flags (D_*) */
+ u8 net_type; /* Which net this exporter provides */
+ DOMAIN(rtable) domain; /* Lock this instead of RCU */
+ u32 _Atomic max_feed_index; /* Stop feeding at this index */
+ const char *name; /* Name for logging */
+ netindex_hash *netindex; /* Table for net <-> id conversion */
+ void (*stopped)(struct rt_exporter *); /* Callback when exporter can stop */
+ void (*cleanup_done)(struct rt_exporter *, u64 end); /* Callback when cleanup has been done */
- struct rt_export_feed *(*feed_net)(struct rt_exporter *, struct rcu_unwinder *, u32, _Bool (*)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *, const struct rt_export_item *first);
++ struct rt_export_feed *(*feed_net)(struct rt_exporter *, struct rcu_unwinder *, u32, bool (*)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *, const struct rt_export_item *first);
+ void (*feed_cleanup)(struct rt_exporter *, struct rt_export_feeder *);
+};
-static inline struct fib_node * fib_user_to_node(struct fib *f, void *e)
-{ return e ? (void *) ((char *) e + f->node_offset) : NULL; }
+extern struct rt_export_feed rt_feed_index_out_of_range;
-void fib_init(struct fib *f, pool *p, uint addr_type, uint node_size, uint node_offset, uint hash_order, fib_init_fn init);
-void *fib_find(struct fib *, const net_addr *); /* Find or return NULL if doesn't exist */
-void *fib_get_chain(struct fib *f, const net_addr *a); /* Find first node in linked list from hash table */
-void *fib_get(struct fib *, const net_addr *); /* Find or create new if nonexistent */
-void *fib_route(struct fib *, const net_addr *); /* Longest-match routing lookup */
-void fib_delete(struct fib *, void *); /* Remove fib entry */
-void fib_free(struct fib *); /* Destroy the fib */
-void fib_check(struct fib *); /* Consistency check for debugging */
+/* Exporter API */
+void rt_exporter_init(struct rt_exporter *, struct settle_config *);
+struct rt_export_item *rt_exporter_push(struct rt_exporter *, const struct rt_export_item *);
+struct rt_export_feed *rt_alloc_feed(uint routes, uint exports);
+void rt_exporter_shutdown(struct rt_exporter *, void (*stopped)(struct rt_exporter *));
-void fit_init(struct fib_iterator *, struct fib *); /* Internal functions, don't call */
-struct fib_node *fit_get(struct fib *, struct fib_iterator *);
-void fit_put(struct fib_iterator *, struct fib_node *);
-void fit_put_next(struct fib *f, struct fib_iterator *i, struct fib_node *n, uint hpos);
-void fit_put_end(struct fib_iterator *i);
-void fit_copy(struct fib *f, struct fib_iterator *dst, struct fib_iterator *src);
+/* Standalone feeds */
+void rt_feeder_subscribe(struct rt_exporter *, struct rt_export_feeder *);
+void rt_feeder_unsubscribe(struct rt_export_feeder *);
+void rt_export_refeed_feeder(struct rt_export_feeder *, struct rt_feeding_request *);
+struct rt_export_feed *rt_export_next_feed(struct rt_export_feeder *);
+#define RT_FEED_WALK(_feeder, _f) \
+ for (const struct rt_export_feed *_f; _f = rt_export_next_feed(_feeder); ) \
- static inline _Bool rt_export_feed_active(struct rt_export_feeder *f)
-#define FIB_WALK(fib, type, z) do { \
- struct fib_node *fn_, **ff_ = (fib)->hash_table; \
- uint count_ = (fib)->hash_size; \
- type *z; \
- while (count_--) \
- for (fn_ = *ff_++; z = fib_node_to_user(fib, fn_); fn_=fn_->next)
++static inline bool rt_export_feed_active(struct rt_export_feeder *f)
+{ return !!atomic_load_explicit(&f->exporter, memory_order_acquire); }
-#define FIB_WALK_END } while (0)
+/* Full blown exports */
+void rtex_export_subscribe(struct rt_exporter *, struct rt_export_request *);
+void rtex_export_unsubscribe(struct rt_export_request *);
-#define FIB_ITERATE_INIT(it, fib) fit_init(it, fib)
+const struct rt_export_union * rt_export_get(struct rt_export_request *);
+void rt_export_release(const struct rt_export_union *);
+void rt_export_retry_later(const struct rt_export_union *);
+void rt_export_processed(struct rt_export_request *, u64);
+void rt_export_refeed_request(struct rt_export_request *rer, struct rt_feeding_request *rfr);
-#define FIB_ITERATE_START(fib, it, type, z) do { \
- struct fib_node *fn_ = fit_get(fib, it); \
- uint count_ = (fib)->hash_size; \
- uint hpos_ = (it)->hash; \
- type *z; \
- for(;;) { \
- if (!fn_) \
- { \
- if (++hpos_ >= count_) \
- break; \
- fn_ = (fib)->hash_table[hpos_]; \
- continue; \
- } \
- z = fib_node_to_user(fib, fn_);
+static inline enum rt_export_state rt_export_get_state(struct rt_export_request *r)
+{ return atomic_load_explicit(&r->export_state, memory_order_acquire); }
+const char *rt_export_state_name(enum rt_export_state state);
-#define FIB_ITERATE_END fn_ = fn_->next; } } while(0)
+static inline void rt_export_walk_cleanup(const struct rt_export_union **up)
+{
+ if (*up)
+ rt_export_release(*up);
+}
-#define FIB_ITERATE_PUT(it) fit_put(it, fn_)
+#define RT_EXPORT_WALK(_reader, _u) \
+ for (CLEANUP(rt_export_walk_cleanup) const struct rt_export_union *_u;\
+ _u = rt_export_get(_reader); \
+ rt_export_release(_u)) \
+
+/* Convenience call to request a refeed */
+#define rt_export_refeed(h, r) _Generic((h), \
+ struct rt_export_feeder *: rt_export_refeed_feeder, \
+ struct rt_export_request *: rt_export_refeed_request, \
+ void *: bug)(h, r)
+
+/* Subscription to regular table exports needs locking */
+#define rt_export_subscribe(_t, _kind, f) do { \
+ RT_LOCKED(_t, tp) { \
+ rt_lock_table(tp); \
+ rtex_export_subscribe(&tp->export_##_kind, f); \
+ }} while (0) \
+
+#define rt_export_unsubscribe(_kind, _fx) do { \
+ struct rt_export_request *_f = _fx; \
+ struct rt_exporter *e = atomic_load_explicit(&_f->feeder.exporter, memory_order_acquire); \
+ RT_LOCKED(SKIP_BACK(rtable, export_##_kind, e), _tp) { \
+ rtex_export_unsubscribe(_f); \
+ rt_unlock_table(_tp); \
+ }} while (0) \
+
+static inline int rt_prefilter_net(const struct rt_prefilter *p, const net_addr *n)
+{
+ switch (p->mode)
+ {
+ case TE_ADDR_NONE: return 1;
+ case TE_ADDR_IN: return net_in_netX(n, p->addr);
+ case TE_ADDR_EQUAL: return net_equal(n, p->addr);
+ case TE_ADDR_FOR: return net_in_netX(p->addr, n);
+ case TE_ADDR_TRIE: return trie_match_net(p->trie, n);
+ case TE_ADDR_HOOK: return p->hook(p, n);
+ }
+
+ bug("Crazy prefilter application attempt failed wildly.");
+}
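
As a sketch, the equivalent of the `show route for <addr>` match built from the prefilter above; matches_for_query() is an illustrative name:

static bool
matches_for_query(const net_addr *query, const net_addr *n)
{
  struct rt_prefilter pf = { .mode = TE_ADDR_FOR, .addr = query };
  return rt_prefilter_net(&pf, n);       /* does the network n cover the queried address? */
}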
+
- static inline _Bool
++static inline bool
+rt_net_is_feeding_feeder(struct rt_export_feeder *ref, const net_addr *n)
+{
+ if (!rt_prefilter_net(&ref->prefilter, n))
+ return 0;
+
+ if (!ref->feeding)
+ return 1;
-#define FIB_ITERATE_PUT_NEXT(it, fib) fit_put_next(fib, it, fn_, hpos_)
+ for (struct rt_feeding_request *rfr = ref->feeding; rfr; rfr = rfr->next)
+ if (rt_prefilter_net(&rfr->prefilter, n))
+ return 1;
-#define FIB_ITERATE_PUT_END(it) fit_put_end(it)
+ return 0;
+}
- static inline _Bool
-#define FIB_ITERATE_UNLINK(it, fib) fit_get(fib, it)
++static inline bool
+rt_net_is_feeding_request(struct rt_export_request *req, const net_addr *n)
+{
+ struct netindex *ni = NET_TO_INDEX(n);
+ return
+ !bmap_test(&req->feed_map, ni->index)
+ && rt_net_is_feeding_feeder(&req->feeder, n);
+}
-#define FIB_ITERATE_COPY(dst, src, fib) fit_copy(fib, dst, src)
+#define rt_net_is_feeding(h, n) _Generic((h), \
+ struct rt_export_feeder *: rt_net_is_feeding_feeder, \
+ struct rt_export_request *: rt_net_is_feeding_request, \
+ void *: bug)(h, n)
/*
* delete as soon as use_count becomes 0 and remove
* obstacle from this routing table.
*/
- struct event *rt_event; /* Routing table event */
+ struct rt_export_request best_req; /* Internal request from best route announcement cleanup */
+ struct rt_uncork_callback nhu_uncork; /* Helper event to schedule NHU on uncork */
+ struct rt_uncork_callback hcu_uncork; /* Helper event to schedule HCU on uncork */
struct timer *prune_timer; /* Timer for periodic pruning / GC */
+ struct event *prune_event; /* Event for prune execution */
btime last_rt_change; /* Last time when route changed */
- btime base_settle_time; /* Start time of rtable settling interval */
btime gc_time; /* Time of last GC */
uint gc_counter; /* Number of operations since last GC */
+ uint rr_counter; /* Number of currently running route refreshes,
+ in fact sum of (stale_set - stale_pruned) over all importers
+ + one for each TIS_FLUSHING importer */
+ uint wait_counter; /* Number of imports in TIS_WAITING state */
byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */
byte prune_trie; /* Prune prefix trie during next table prune */
- byte hcu_scheduled; /* Hostcache update is scheduled */
+ byte imports_flushing; /* Some imports are being flushed right now */
byte nhu_state; /* Next Hop Update state */
- struct fib_iterator prune_fit; /* Rtable prune FIB iterator */
- struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */
+ byte nhu_corked; /* Next Hop Update is corked with this state */
+ byte export_used; /* Pending Export pruning is scheduled */
+ byte cork_active; /* Cork has been activated */
+ struct rt_cork_threshold cork_threshold; /* Threshold for table cork */
+ u32 prune_index; /* Rtable prune FIB iterator */
+ u32 nhu_index; /* Next Hop Update FIB iterator */
+ event *nhu_event; /* Nexthop updater */
struct f_trie *trie_new; /* New prefix trie defined during pruning */
- struct f_trie *trie_old; /* Old prefix trie waiting to be freed */
+ const struct f_trie *trie_old; /* Old prefix trie waiting to be freed */
u32 trie_lock_count; /* Prefix trie locked by walks */
u32 trie_old_lock_count; /* Old prefix trie locked by walks */
+ struct tbf rl_pipe; /* Rate limiting token buffer for pipe collisions */
- list subscribers; /* Subscribers for notifications */
- struct timer *settle_timer; /* Settle time for notifications */
- list flowspec_links; /* List of flowspec links, src for NET_IPx and dst for NET_FLOWx */
struct f_trie *flowspec_trie; /* Trie for evaluation of flowspec notifications */
// struct mpls_domain *mpls_domain; /* Label allocator for MPLS */
-} rtable;
+ u32 rte_free_deferred; /* Counter of deferred rte_free calls */
-struct rt_subscription {
- node n;
- rtable *tab;
- void (*hook)(struct rt_subscription *b);
- void *data;
+ struct rt_digestor *export_digest; /* Route export journal for digest tries */
+ struct rt_stream *master; /* Data source (this table is aux) */
};
-struct rt_flowspec_link {
- node n;
- rtable *src;
- rtable *dst;
- u32 uc;
+/* The final union private-public rtable structure */
+typedef union rtable {
+ struct {
+ RTABLE_PUBLIC;
+ };
+ struct rtable_private priv;
+} rtable;
+
+/* Define the lock cleanup function */
+LOBJ_UNLOCK_CLEANUP(rtable, rtable);
+
+#define RT_IS_LOCKED(tab) LOBJ_IS_LOCKED((tab), rtable)
+#define RT_LOCKED(tab, tp) LOBJ_LOCKED((tab), tp, rtable, rtable)
+#define RT_LOCK(tab, tp) LOBJ_LOCK((tab), tp, rtable, rtable)
+
+#define RT_LOCK_SIMPLE(tab) LOBJ_LOCK_SIMPLE((tab), rtable)
+#define RT_UNLOCK_SIMPLE(tab) LOBJ_UNLOCK_SIMPLE((tab), rtable)
+
+#define RT_UNLOCKED_TEMPORARILY(tab, tp) LOBJ_UNLOCKED_TEMPORARILY((tab), tp, rtable, rtable)
+
+#define RT_PUB(tab) SKIP_BACK(rtable, priv, tab)
+
+#define RT_UNCORKING (1ULL << 44)
+
+extern struct rt_cork {
+ _Atomic u64 active;
+ DOMAIN(resource) dom;
+ event_list queue;
+} rt_cork;
+
+static inline void rt_cork_acquire(void)
+{
+ atomic_fetch_add_explicit(&rt_cork.active, 1, memory_order_acq_rel);
+}
+
+static inline void rt_cork_release(void)
+{
+ u64 upd = atomic_fetch_add_explicit(&rt_cork.active, RT_UNCORKING, memory_order_acq_rel) + RT_UNCORKING;
+
+ /* Actually released? */
+ if ((upd >> 44) == (upd & (RT_UNCORKING - 1)))
+ {
+ LOCK_DOMAIN(resource, rt_cork.dom);
+ synchronize_rcu();
+ ev_run_list(&rt_cork.queue);
+ UNLOCK_DOMAIN(resource, rt_cork.dom);
+ }
+
+ atomic_fetch_sub_explicit(&rt_cork.active, RT_UNCORKING + 1, memory_order_acq_rel);
+}
+
+void rt_cork_send_callback(void *_data);
+
- static inline _Bool rt_cork_check(struct rt_uncork_callback *rcc)
++static inline bool rt_cork_check(struct rt_uncork_callback *rcc)
+{
+ /* Wait until all uncorks have finished */
+ while (1)
+ {
+ rcu_read_lock();
+
+ /* Not corked */
+ u64 corked = atomic_load_explicit(&rt_cork.active, memory_order_acquire);
+ if (!corked)
+ {
+ rcu_read_unlock();
+ return 0;
+ }
+
+ /* Yes, corked */
+ if (corked < RT_UNCORKING)
+ {
+ if (!rcc->ev.hook)
+ {
+ rcc->ev.hook = rt_cork_send_callback;
+ rcc->ev.data = rcc;
+ }
+
+ ev_send(&rt_cork.queue, &rcc->ev);
+ rcu_read_unlock();
+ return 1;
+ }
+
+ /* In progress, retry */
+ rcu_read_unlock();
+ birdloop_yield();
+ }
+}
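
A sketch of how a producer might use rt_cork_check(); struct my_producer and its embedded rt_uncork_callback are made-up names for illustration:

struct my_producer {
  struct rt_uncork_callback uncork;
  /* ... whatever state the producer keeps ... */
};

static void
my_producer_hook(void *data)
{
  struct my_producer *p = data;

  if (rt_cork_check(&p->uncork))
    return;          /* corked: p->uncork.ev has been queued and runs once the cork is released */

  /* ... safe to send a burst of updates now ... */
}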
+
+struct rt_pending_export {
+ struct rt_export_item it;
+ struct rt_pending_export *_Atomic next; /* Next export for the same net */
+ u64 seq_all; /* Interlink from BEST to ALL */
};
-#define NHU_CLEAN 0
-#define NHU_SCHEDULED 1
-#define NHU_RUNNING 2
-#define NHU_DIRTY 3
+struct rt_net_pending_export {
+ struct rt_pending_export * _Atomic first, * _Atomic last;
+};
typedef struct network {
- struct rte *routes; /* Available routes for this network */
- struct fib_node n; /* FIB flags reserved for kernel syncer */
+ struct rte_storage * _Atomic routes; /* Available routes for this network */
+
+ /* Uncleaned pending exports */
+ struct rt_net_pending_export all;
+ struct rt_net_pending_export best;
} net;
-struct hostcache {
- slab *slab; /* Slab holding all hostentries */
- struct hostentry **hash_table; /* Hash table for hostentries */
- unsigned hash_order, hash_shift;
- unsigned hash_max, hash_min;
- unsigned hash_items;
- linpool *lp; /* Linpool for trie */
- struct f_trie *trie; /* Trie of prefixes that might affect hostentries */
- list hostentries; /* List of all hostentries */
- byte update_hostcache;
+struct rte_storage {
+ struct rte_storage * _Atomic next; /* Next in chain */
+ union {
+ struct {
+ RTE_IN_TABLE_WRITABLE;
+ };
+ const struct rte rte; /* Route data */
+ };
};
-struct hostentry {
- node ln;
- ip_addr addr; /* IP address of host, part of key */
- ip_addr link; /* (link-local) IP address of host, used as gw
- if host is directly attached */
- rtable *tab; /* Dependent table, part of key */
- rtable *owner; /* Nexthop owner table */
- struct hostentry *next; /* Next in hash chain */
- unsigned hash_key; /* Hash key */
- unsigned uc; /* Use count */
- struct rta *src; /* Source rta entry */
- byte dest; /* Chosen route destination type (RTD_...) */
- byte nexthop_linkable; /* Nexthop list is completely non-device */
- u32 igp_metric; /* Chosen route IGP metric */
+#define RTE_COPY(r) ((r) ? (r)->rte : (rte) {})
+#define RTE_COPY_VALID(r) (((r) && (rte_is_valid((r)))) ? *(r) : (rte) {})
+#define RTE_OR_NULL(r) ((r) ? &((r)->rte) : NULL)
+#define RTE_VALID_OR_NULL(r) (((r) && (rte_is_valid((r)))) ? (r) : NULL)
+
+#define RTES_WRITE(r) (((r) != ((struct rte_storage *) 0)) ? ((struct rte *) &(r)->rte) : NULL)
+
+#define RTE_GET_NETINDEX(e) NET_TO_INDEX((e)->net)
+
+/* Table import */
+
+struct rt_import_request {
+ struct rt_import_hook *hook; /* The table part of importer */
+ char *name;
+ u8 trace_routes;
+
+ struct birdloop *loop; /* Where to schedule cleanup event */
+
+ void (*dump_req)(struct rt_import_request *req);
+ void (*log_state_change)(struct rt_import_request *req, u8 state);
+ /* Preimport is called when the @new route is about to be inserted, replacing @old.
+ * Return a route (may be different or modified in-place) to continue or NULL to withdraw. */
+ int (*preimport)(struct rt_import_request *req, struct rte *new, const struct rte *old);
+};
+
+struct rt_import_hook {
+ node n;
+ rtable *table; /* The connected table */
+ struct rt_import_request *req; /* The requestor */
+
+ struct rt_import_stats {
+ /* Import - from protocol to core */
+ u32 pref; /* Number of routes selected as best in the (adjacent) routing table */
+ u32 updates_ignored; /* Number of route updates rejected as already in route table */
+ u32 updates_accepted; /* Number of route updates accepted and imported */
+ u32 withdraws_ignored; /* Number of route withdraws rejected as already not in route table */
+ u32 withdraws_accepted; /* Number of route withdraws accepted and processed */
+ } stats;
+
+ u64 flush_seq; /* Table export seq when the channel announced flushing */
+ btime last_state_change; /* Time of last state transition */
+
+ u8 import_state; /* IS_* */
+ u8 stale_set; /* Set this stale_cycle to imported routes */
+ u8 stale_valid; /* Routes with this stale_cycle and bigger are considered valid */
+ u8 stale_pruned; /* Last prune finished when this value was set at stale_valid */
+ u8 stale_pruning; /* Last prune started when this value was set at stale_valid */
+
+ void (*stopped)(struct rt_import_request *); /* Stored callback when import is stopped */
+ event cleanup_event; /* Used to finally unhook the import from the table */
};
-typedef struct rte {
- struct rte *next;
- net *net; /* Network this RTE belongs to */
- struct rte_src *src; /* Route source that created the route */
- struct channel *sender; /* Channel used to send the route to the routing table */
- struct rta *attrs; /* Attributes of this route */
- u32 id; /* Table specific route id */
- byte flags; /* Flags (REF_...) */
- byte pflags; /* Protocol-specific flags */
- btime lastmod; /* Last modified */
-} rte;
-#define REF_COW 1 /* Copy this rte on write */
-#define REF_FILTERED 2 /* Route is rejected by import filter */
-#define REF_STALE 4 /* Route is stale in a refresh cycle */
-#define REF_DISCARD 8 /* Route is scheduled for discard */
-#define REF_MODIFY 16 /* Route is scheduled for modify */
+#define TIS_DOWN 0
+#define TIS_UP 1
+#define TIS_STOP 2
+#define TIS_FLUSHING 3
+#define TIS_WAITING 4
+#define TIS_CLEARED 5
+#define TIS_MAX 6
-/* Route is valid for propagation (may depend on other flags in the future), accepts NULL */
-static inline int rte_is_valid(rte *r) { return r && !(r->flags & REF_FILTERED); }
-/* Route just has REF_FILTERED flag */
-static inline int rte_is_filtered(rte *r) { return !!(r->flags & REF_FILTERED); }
+void rt_request_import(rtable *tab, struct rt_import_request *req);
+void rt_stop_import(struct rt_import_request *, void (*stopped)(struct rt_import_request *));
+const char *rt_import_state_name(u8 state);
+static inline u8 rt_import_get_state(struct rt_import_hook *ih) { return ih ? ih->import_state : TIS_DOWN; }
+
+void rte_import(struct rt_import_request *req, const net_addr *net, rte *new, struct rte_src *src);
+
+/* When rtable is just a view / aggregate, this is the basis for its source */
+struct rt_stream {
+ struct rt_import_request dst;
+ rtable *dst_tab;
+};
+
+
+#if 0
+/*
+ * For table export processing
+ */
+
+/* Get next rpe. If src is given, it must match. */
+struct rt_pending_export *rpe_next(struct rt_pending_export *rpe, struct rte_src *src);
+
+/* Walk all rpe's */
+#define RPE_WALK(first, it, src) \
+ for (struct rt_pending_export *it = (first); it; it = rpe_next(it, (src)))
+
+/* Mark the pending export processed */
+void rpe_mark_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe);
+
+#define rpe_mark_seen_all(hook, first, last, src) do { \
+ RPE_WALK((first), _rpe, (src)) { \
+ rpe_mark_seen((hook), _rpe); \
+ if (_rpe == last) break; \
+ }} while (0)
+
+/* Get pending export seen status */
+int rpe_get_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe);
+
+#endif
+
+/*
+ * Channel export hooks. To be refactored out.
+ */
+
+int channel_preimport(struct rt_import_request *req, rte *new, const rte *old);
/* Types of route announcement, also used as flags */
while (ss);
}
- static _Bool eattr_same_value(const eattr *a, const eattr *b);
++static bool eattr_same_value(const eattr *a, const eattr *b);
+
/**
* In place discard duplicates and undefs in sorted ea_list. We use stable sort
* for this reason.
t->count += e->count;
d += e->count;
e = e->next;
+
+ if (e && BIT32_TEST(&upto, e->stored))
+ break;
}
- static _Bool
+
+ t->next = e;
+}
+
+ea_list *
+ea_normalize(ea_list *e, u32 upto)
+{
+#if 0
+ debug("(normalize)");
+ ea_dump(e);
+ debug(" ----> ");
+#endif
+ ea_list *t = tmp_allocz(ea_scan(e, upto));
+ ea_merge(e, t, upto);
+ ea_sort(t);
+#if 0
+ ea_dump(t);
+ debug("\n");
+#endif
+
+ return t;
+}
+
++static bool
+eattr_same_value(const eattr *a, const eattr *b)
+{
+ if (
+ a->id != b->id ||
+ a->flags != b->flags ||
+ a->type != b->type ||
+ a->undef != b->undef
+ )
+ return 0;
+
+ if (a->undef)
+ return 1;
+
+ if (a->type & EAF_EMBEDDED)
+ return a->u.data == b->u.data;
+ else
+ return adata_same(a->u.ptr, b->u.ptr);
}
- static _Bool
++static bool
+eattr_same(const eattr *a, const eattr *b)
+{
+ return
+ eattr_same_value(a, b) &&
+ a->originated == b->originated &&
+ a->fresh == b->fresh;
+}
+
+
/**
* ea_same - compare two &ea_list's
* @x: attribute list
--- /dev/null
- _Bool done = 1;
+/*
+ * BIRD -- Route Export Mechanisms
+ *
+ * (c) 2024 Maria Matejka <mq@jmq.cz>
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#include "nest/bird.h"
+#include "nest/route.h"
+#include "nest/protocol.h"
+
+struct rt_export_feed rt_feed_index_out_of_range;
+
+#define rtex_trace(_req, _cat, msg, args...) do { \
+ if ((_req)->trace_routes & _cat) \
+ log(L_TRACE "%s: " msg, (_req)->name, ##args); \
+} while (0)
+
+static inline enum rt_export_state
+rt_export_change_state(struct rt_export_request *r, u32 expected_mask, enum rt_export_state state)
+{
+ r->last_state_change = current_time();
+ enum rt_export_state old = atomic_exchange_explicit(&r->export_state, state, memory_order_acq_rel);
+ if (!((1 << old) & expected_mask))
+ bug("Unexpected export state change from %s to %s, expected mask %02x",
+ rt_export_state_name(old),
+ rt_export_state_name(state),
+ expected_mask
+ );
+
+ rtex_trace(r, D_STATES, "Export state changed from %s to %s",
+ rt_export_state_name(old), rt_export_state_name(state));
+
+ return old;
+}
+
+const struct rt_export_union *
+rt_export_get(struct rt_export_request *r)
+{
+ ASSERT_DIE(!r->cur);
+
+#define EXPORT_FOUND(_kind) do { \
+ struct rt_export_union *reu = tmp_alloc(sizeof *reu); \
+ *reu = (struct rt_export_union) { \
+ .kind = _kind, \
+ .req = r, \
+ .update = update, \
+ .feed = feed, \
+ }; \
+ return (r->cur = reu); \
+} while (0)
+
+#define NOT_THIS_UPDATE \
+ lfjour_release(&r->r, &update->li); \
+ continue;
+
+ while (1)
+ {
+ enum rt_export_state es = rt_export_get_state(r);
+ switch (es)
+ {
+ case TES_DOWN:
+ rtex_trace(r, (D_ROUTES|D_STATES), "Export is down");
+ return NULL;
+
+ case TES_STOP:
+ rtex_trace(r, (D_ROUTES|D_STATES), "Received stop event");
+ struct rt_export_union *reu = tmp_alloc(sizeof *reu);
+ *reu = (struct rt_export_union) {
+ .kind = RT_EXPORT_STOP,
+ .req = r,
+ };
+ return (r->cur = reu);
+
+ case TES_PARTIAL:
+ case TES_FEEDING:
+ case TES_READY:
+ break;
+
+ case TES_MAX:
+ bug("invalid export state");
+ }
+
+ /* Process sequence number reset event */
+ if (lfjour_reset_seqno(&r->r))
+ bmap_reset(&r->seq_map, 4);
+
+ /* Get a new update */
+ SKIP_BACK_DECLARE(struct rt_export_item, update, li, lfjour_get(&r->r));
+ SKIP_BACK_DECLARE(struct rt_exporter, e, journal, lfjour_of_recipient(&r->r));
+ struct rt_export_feed *feed = NULL;
+
+ /* No update, try feed */
+ if (!update)
+ {
+ if (es == TES_READY)
+ {
+ /* Fed up of feeding */
+ rtex_trace(r, D_ROUTES, "Export drained");
+ return NULL;
+ }
+ else if (feed = rt_export_next_feed(&r->feeder))
+ {
+ /* Feeding more */
+ bmap_set(&r->feed_map, feed->ni->index);
+ rtex_trace(r, D_ROUTES, "Feeding %N", feed->ni->addr);
+
+ EXPORT_FOUND(RT_EXPORT_FEED);
+ }
+ else if (rt_export_get_state(r) == TES_DOWN)
+ {
+ /* Torn down in between */
+ rtex_trace(r, D_STATES, "Export ended itself");
+ return NULL;
+ }
+ else
+ {
+ /* No more food */
+ rt_export_change_state(r, BIT32_ALL(TES_FEEDING, TES_PARTIAL), TES_READY);
+ rtex_trace(r, D_STATES, "Fed up");
+ CALL(r->fed, r);
+ return NULL;
+ }
+ }
+
+ /* There actually is an update */
+ if (bmap_test(&r->seq_map, update->seq))
+ {
+ /* But this update has been already processed, let's try another one */
+ rtex_trace(r, D_ROUTES, "Skipping an already processed update %lu", update->seq);
+ NOT_THIS_UPDATE;
+ }
+
+ /* Is this update allowed by prefilter? */
+ const net_addr *n = (update->new ?: update->old)->net;
+ struct netindex *ni = NET_TO_INDEX(n);
+
+ if (!rt_prefilter_net(&r->feeder.prefilter, n))
+ {
+ rtex_trace(r, D_ROUTES, "Not exporting %N due to prefilter", n);
+ NOT_THIS_UPDATE;
+ }
+
+ if ((es != TES_READY) && rt_net_is_feeding(r, n))
+ {
+ /* But this net shall get a feed first! */
+ rtex_trace(r, D_ROUTES, "Expediting %N feed due to pending update %lu", n, update->seq);
+ if (r->feeder.domain.rtable)
+ {
+ LOCK_DOMAIN(rtable, r->feeder.domain);
+ feed = e->feed_net(e, NULL, ni->index, NULL, NULL, update);
+ UNLOCK_DOMAIN(rtable, r->feeder.domain);
+ }
+ else
+ {
+ RCU_ANCHOR(u);
+ feed = e->feed_net(e, u, ni->index, NULL, NULL, update);
+ }
+
+ bmap_set(&r->feed_map, ni->index);
+ ASSERT_DIE(feed && (feed != &rt_feed_index_out_of_range));
+
+ EXPORT_FOUND(RT_EXPORT_FEED);
+ }
+
+ /* OK, now this actually is an update, thank you for your patience */
+ rtex_trace(r, D_ROUTES, "Updating %N, seq %lu", n, update->seq);
+
+ EXPORT_FOUND(RT_EXPORT_UPDATE);
+ }
+
+#undef NOT_THIS_UPDATE
+#undef EXPORT_FOUND
+}
+
+void
+rt_export_release(const struct rt_export_union *u)
+{
+ /* May be already released */
+ if (!u->req)
+ return;
+
+ struct rt_export_request *r = u->req;
+
+ /* Must be crosslinked */
+ ASSERT_DIE(r->cur == u);
+ r->cur = NULL;
+
+ switch (u->kind)
+ {
+ case RT_EXPORT_FEED:
+ for (uint i = 0; i < u->feed->count_exports; i++)
+ bmap_set(&r->seq_map, u->feed->exports[i]);
+
+ if (!u->update)
+ break;
+
+ /* fall through */
+
+ case RT_EXPORT_UPDATE:
+ rtex_trace(r, D_ROUTES, "Export %lu released", u->update->seq);
+ lfjour_release(&r->r, &u->update->li);
+
+ break;
+
+ case RT_EXPORT_STOP:
+ /* Checking that we have indeed stopped the exporter */
+ ASSERT_DIE(rt_export_get_state(r) == TES_DOWN);
+ rtex_trace(r, D_ROUTES, "Export stopped");
+ break;
+
+ default:
+ bug("strange export kind");
+ }
+}
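
rt_export_get() and rt_export_release() bracket each processed item; the RT_EXPORT_WALK() calls later in this patch appear to wrap the same pattern. A hedged sketch of a reader loop follows; the my_handle_*() helpers are hypothetical placeholders for protocol-specific work and are only declared here.

/* Editorial sketch, not part of the patch. */
static void my_handle_feed(struct rt_export_request *req, const struct rt_export_feed *feed);
static void my_handle_update(struct rt_export_request *req, const struct rt_export_item *update);
static void my_handle_stop(struct rt_export_request *req);

static void
my_export_reader(struct rt_export_request *req)
{
  const struct rt_export_union *u;

  while ((u = rt_export_get(req)))
  {
    switch (u->kind)
    {
    case RT_EXPORT_STOP:
      /* Stop requested; a real reader would unsubscribe here,
       * which also releases the held item */
      my_handle_stop(req);
      return;

    case RT_EXPORT_FEED:
      my_handle_feed(req, u->feed);
      break;

    case RT_EXPORT_UPDATE:
      my_handle_update(req, u->update);
      break;
    }

    /* Mark the covered sequence numbers as seen and unlink u from req */
    rt_export_release(u);
  }
}
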
+
+void
+rt_export_processed(struct rt_export_request *r, u64 seq)
+{
+ rtex_trace(r, D_ROUTES, "Marking export %lu as processed", seq);
+
+ /* Check sequence number reset event */
+ if (lfjour_reset_seqno(&r->r))
+ bmap_reset(&r->seq_map, 4);
+
+ ASSERT_DIE(!bmap_test(&r->seq_map, seq));
+ bmap_set(&r->seq_map, seq);
+}
+
+struct rt_export_feed *
+rt_alloc_feed(uint routes, uint exports)
+{
+ struct rt_export_feed *feed;
+ uint size = sizeof *feed
+ + routes * sizeof *feed->block + _Alignof(typeof(*feed->block))
+ + exports * sizeof *feed->exports + _Alignof(typeof(*feed->exports));
+
+ feed = tmp_alloc(size);
+
+ feed->count_routes = routes;
+ feed->count_exports = exports;
+ BIRD_SET_ALIGNED_POINTER(feed->block, feed->data);
+ BIRD_SET_ALIGNED_POINTER(feed->exports, &feed->block[routes]);
+
+ /* Consistency check */
+ ASSERT_DIE(((void *) &feed->exports[exports]) <= ((void *) feed) + size);
+
+ return feed;
+}
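
rt_alloc_feed() packs the feed header, the route block and the export-seq array into one tmp allocation, padding the size by one alignment per trailing array and then pointing each array at an aligned offset inside the data area (the consistency check documents the invariant, assuming BIRD_SET_ALIGNED_POINTER aligns into the trailing storage). A standalone sketch of the same layout technique in plain C11; the toy_* names are hypothetical.

/* Editorial sketch, not part of the patch. */
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>

struct toy_feed {
  unsigned count_routes, count_exports;
  int *routes;			/* points into data[] */
  uint64_t *exports;		/* points into data[], right after routes */
  char data[];
};

static void *
toy_align_up(void *p, size_t align)
{
  return (void *) (((uintptr_t) p + align - 1) & ~(uintptr_t) (align - 1));
}

static struct toy_feed *
toy_alloc_feed(unsigned routes, unsigned exports)
{
  /* One alignment's worth of slack per trailing array, as in rt_alloc_feed() */
  size_t size = sizeof(struct toy_feed)
    + routes * sizeof(int) + _Alignof(int)
    + exports * sizeof(uint64_t) + _Alignof(uint64_t);

  struct toy_feed *f = malloc(size);
  if (!f)
    return NULL;

  f->count_routes = routes;
  f->count_exports = exports;
  f->routes = toy_align_up(f->data, _Alignof(int));
  f->exports = toy_align_up(f->routes + routes, _Alignof(uint64_t));

  /* Consistency check, mirroring the ASSERT_DIE above */
  assert((char *) &f->exports[exports] <= (char *) f + size);
  return f;
}
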
+
+static struct rt_export_feed *
+rt_export_get_next_feed(struct rt_export_feeder *f, struct rcu_unwinder *u)
+{
+ for (uint retry = 0; retry < (u ? 1024 : ~0U); retry++)
+ {
+ ASSERT_DIE(u || DOMAIN_IS_LOCKED(rtable, f->domain));
+
+ struct rt_exporter *e = atomic_load_explicit(&f->exporter, memory_order_acquire);
+ if (!e)
+ {
+ rtex_trace(f, (D_ROUTES|D_STATES), "Exporter kicked us away");
+ return NULL;
+ }
+
+ struct rt_export_feed *feed = e->feed_net(e, u, f->feed_index,
+ rt_net_is_feeding_feeder, f, NULL);
+ if (feed == &rt_feed_index_out_of_range)
+ {
+ rtex_trace(f, D_ROUTES, "Nothing more to feed", f->feed_index);
+ f->feed_index = ~0;
+ return NULL;
+ }
+
+#define NEXT_INDEX(f) f->feed_index = f->next_feed_index ? f->next_feed_index(f, f->feed_index + 1) : f->feed_index + 1
+
+#define NOT_THIS_FEED(...) { \
+ rtex_trace(f, D_ROUTES, __VA_ARGS__); \
+ NEXT_INDEX(f); \
+ continue; \
+}
+
+ if (!feed)
+ NOT_THIS_FEED("Nothing found for index %u", f->feed_index);
+
+ NEXT_INDEX(f);
+ return feed;
+ }
+
+ RCU_RETRY_FAST(u);
+}
+
+struct rt_export_feed *
+rt_export_next_feed(struct rt_export_feeder *f)
+{
+ ASSERT_DIE(f);
+
+ struct rt_export_feed *feed = NULL;
+ if (f->domain.rtable)
+ {
+ LOCK_DOMAIN(rtable, f->domain);
+ feed = rt_export_get_next_feed(f, NULL);
+ UNLOCK_DOMAIN(rtable, f->domain);
+ }
+ else
+ {
+ RCU_ANCHOR(u);
+ feed = rt_export_get_next_feed(f, u);
+ }
+
+ if (feed)
+ return feed;
+
+ /* Feeding done */
+ struct rt_feeding_request *reverse = NULL;
+ while (f->feeding)
+ {
+ struct rt_feeding_request *rfr = f->feeding;
+ f->feeding = rfr->next;
+ rfr->next = reverse;
+ reverse = rfr;
+ }
+
+ /* Call the done hook in the same order as requests came in */
+ while (reverse)
+ {
+ struct rt_feeding_request *rfr = reverse;
+ reverse = rfr->next;
+ CALL(rfr->done, rfr);
+ }
+
+ f->feed_index = 0;
+
+ uint count = 0;
+ for (struct rt_feeding_request *rfr = f->feed_pending; rfr; rfr = rfr->next)
+ count++;
+
+ rtex_trace(f, D_STATES, "Feeding done, %u refeed request%s pending",
+ count, (count == 1) ? "" : "s");
+
+ if (!f->feed_pending)
+ return NULL;
+
+ f->feeding = f->feed_pending;
+ f->feed_pending = NULL;
+ return rt_export_next_feed(f);
+}
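
Refeed requests are pushed onto f->feed_pending as a LIFO stack, so rt_export_next_feed() above reverses the finished list in place before running the done() hooks, keeping them in arrival order. The same reversal, isolated on a hypothetical toy node type:

/* Editorial sketch, not part of the patch. */
struct toy_req {
  struct toy_req *next;
  void (*done)(struct toy_req *);
};

static void
toy_finish_in_arrival_order(struct toy_req **stack)
{
  struct toy_req *reverse = NULL;

  /* Pop from the LIFO stack and push onto 'reverse': the oldest ends up first */
  while (*stack)
  {
    struct toy_req *r = *stack;
    *stack = r->next;
    r->next = reverse;
    reverse = r;
  }

  /* Run the callbacks oldest-first */
  while (reverse)
  {
    struct toy_req *r = reverse;
    reverse = r->next;
    if (r->done)
      r->done(r);
  }
}
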
+
+static void
+rt_feeding_request_default_done(struct rt_feeding_request *rfr)
+{
+ mb_free(rfr);
+}
+
+void
+rt_export_refeed_feeder(struct rt_export_feeder *f, struct rt_feeding_request *rfr)
+{
+ if (!rfr)
+ return;
+
+ rfr->next = f->feed_pending;
+ f->feed_pending = rfr;
+}
+
+void rt_export_refeed_request(struct rt_export_request *rer, struct rt_feeding_request *rfr)
+{
+ if (!rfr)
+ {
+ rfr = mb_allocz(rer->pool, sizeof *rfr);
+ rfr->done = rt_feeding_request_default_done;
+ }
+
+ bmap_reset(&rer->feed_map, 4);
+ rt_export_refeed_feeder(&rer->feeder, rfr);
+ rt_export_change_state(rer, BIT32_ALL(TES_FEEDING, TES_PARTIAL, TES_READY), TES_PARTIAL);
+ if (rer->r.event)
+ ev_send(rer->r.target, rer->r.event);
+}
+
+void
+rtex_export_subscribe(struct rt_exporter *e, struct rt_export_request *r)
+{
+ rt_export_change_state(r, BIT32_ALL(TES_DOWN), TES_FEEDING);
+
+ ASSERT_DIE(r->pool);
+
+ rt_feeder_subscribe(e, &r->feeder);
+
+ lfjour_register(&e->journal, &r->r);
+
+ r->stats = (struct rt_export_stats) {};
+ r->last_state_change = current_time();
+ bmap_init(&r->seq_map, r->pool, 4);
+ bmap_init(&r->feed_map, r->pool, 4);
+
+ rt_export_refeed_request(r, NULL);
+}
+
+void
+rtex_export_unsubscribe(struct rt_export_request *r)
+{
+ rt_feeder_unsubscribe(&r->feeder);
+
+ if (r->cur)
+ rt_export_release(r->cur);
+
+ switch (rt_export_change_state(r, BIT32_ALL(TES_FEEDING, TES_PARTIAL, TES_READY, TES_STOP), TES_DOWN))
+ {
+ case TES_FEEDING:
+ case TES_PARTIAL:
+ case TES_READY:
+ case TES_STOP:
+ lfjour_unregister(&r->r);
+ break;
+ default:
+ bug("not implemented");
+ }
+
+ bmap_free(&r->feed_map);
+ bmap_free(&r->seq_map);
+}
+
+static void
+rt_exporter_cleanup_done(struct lfjour *j, u64 begin_seq UNUSED, u64 end_seq)
+{
+ SKIP_BACK_DECLARE(struct rt_exporter, e, journal, j);
+
+ /* TODO: log the begin_seq / end_seq values */
+
+ CALL(e->cleanup_done, e, end_seq);
+ if (e->stopped && (lfjour_count_recipients(j) == 0))
+ {
+ settle_cancel(&j->announce_timer);
+ ev_postpone(&j->cleanup_event);
+ e->stopped(e);
+ }
+}
+
+void
+rt_exporter_init(struct rt_exporter *e, struct settle_config *scf)
+{
+ rtex_trace(e, D_STATES, "Exporter init");
+ e->journal.cleanup_done = rt_exporter_cleanup_done;
+ lfjour_init(&e->journal, scf);
+ ASSERT_DIE(e->feed_net);
+ ASSERT_DIE(e->netindex);
+}
+
+struct rt_export_item *
+rt_exporter_push(struct rt_exporter *e, const struct rt_export_item *uit)
+{
+ /* Get the object */
+ struct lfjour_item *lit = lfjour_push_prepare(&e->journal);
+ if (!lit)
+ return NULL;
+
+ SKIP_BACK_DECLARE(struct rt_export_item, it, li, lit);
+
+ /* Copy the data, keeping the header */
+ memcpy(&it->data, &uit->data, e->journal.item_size - OFFSETOF(struct rt_export_item, data));
+
+ /* Commit the update */
+ rtex_trace(e, D_ROUTES, "Announcing change %lu at %N: %p (%u) -> %p (%u)",
+ lit->seq, (uit->new ?: uit->old)->net,
+ uit->old, uit->old ? uit->old->id : 0,
+ uit->new, uit->new ? uit->new->id : 0);
+
+ lfjour_push_commit(&e->journal);
+
+ /* Return the update pointer */
+ return it;
+}
+
+#define RTEX_FEEDERS_LOCK(e) \
+ while (atomic_exchange_explicit(&e->feeders_lock, 1, memory_order_acq_rel)) \
+ birdloop_yield(); \
+ CLEANUP(_rtex_feeders_unlock_) UNUSED struct rt_exporter *_rtex_feeders_locked_ = e;
+
+static inline void _rtex_feeders_unlock_(struct rt_exporter **e)
+{
+ ASSERT_DIE(atomic_exchange_explicit(&(*e)->feeders_lock, 0, memory_order_acq_rel));
+}
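
RTEX_FEEDERS_LOCK pairs a tiny spinlock (atomic_exchange in a loop, yielding between attempts) with a scope-bound unlock, presumably via the compiler cleanup attribute wrapped by CLEANUP(). A self-contained sketch of the same idiom with a toy lock, using the GCC/Clang attribute directly; all toy_* names are hypothetical and sched_yield() stands in for birdloop_yield().

/* Editorial sketch, not part of the patch. */
#include <stdatomic.h>
#include <sched.h>

static _Atomic int toy_lock;
static int toy_counter;

static void
toy_unlock(int *locked)
{
  if (*locked)
    atomic_exchange_explicit(&toy_lock, 0, memory_order_acq_rel);
}

#define TOY_LOCK() \
  while (atomic_exchange_explicit(&toy_lock, 1, memory_order_acq_rel)) \
    sched_yield(); /* stand-in for birdloop_yield() */ \
  __attribute__((cleanup(toy_unlock), unused)) int _toy_locked_ = 1;

static void
toy_increment(void)
{
  TOY_LOCK();			/* unlocked automatically when the scope ends */
  toy_counter++;
}
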
+
+void
+rt_feeder_subscribe(struct rt_exporter *e, struct rt_export_feeder *f)
+{
+ f->feed_index = 0;
+
+ atomic_store_explicit(&f->exporter, e, memory_order_relaxed);
+ f->domain = e->domain;
+
+ RTEX_FEEDERS_LOCK(e);
+ rt_export_feeder_add_tail(&e->feeders, f);
+
+ rtex_trace(f, D_STATES, "Subscribed to exporter %s", e->name);
+}
+
+static void
+rt_feeder_do_unsubscribe(struct rt_export_feeder *f)
+{
+ struct rt_exporter *e = atomic_exchange_explicit(&f->exporter, NULL, memory_order_acquire);
+ if (e)
+ {
+ RTEX_FEEDERS_LOCK(e);
+ rt_export_feeder_rem_node(&e->feeders, f);
+
+ rtex_trace(f, D_STATES, "Unsubscribed from exporter %s", e->name);
+ }
+ else
+ rtex_trace(f, D_STATES, "Already unsubscribed");
+}
+
+void
+rt_feeder_unsubscribe(struct rt_export_feeder *f)
+{
+ if (f->domain.rtable)
+ {
+ LOCK_DOMAIN(rtable, f->domain);
+ rt_feeder_do_unsubscribe(f);
+ UNLOCK_DOMAIN(rtable, f->domain);
+ }
+ else
+ {
+ RCU_ANCHOR(u);
+ rt_feeder_do_unsubscribe(f);
+ }
+}
+
+void
+rt_exporter_shutdown(struct rt_exporter *e, void (*stopped)(struct rt_exporter *))
+{
+ rtex_trace(e, D_STATES, "Exporter shutdown");
+
+ /* Last lock check before dropping the domain reference */
+ if (e->journal.domain)
+ ASSERT_DIE(DG_IS_LOCKED(e->journal.domain));
+
+ e->journal.domain = NULL;
+
+ /* We have to tell every receiver to stop */
++ bool done = 1;
+ WALK_TLIST(lfjour_recipient, r, &e->journal.recipients)
+ {
+ done = 0;
+ rt_export_change_state(
+ SKIP_BACK(struct rt_export_request, r, r),
+ BIT32_ALL(TES_FEEDING, TES_PARTIAL, TES_READY, TES_STOP),
+ TES_STOP);
+ }
+
+ /* We can drop feeders synchronously */
+ {
+ RTEX_FEEDERS_LOCK(e);
+ WALK_TLIST_DELSAFE(rt_export_feeder, f, &e->feeders)
+ {
+ ASSERT_DIE(atomic_exchange_explicit(&f->exporter, NULL, memory_order_acq_rel) == e);
+ rt_export_feeder_rem_node(&e->feeders, f);
+ }
+ }
+
+ /* Wait for feeders to finish */
+ synchronize_rcu();
+
+ /* The rest is done via the cleanup routine */
+ lfjour_do_cleanup_now(&e->journal);
+
+ if (done)
+ {
+ ev_postpone(&e->journal.cleanup_event);
+ settle_cancel(&e->journal.announce_timer);
+ CALL(stopped, e);
+ }
+ else
+// e->stopped = stopped;
+ bug("not implemented yet");
+}
#undef FVR_VPN
}
- _Bool withdraw = 0;
+/*
+ * ROA aggregation subsystem
+ */
+
+struct rt_roa_aggregator {
+ struct rt_stream stream;
+ struct rte_owner sources;
+ struct rte_src *main_source;
+ struct rt_export_request src;
+ event event;
+};
+
+static void
+rt_dump_roa_aggregator_dst_req(struct rt_import_request *req)
+{
+ debug(" ROA aggregator import request req=%p", req);
+}
+
+static void
+rt_dump_roa_aggregator_src_req(struct rt_export_request *req)
+{
+ debug(" ROA aggregator export request req=%p", req);
+}
+
+static void
+rt_roa_aggregator_state_change(struct rt_import_request *req, u8 state)
+{
+ if (req->trace_routes & D_STATES)
+ log("%s: import state changed to %s",
+ req->name, rt_import_state_name(state));
+}
+
+struct rt_roa_aggregated_adata {
+ adata ad;
+ u32 padding;
+ struct { u32 asn, max_pxlen; } u[0];
+};
+
+#define ROA_AGGR_COUNT(rad) (((typeof (&(rad)->u[0])) (rad->ad.data + rad->ad.length)) - &(rad)->u[0])
+
+static void
+ea_roa_aggregate_format(const eattr *a, byte *buf, uint size)
+{
+ SKIP_BACK_DECLARE(struct rt_roa_aggregated_adata, rad, ad, a->u.ptr);
+ uint cnt = ROA_AGGR_COUNT(rad);
+ for (uint upos = 0; upos < cnt; upos++)
+ {
+ int x = bsnprintf(buf, size, "as %u max %u, ", rad->u[upos].asn, rad->u[upos].max_pxlen);
+ size -= x;
+ buf += x;
+ if (size < 30)
+ {
+ bsnprintf(buf, size, " ... ");
+ return;
+ }
+ }
+
+ buf[-2] = 0;
+}
+
+static struct ea_class ea_roa_aggregated = {
+ .name = "roa_aggregated",
+ .type = T_ROA_AGGREGATED,
+ .format = ea_roa_aggregate_format,
+};
+
+
+static void
+rt_aggregate_roa(void *_rag)
+{
+ struct rt_roa_aggregator *rag = _rag;
+
+ RT_EXPORT_WALK(&rag->src, u) TMP_SAVED
+ {
++ bool withdraw = 0;
+ const net_addr *nroa = NULL;
+ switch (u->kind)
+ {
+ case RT_EXPORT_STOP:
+ bug("Main table export stopped");
+ break;
+
+ case RT_EXPORT_FEED:
+ nroa = u->feed->ni->addr;
+ withdraw = (u->feed->count_routes == 0);
+ break;
+
+ case RT_EXPORT_UPDATE:
+ nroa = u->update->new ? u->update->new->net : u->update->old->net;
+ withdraw = !u->update->new;
+ break;
+ }
+
+ net_addr_union nip;
+ net_copy(&nip.n, nroa);
+
+ uint asn, max_pxlen;
+
+ switch (nip.n.type)
+ {
+ case NET_ROA6: nip.n.type = NET_IP6;
+ nip.n.length = net_addr_length[NET_IP6];
+ asn = nip.roa6.asn;
+ max_pxlen = nip.roa6.max_pxlen;
+ break;
+ case NET_ROA4: nip.n.type = NET_IP4;
+ nip.n.length = net_addr_length[NET_IP4];
+ asn = nip.roa4.asn;
+ max_pxlen = nip.roa4.max_pxlen;
+ break;
+ default: bug("exported garbage from ROA table");
+ }
+
+ rte prev = rt_net_best(rag->stream.dst_tab, &nip.n);
+
+ struct rt_roa_aggregated_adata *rad_new;
+ uint count;
+
+ if (prev.attrs)
+ {
+ eattr *ea = ea_find(prev.attrs, &ea_roa_aggregated);
+ SKIP_BACK_DECLARE(struct rt_roa_aggregated_adata, rad, ad, ea->u.ptr);
+
+ count = ROA_AGGR_COUNT(rad);
+ rad_new = tmp_alloc(sizeof *rad_new + (count + 1) * sizeof rad_new->u[0]);
+
+ /* Insertion into a sorted list */
+ uint p = 0;
+ for (p = 0; p < count; p++)
+ if ((rad->u[p].asn < asn) || ((rad->u[p].asn == asn) && (rad->u[p].max_pxlen < max_pxlen)))
+ rad_new->u[p] = rad->u[p];
+ else
+ break;
+
+ if ((p < count) && (rad->u[p].asn == asn) && (rad->u[p].max_pxlen == max_pxlen))
+ /* Found */
+ if (withdraw)
+ memcpy(&rad_new->u[p], &rad->u[p+1], (--count - p) * sizeof rad->u[p]);
+ else
+ continue;
+ else
+ /* Not found */
+ if (withdraw)
+ continue;
+ else
+ {
+ rad_new->u[p].asn = asn;
+ rad_new->u[p].max_pxlen = max_pxlen;
+ memcpy(&rad_new->u[p+1], &rad->u[p], (count++ - p) * sizeof rad->u[p]);
+ }
+ }
+ else if (!withdraw)
+ {
+ count = 1;
+ rad_new = tmp_alloc(sizeof *rad_new + sizeof rad_new->u[0]);
+ rad_new->u[0].asn = asn;
+ rad_new->u[0].max_pxlen = max_pxlen;
+ }
+ else
+ continue;
+
+ rad_new->ad.length = (byte *) &rad_new->u[count] - rad_new->ad.data;
+
+ rte r = {
+ .src = rag->main_source,
+ };
+
+ ea_set_attr(&r.attrs, EA_LITERAL_DIRECT_ADATA(&ea_roa_aggregated, 0, &rad_new->ad));
+
+ rte_import(&rag->stream.dst, &nip.n, &r, rag->main_source);
+
+#if 0
+ /* Do not split ROA aggregator, we want this to be finished asap */
+ MAYBE_DEFER_TASK(rag->src.r.target, rag->src.r.event,
+ "export to %s", rag->src.name);
+#endif
+ }
+}
+
+static void
+rt_setup_roa_aggregator(rtable *t)
+{
+ rtable *src = t->config->master.src->table;
+ struct rt_roa_aggregator *rag;
+ {
+ RT_LOCK(t, tab);
+ char *ragname = mb_sprintf(tab->rp, "%s.roa-aggregator", src->name);
+ rag = mb_alloc(tab->rp, sizeof *rag);
+ *rag = (struct rt_roa_aggregator) {
+ .stream = {
+ .dst = {
+ .name = ragname,
+ .trace_routes = tab->debug,
+ .loop = t->loop,
+ .dump_req = rt_dump_roa_aggregator_dst_req,
+ .log_state_change = rt_roa_aggregator_state_change,
+ },
+ .dst_tab = t,
+ },
+ .src = {
+ .name = ragname,
+ .r = {
+ .target = birdloop_event_list(t->loop),
+ .event = &rag->event,
+ },
+ .pool = birdloop_pool(t->loop),
+ .dump = rt_dump_roa_aggregator_src_req,
+ .trace_routes = tab->debug,
+ },
+ .event = {
+ .hook = rt_aggregate_roa,
+ .data = rag,
+ },
+ };
+
+ rt_init_sources(&rag->sources, ragname, birdloop_event_list(t->loop));
+ rag->main_source = rt_get_source_o(&rag->sources, 0);
+
+ tab->master = &rag->stream;
+ }
+
+ rt_request_import(t, &rag->stream.dst);
+ rt_export_subscribe(src, best, &rag->src);
+}
+
+static void
+rt_roa_aggregator_sources_gone(void *t)
+{
+ rt_unlock_table((rtable *) t);
+}
+
+static void
+rt_stop_roa_aggregator(rtable *t)
+{
+ struct rt_roa_aggregator *rag;
+ RT_LOCKED(t, tab)
+ {
+ rag = SKIP_BACK(struct rt_roa_aggregator, stream, tab->master);
+
+ rt_lock_table(tab);
+ rt_destroy_sources(&rag->sources, ev_new_init(tab->rp,
+ rt_roa_aggregator_sources_gone, tab));
+ rt_unlock_source(rag->main_source);
+ }
+
+ /* Stopping both import and export.
+ * All memory will be freed with table shutdown,
+ * no need to do anything from import done callback */
+ rt_stop_import(&rag->stream.dst, NULL);
+ rt_export_unsubscribe(best, &rag->src);
+}
/**
* roa_check - check validity of route origination in a ROA table
}
static void
-rt_notify_accepted(struct channel *c, net *net, rte *new_changed, rte *old_changed, int refeed)
+rt_notify_accepted(struct channel *c, const struct rt_export_feed *feed)
{
- // struct proto *p = c->proto;
- rte *new_best = NULL;
- rte *old_best = NULL;
- rte *new_free = NULL;
- int new_first = 0;
+ rte *old_best = NULL, *new_best = NULL;
- _Bool feeding = rt_net_is_feeding(&c->out_req, feed->ni->addr);
- _Bool idempotent = 0;
++ bool feeding = rt_net_is_feeding(&c->out_req, feed->ni->addr);
++ bool idempotent = 0;
- /*
- * We assume that there are no changes in net route order except (added)
- * new_changed and (removed) old_changed. Therefore, the function is not
- * compatible with deterministic_med (where nontrivial reordering can happen
- * as a result of a route change) and with recomputation of recursive routes
- * due to next hop update (where many routes can be changed in one step).
- *
- * Note that we need this assumption just for optimizations, we could just
- * run full new_best recomputation otherwise.
- *
- * There are three cases:
- * feed or old_best is old_changed -> we need to recompute new_best
- * old_best is before new_changed -> new_best is old_best, ignore
- * old_best is after new_changed -> try new_changed, otherwise old_best
- */
-
- if (net->routes)
- c->stats.exp_updates_received++;
- else
- c->stats.exp_withdraws_received++;
-
- /* Find old_best - either old_changed, or route for net->routes */
- if (old_changed && bmap_test(&c->export_map, old_changed->id))
- old_best = old_changed;
- else
+ for (uint i = 0; i < feed->count_routes; i++)
{
- for (rte *r = net->routes; rte_is_valid(r); r = r->next)
+ rte *r = &feed->block[i];
+
+ /* Previously exported */
+ if (!old_best && bmap_test(&c->export_accepted_map, r->id))
{
- if (bmap_test(&c->export_map, r->id))
+ old_best = r;
+
+ /* Still the best, and need not be re-fed anyway */
+ if (!new_best && !feeding)
{
- old_best = r;
- break;
+ idempotent = 1;
+ new_best = r;
}
+ }
+
+ /* Unflag obsolete routes */
+ if (r->flags & REF_OBSOLETE)
+ bmap_clear(&c->export_rejected_map, r->id);
+
+ /* Mark invalid as rejected */
+ else if (!rte_is_valid(r))
+ bmap_set(&c->export_rejected_map, r->id);
- /* Note if new_changed found before old_best */
- if (r == new_changed)
- new_first = 1;
+ /* Already rejected */
+ else if (!feeding && bmap_test(&c->export_rejected_map, r->id))
+ ;
+
+ /* No new best route yet and this is a valid candidate */
+ else if (!new_best)
+ {
+ /* This branch should not be executed if this route is old best */
+ ASSERT_DIE(r != old_best);
+
+ /* Have no new best route yet, try this route not seen before */
+ new_best = export_filter(c, r, 0);
+ DBG("rt_notify_accepted: checking route id %u: %s\n", r->id, new_best ? "ok" : "no");
}
}
}
rte *
-rt_export_merged(struct channel *c, net *net, rte **rt_free, linpool *pool, int silent)
+rt_export_merged(struct channel *c, const struct rt_export_feed *feed, linpool *pool, int silent)
{
- _Bool feeding = !silent && rt_net_is_feeding(&c->out_req, feed->ni->addr);
++ bool feeding = !silent && rt_net_is_feeding(&c->out_req, feed->ni->addr);
+
// struct proto *p = c->proto;
- struct nexthop *nhs = NULL;
- rte *best0, *best, *rt0, *rt, *tmp;
+ struct nexthop_adata *nhs = NULL;
+ rte *best0 = &feed->block[0];
+ rte *best = NULL;
- best0 = net->routes;
- *rt_free = NULL;
+ /* First route is obsolete */
+ if (best0->flags & REF_OBSOLETE)
+ return NULL;
+ /* First route is invalid */
if (!rte_is_valid(best0))
return NULL;
}
static void
-rt_notify_merged(struct channel *c, net *net, rte *new_changed, rte *old_changed,
- rte *new_best, rte *old_best, int refeed)
+rt_notify_merged(struct channel *c, const struct rt_export_feed *f)
{
- // struct proto *p = c->proto;
- rte *new_free = NULL;
-
- /* We assume that all rte arguments are either NULL or rte_is_valid() */
+ const rte *old_best = NULL;
+ /* Find old best route */
+ for (uint i = 0; i < f->count_routes; i++)
+ if (bmap_test(&c->export_accepted_map, f->block[i].id))
+ {
+ old_best = &f->block[i];
+ break;
+ }
- /* This check should be done by the caller */
- if (!new_best && !old_best)
- return;
+ /* Prepare new merged route */
+ rte *new_merged = f->count_routes ? rt_export_merged(c, f, tmp_linpool, 0) : NULL;
- /* Check whether the change is relevant to the merged route */
- if ((new_best == old_best) &&
- (new_changed != old_changed) &&
- !rte_mergable(new_best, new_changed) &&
- !rte_mergable(old_best, old_changed))
- return;
+ /* And notify the protocol */
+ if (new_merged || old_best)
+ do_rt_notify(c, f->ni->addr, new_merged, old_best);
+}
- if (new_best)
- c->stats.exp_updates_received++;
- else
- c->stats.exp_withdraws_received++;
- /* Prepare new merged route */
- if (new_best)
- new_best = rt_export_merged(c, net, &new_free, rte_update_pool, 0);
+void
+channel_notify_merged(void *_channel)
+{
+ struct channel *c = _channel;
- /* Check old merged route */
- if (old_best && !bmap_test(&c->export_map, old_best->id))
- old_best = NULL;
+ RT_EXPORT_WALK(&c->out_req, u)
+ {
+ switch (u->kind)
+ {
+ case RT_EXPORT_STOP:
+ bug("Main table export stopped");
- if (!new_best && !old_best)
- return;
+ case RT_EXPORT_FEED:
+ if (u->feed->count_routes)
+ rt_notify_merged(c, u->feed);
+ break;
- do_rt_notify(c, net, new_best, old_best, refeed);
+ case RT_EXPORT_UPDATE:
+ {
+ struct rt_export_feed *f = rt_net_feed(c->table, u->update->new ? u->update->new->net : u->update->old->net, SKIP_BACK(struct rt_pending_export, it, u->update));
+ rt_notify_merged(c, f);
+ for (uint i=0; i<f->count_exports; i++)
+ rt_export_processed(&c->out_req, f->exports[i]);
+ break;
+ }
+ }
- /* Discard temporary rte */
- if (new_free)
- rte_free(new_free);
+ MAYBE_DEFER_TASK(c->out_req.r.target, c->out_req.r.event,
+ "export to %s.%s (merged)", c->proto->name, c->name);
+ }
}
-
-/**
- * rte_announce - announce a routing table change
- * @tab: table the route has been added to
- * @type: type of route announcement (RA_UNDEF or RA_ANY)
- * @net: network in question
- * @new: the new or changed route
- * @old: the previous route replaced by the new one
- * @new_best: the new best route for the same network
- * @old_best: the previous best route for the same network
- *
- * This function gets a routing table update and announces it to all protocols
- * that are connected to the same table by their channels.
- *
- * There are two ways of how routing table changes are announced. First, there
- * is a change of just one route in @net (which may caused a change of the best
- * route of the network). In this case @new and @old describes the changed route
- * and @new_best and @old_best describes best routes. Other routes are not
- * affected, but in sorted table the order of other routes might change.
- *
- * Second, There is a bulk change of multiple routes in @net, with shared best
- * route selection. In such case separate route changes are described using
- * @type of %RA_ANY, with @new and @old specifying the changed route, while
- * @new_best and @old_best are NULL. After that, another notification is done
- * where @new_best and @old_best are filled (may be the same), but @new and @old
- * are NULL.
- *
- * The function announces the change to all associated channels. For each
- * channel, an appropriate preprocessing is done according to channel &ra_mode.
- * For example, %RA_OPTIMAL channels receive just changes of best routes.
- *
- * In general, we first call preexport() hook of a protocol, which performs
- * basic checks on the route (each protocol has a right to veto or force accept
- * of the route before any filter is asked). Then we consult an export filter
- * of the channel and verify the old route in an export map of the channel.
- * Finally, the rt_notify() hook of the protocol gets called.
- *
- * Note that there are also calls of rt_notify() hooks due to feed, but that is
- * done outside of scope of rte_announce().
- */
-static void
-rte_announce(rtable *tab, uint type, net *net, rte *new, rte *old,
- rte *new_best, rte *old_best)
+void
+channel_notify_basic(void *_channel)
{
- if (!rte_is_valid(new))
- new = NULL;
+ struct channel *c = _channel;
- if (!rte_is_valid(old))
- old = NULL;
+ RT_EXPORT_WALK(&c->out_req, u)
+ {
+ switch (u->kind)
+ {
+ case RT_EXPORT_STOP:
+ bug("Main table export stopped");
+
+ case RT_EXPORT_FEED:
+ {
+ /* Find where the old route block begins */
+ uint oldpos = 0;
+ while ((oldpos < u->feed->count_routes) && !(u->feed->block[oldpos].flags & REF_OBSOLETE))
+ oldpos++;
+
+ /* Send updates one after another */
+ for (uint i = 0; i < oldpos; i++)
+ {
+ rte *new = &u->feed->block[i];
+ rte *old = NULL;
+ for (uint o = oldpos; o < u->feed->count_routes; o++)
+ if (new->src == u->feed->block[o].src)
+ {
+ old = &u->feed->block[o];
+ break;
+ }
- if (!rte_is_valid(new_best))
- new_best = NULL;
+ rt_notify_basic(c, new, old);
- if (!rte_is_valid(old_best))
- old_best = NULL;
+ /* Mark old processed */
+ if (old)
+ old->src = NULL;
+ }
- if (!new && !old && !new_best && !old_best)
- return;
+ /* Send withdraws */
+ for (uint o = oldpos; o < u->feed->count_routes; o++)
+ if (u->feed->block[o].src)
+ rt_notify_basic(c, NULL, &u->feed->block[o]);
+ }
+ break;
- if (new_best != old_best)
- {
- if (new_best)
- new_best->sender->stats.pref_routes++;
- if (old_best)
- old_best->sender->stats.pref_routes--;
+ case RT_EXPORT_UPDATE:
+ {
+ const rte *new = u->update->new;
+ const rte *old = u->update->old;
+ struct rte_src *src = (c->ra_mode == RA_ANY) ? (new ? new->src : old->src) : NULL;
+
+ /* Squashing subsequent updates */
+ for (SKIP_BACK_DECLARE(const struct rt_pending_export, rpe, it, u->update);
+ rpe = atomic_load_explicit(&rpe->next, memory_order_acquire) ;)
+ /* Either new is the same as this update's "old". Then the squash
+ * is obvious.
+ *
+ * Or we're squashing an update-from-nothing with a withdrawal,
+ * and then either src is set because it must match (RA_ANY)
+ * or it doesn't matter at all (RA_OPTIMAL).
+ */
+ if ((rpe->it.old == new) && (new || (src && (src == rpe->it.new->src))))
+ {
+ new = rpe->it.new;
+ rt_export_processed(&c->out_req, rpe->it.seq);
+ }
- if (tab->hostcache)
- rt_notify_hostcache(tab, net);
+ if (new && old && rte_same(new, old))
+ {
+ channel_rte_trace_out(D_ROUTES, c, new, "already exported");
- if (!EMPTY_LIST(tab->flowspec_links))
- rt_flowspec_notify(tab, net);
- }
+ if ((new->id != old->id) && bmap_test(&c->export_accepted_map, old->id))
+ {
+ bmap_set(&c->export_accepted_map, new->id);
+ bmap_clear(&c->export_accepted_map, old->id);
+ }
+ }
+ else if (!new && !old)
+ channel_rte_trace_out(D_ROUTES, c, u->update->new, "idempotent withdraw (squash)");
+ else
+ rt_notify_basic(c, new, old);
+
+ break;
+ }
+ }
- rt_schedule_notify(tab);
+ MAYBE_DEFER_TASK(c->out_req.r.target, c->out_req.r.event,
+ "export to %s.%s (regular)", c->proto->name, c->name);
+ }
+}
- struct channel *c; node *n;
- WALK_LIST2(c, n, tab->channels, table_node)
+static void
+rt_flush_best(struct rtable_private *tab, u64 upto)
+{
+ u64 last_seq = 0;
+ RT_EXPORT_WALK(&tab->best_req, u)
{
- if (c->export_state == ES_DOWN)
- continue;
+ ASSERT_DIE(u->kind == RT_EXPORT_UPDATE);
+ ASSERT_DIE(u->update->seq <= upto);
+ last_seq = u->update->seq;
+ if (last_seq == upto)
+ return;
+ }
- if (type && (type != c->ra_mode))
- continue;
+ rt_trace(tab, D_STATES, "Export best full flushed regular up to %lu", last_seq);
+}
- switch (c->ra_mode)
- {
- case RA_OPTIMAL:
- if (new_best != old_best)
- rt_notify_basic(c, net, new_best, old_best, 0);
- break;
+static struct rt_pending_export *
+rte_announce_to(struct rt_exporter *e, struct rt_net_pending_export *npe, const rte *new, const rte *old)
+{
+ if (new == old)
+ return NULL;
- case RA_ANY:
- if (new != old)
- rt_notify_basic(c, net, new, old, 0);
- break;
+ struct rt_pending_export rpe = {
+ .it = {
+ .new = new,
+ .old = old,
+ },
+ };
- case RA_ACCEPTED:
- /*
- * The (new != old) condition is problematic here, as it would break
- * the second usage pattern (announcement after bulk change, used in
- * rt_next_hop_update_net(), which sends both new and old as NULL).
- *
- * But recursive next hops do not work with sorted tables anyways,
- * such configuration is forbidden in BGP and not supported in
- * rt_notify_accepted().
- *
- * The condition is needed to eliminate spurious announcements where
- * both old and new routes are not valid (so they are NULL).
- */
- if (new != old)
- rt_notify_accepted(c, net, new, old, 0);
- break;
+ struct rt_export_item *rei = rt_exporter_push(e, &rpe.it);
+ if (!rei)
+ return NULL;
- case RA_MERGED:
- rt_notify_merged(c, net, new, old, new_best, old_best, 0);
- break;
- }
- }
+ SKIP_BACK_DECLARE(struct rt_pending_export, pushed, it, rei);
+
+ struct rt_pending_export *last = atomic_load_explicit(&npe->last, memory_order_relaxed);
+ if (last)
+ ASSERT_DIE(atomic_exchange_explicit(&last->next, pushed, memory_order_acq_rel) == NULL);
+
+ atomic_store_explicit(&npe->last, pushed, memory_order_release);
+ if (!atomic_load_explicit(&npe->first, memory_order_relaxed))
+ atomic_store_explicit(&npe->first, pushed, memory_order_release);
+
+ return pushed;
}
-static inline int
-rte_validate(rte *e)
+static void
+rte_announce(struct rtable_private *tab, const struct netindex *i UNUSED, net *net, const rte *new, const rte *old,
+ const rte *new_best, const rte *old_best)
{
- int c;
- net *n = e->net;
+ /* Update network count */
+ tab->net_count += (!!new_best - !!old_best);
+
+ int new_best_valid = rte_is_valid(new_best);
+ int old_best_valid = rte_is_valid(old_best);
+
+ if ((new == old) && (new_best == old_best))
+ return;
- if (!net_validate(n->n.addr))
+ if (new_best_valid)
+ new_best->sender->stats.pref++;
+ if (old_best_valid)
+ old_best->sender->stats.pref--;
+
+ /* Try to push */
+ struct rt_pending_export *best_rpe = NULL;
+ struct rt_pending_export *all_rpe = rte_announce_to(&tab->export_all, &net->all, new, old);
+ if (all_rpe)
{
- log(L_WARN "Ignoring bogus prefix %N received via %s",
- n->n.addr, e->sender->proto->name);
- return 0;
+ /* Also best may have changed */
+ best_rpe = rte_announce_to(&tab->export_best, &net->best, new_best, old_best);
+ if (best_rpe)
+ /* Announced best, need an anchor to all */
+ best_rpe->seq_all = all_rpe->it.seq;
+ else if (!lfjour_pending_items(&tab->export_best.journal))
+ /* Best is idle, flush its recipient immediately */
+ rt_flush_best(tab, all_rpe->it.seq);
+
+ rt_check_cork_high(tab);
}
-
- /* FIXME: better handling different nettypes */
- c = !net_is_flow(n->n.addr) ?
- net_classify(n->n.addr): (IADDR_HOST | SCOPE_UNIVERSE);
- if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
+ else
{
- log(L_WARN "Ignoring bogus route %N received via %s",
- n->n.addr, e->sender->proto->name);
- return 0;
+ /* Not announced anything, cleanup now */
+ ASSERT_DIE(new_best == old_best);
+ hmap_clear(&tab->id_map, old->id);
+ rte_free(SKIP_BACK(struct rte_storage, rte, old), tab);
}
+}
- if (net_type_match(n->n.addr, NB_DEST) == !e->attrs->dest)
- {
- /* Exception for flowspec that failed validation */
- if (net_is_flow(n->n.addr) && (e->attrs->dest == RTD_UNREACHABLE))
- return 1;
+static net *
+rt_cleanup_find_net(struct rtable_private *tab, struct rt_pending_export *rpe)
+{
+ /* Find the appropriate struct network */
+ ASSERT_DIE(rpe->it.new || rpe->it.old);
+ const net_addr *n = rpe->it.new ?
+ rpe->it.new->net :
+ rpe->it.old->net;
+ struct netindex *ni = NET_TO_INDEX(n);
+ ASSERT_DIE(ni->index < atomic_load_explicit(&tab->routes_block_size, memory_order_relaxed));
+ net *routes = atomic_load_explicit(&tab->routes, memory_order_relaxed);
+ return &routes[ni->index];
+}
- static _Bool
- log(L_WARN "Ignoring route %N with invalid dest %d received via %s",
- n->n.addr, e->attrs->dest, e->sender->proto->name);
- return 0;
- }
++static bool
+rt_cleanup_update_pointers(struct rt_net_pending_export *npe, struct rt_pending_export *rpe)
+{
+ struct rt_pending_export *first = atomic_load_explicit(&npe->first, memory_order_relaxed);
+ struct rt_pending_export *last = atomic_load_explicit(&npe->last, memory_order_relaxed);
+ ASSERT_DIE(rpe == first);
- if ((e->attrs->dest == RTD_UNICAST) && !nexthop_is_sorted(&(e->attrs->nh)))
- {
- log(L_WARN "Ignoring unsorted multipath route %N received via %s",
- n->n.addr, e->sender->proto->name);
+ atomic_store_explicit(
+ &npe->first,
+ atomic_load_explicit(&rpe->next, memory_order_relaxed),
+ memory_order_release
+ );
+
+ if (rpe != last)
return 0;
- }
+ atomic_store_explicit(&npe->last, NULL, memory_order_release);
return 1;
}
-/**
- * rte_free - delete a &rte
- * @e: &rte to be deleted
- *
- * rte_free() deletes the given &rte from the routing table it's linked to.
- */
-void
-rte_free(rte *e)
+static void
+rt_cleanup_export_best(struct lfjour *j, struct lfjour_item *i)
{
- rt_unlock_source(e->src);
- if (rta_is_cached(e->attrs))
- rta_free(e->attrs);
- sl_free(e);
+ SKIP_BACK_DECLARE(struct rt_pending_export, rpe, it.li, i);
+ SKIP_BACK_DECLARE(struct rtable_private, tab, export_best.journal, j);
+ rt_flush_best(tab, rpe->seq_all);
+
+ /* Find the appropriate struct network */
+ net *net = rt_cleanup_find_net(tab, rpe);
+
+ /* Update the first and last pointers */
+ rt_cleanup_update_pointers(&net->best, rpe);
}
-static inline void
-rte_free_quick(rte *e)
+static void
+rt_cleanup_export_all(struct lfjour *j, struct lfjour_item *i)
{
- rt_unlock_source(e->src);
- rta_free(e->attrs);
- sl_free(e);
+ SKIP_BACK_DECLARE(struct rt_pending_export, rpe, it.li, i);
+ SKIP_BACK_DECLARE(struct rtable_private, tab, export_all.journal, j);
+
+ /* Find the appropriate struct network */
+ net *net = rt_cleanup_find_net(tab, rpe);
+
+ /* Update the first and last pointers */
- _Bool is_last = rt_cleanup_update_pointers(&net->all, rpe);
++ bool is_last = rt_cleanup_update_pointers(&net->all, rpe);
+
+ /* Free the old route */
+ if (rpe->it.old)
+ {
+ ASSERT_DIE(rpe->it.old->flags & REF_OBSOLETE);
+ hmap_clear(&tab->id_map, rpe->it.old->id);
+ rte_free(SKIP_BACK(struct rte_storage, rte, rpe->it.old), tab);
+ }
+
+ if (is_last)
+ tab->gc_counter++;
}
-int
-rte_same(rte *x, rte *y)
+static void
+rt_dump_best_req(struct rt_export_request *req)
{
- /* rte.flags / rte.pflags are not checked, as they are internal to rtable */
- return
- x->attrs == y->attrs &&
- x->src == y->src &&
- rte_is_filtered(x) == rte_is_filtered(y);
+ SKIP_BACK_DECLARE(struct rtable_private, tab, best_req, req);
+ debug(" Table %s best cleanup request (%p)\n", tab->name, req);
}
-static inline int rte_is_ok(rte *e) { return e && !rte_is_filtered(e); }
-
static void
-rte_recalculate(struct channel *c, net *net, rte *new, struct rte_src *src)
+rt_import_cleared(void *_ih)
{
- struct proto *p = c->proto;
- struct rtable *table = c->table;
- struct proto_stats *stats = &c->stats;
- static struct tbf rl_pipe = TBF_DEFAULT_LOG_LIMITS;
- rte *before_old = NULL;
- rte *old_best = net->routes;
- rte *old = NULL;
- rte **k;
-
- k = &net->routes; /* Find and remove original route from the same protocol */
- while (old = *k)
- {
- if (old->src == src)
- {
- /* If there is the same route in the routing table but from
- * a different sender, then there are two paths from the
- * source protocol to this routing table through transparent
- * pipes, which is not allowed.
- *
- * We log that and ignore the route. If it is withdraw, we
- * ignore it completely (there might be 'spurious withdraws',
- * see FIXME in do_rte_announce())
- */
- if (old->sender->proto != p)
- {
- if (new)
- {
- log_rl(&rl_pipe, L_ERR "Pipe collision detected when sending %N to table %s",
- net->n.addr, table->name);
- rte_free_quick(new);
- }
- return;
- }
-
- if (new && rte_same(old, new))
- {
- /* No changes, ignore the new route and refresh the old one */
+ struct rt_import_hook *hook = _ih;
- old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
-
- if (!rte_is_filtered(new))
- {
- stats->imp_updates_ignored++;
- rte_trace_in(D_ROUTES, c, new, "ignored");
- }
+ ASSERT_DIE(hook->import_state == TIS_CLEARED);
- rte_free_quick(new);
- return;
- }
- *k = old->next;
- table->rt_count--;
- break;
- }
- k = &old->next;
- before_old = old;
- }
+ /* Local copy of the otherwise freed callback data */
+ void (*stopped)(struct rt_import_request *) = hook->stopped;
+ struct rt_import_request *req = hook->req;
- /* Save the last accessed position */
- rte **pos = k;
+ /* Finally uncouple from the table */
+ RT_LOCKED(hook->table, tab)
+ {
+ req->hook = NULL;
- if (!old)
- before_old = NULL;
+ rt_trace(tab, D_EVENTS, "Hook %s stopped", req->name);
+ rem_node(&hook->n);
+ mb_free(hook);
+ rt_unlock_table(tab);
+ }
- if (!old && !new)
- {
- stats->imp_withdraws_ignored++;
- return;
- }
+ /* And call the callback */
+ CALL(stopped, req);
+}
- int new_ok = rte_is_ok(new);
- int old_ok = rte_is_ok(old);
+static void
+rt_cleanup_done_all(struct rt_exporter *e, u64 end_seq)
+{
+ SKIP_BACK_DECLARE(struct rtable_private, tab, export_all, e);
+ ASSERT_DIE(DG_IS_LOCKED(tab->lock.rtable));
- struct channel_limit *l = &c->rx_limit;
- if (l->action && !old && new && !c->in_table)
- {
- u32 all_routes = stats->imp_routes + stats->filt_routes;
+ if (~end_seq)
+ rt_trace(tab, D_STATES, "Export all cleanup done up to seq %lu", end_seq);
+ else
+ rt_trace(tab, D_STATES, "Export all cleanup complete");
- if (all_routes >= l->limit)
- channel_notify_limit(c, l, PLD_RX, all_routes);
+ rt_check_cork_low(tab);
- if (l->state == PLS_BLOCKED)
+ struct rt_import_hook *ih; node *x, *n;
+ uint cleared_counter = 0;
+ if (tab->wait_counter)
+ WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n)
+ if (ih->import_state == TIS_WAITING)
+ {
+ if (end_seq >= ih->flush_seq)
{
- /* In receive limit the situation is simple, old is NULL so
- we just free new and exit like nothing happened */
-
- stats->imp_updates_ignored++;
- rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
- rte_free_quick(new);
- return;
+ ih->import_state = TIS_CLEARED;
+ tab->wait_counter--;
+ cleared_counter++;
+
+ ih->cleanup_event = (event) {
+ .hook = rt_import_cleared,
+ .data = ih,
+ };
+ ev_send_loop(ih->req->loop, &ih->cleanup_event);
}
- }
-
- l = &c->in_limit;
- if (l->action && !old_ok && new_ok)
- {
- if (stats->imp_routes >= l->limit)
- channel_notify_limit(c, l, PLD_IN, stats->imp_routes);
+ }
- if (l->state == PLS_BLOCKED)
- {
- /* In import limit the situation is more complicated. We
- shouldn't just drop the route, we should handle it like
- it was filtered. We also have to continue the route
- processing if old or new is non-NULL, but we should exit
- if both are NULL as this case is probably assumed to be
- already handled. */
+ if (!EMPTY_LIST(tab->imports) &&
+ (tab->gc_counter >= tab->config->gc_threshold))
+ rt_kick_prune_timer(tab);
+}
- stats->imp_updates_ignored++;
- rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
+static void
+rt_cleanup_done_best(struct rt_exporter *e, u64 end_seq)
+{
+ SKIP_BACK_DECLARE(struct rtable_private, tab, export_best, e);
- if (c->in_keep_filtered)
- new->flags |= REF_FILTERED;
- else
- { rte_free_quick(new); new = NULL; }
+ if (~end_seq)
+ rt_trace(tab, D_STATES, "Export best cleanup done up to seq %lu", end_seq);
+ else
+ {
+ rt_trace(tab, D_STATES, "Export best cleanup complete, flushing regular");
+ rt_flush_best(tab, ~0ULL);
+ }
+}
- /* Note that old && !new could be possible when
- c->in_keep_filtered changed in the recent past. */
+#define RT_EXPORT_BULK 1024
- if (!old && !new)
- return;
+static inline int
+rte_validate(struct channel *ch, rte *e)
+{
+ int c;
+ const net_addr *n = e->net;
+
+#define IGNORING(pre, post) do { \
+ log(L_WARN "%s.%s: Ignoring " pre " %N " post, ch->proto->name, ch->name, n); \
+ return 0; \
+ } while (0)
+
+ if (!net_validate(n))
+ IGNORING("bogus prefix", "");
+
+ /* FIXME: better handling different nettypes */
+ c = !net_is_flow(n) ?
+ net_classify(n): (IADDR_HOST | SCOPE_UNIVERSE);
+ if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
+ IGNORING("bogus route", "");
+
+ if (net_type_match(n, NB_DEST))
+ {
+ eattr *nhea = ea_find(e->attrs, &ea_gen_nexthop);
+ int dest = nhea_dest(nhea);
+
+ if (dest == RTD_NONE)
+ IGNORING("route", "with no destination");
+
+ if ((dest == RTD_UNICAST) &&
+ !nexthop_is_sorted((struct nexthop_adata *) nhea->u.ptr))
+ IGNORING("unsorted multipath route", "");
+ }
+ else if (ea_find(e->attrs, &ea_gen_nexthop))
+ IGNORING("route", "having a superfluous nexthop attribute");
+
+ return 1;
+}
+
+int
+rte_same(const rte *x, const rte *y)
+{
+ /* rte.flags / rte.pflags are not checked, as they are internal to rtable */
+ return
+    (x == y) ||
+    (((x->attrs == y->attrs) ||
+      ((!x->attrs->stored || !y->attrs->stored) && ea_same(x->attrs, y->attrs))) &&
+     (x->src == y->src) &&
+     (rte_is_filtered(x) == rte_is_filtered(y)));
+}
+
+static inline int rte_is_ok(const rte *e) { return e && !rte_is_filtered(e); }
+
+static void
+rte_recalculate(struct rtable_private *table, struct rt_import_hook *c, struct netindex *i, net *net, rte *new, struct rte_src *src)
+{
+ struct rt_import_request *req = c->req;
+ struct rt_import_stats *stats = &c->stats;
+ struct rte_storage *old_best_stored = NET_BEST_ROUTE(table, net);
+ const rte *old_best = old_best_stored ? &old_best_stored->rte : NULL;
+
+ /* Store the new route right away: if it is identical to the old one, the
+ * attribute lookup hits the cache and the copy is practically free. If the
+ * lookup misses, the route has not been stored anywhere yet, so the work
+ * is needed anyway. */
+ struct rte_storage *new_stored = NULL;
+ if (new)
+ {
+ new_stored = rte_store(new, i, table);
+ new = RTES_WRITE(new_stored);
+ }
+
+ struct rte_storage * _Atomic *last_ptr = NULL;
+ struct rte_storage *old_stored = NULL;
+ const rte *old = NULL;
- new_ok = 0;
- goto skip_stats1;
+ /* Find the original route from the same protocol */
+ NET_WALK_ROUTES(table, net, ep, e)
+ {
+ last_ptr = &e->next;
+ if (e->rte.src == src)
+ if (old_stored)
+ bug("multiple routes in table with the same src");
+ else
+ old_stored = e;
+ }
+
+ if (old_stored)
+ {
+ old = &old_stored->rte;
+
+ /* If there is the same route in the routing table but from
+ * a different sender, then there are two paths from the
+ * source protocol to this routing table through transparent
+ * pipes, which is not allowed.
+ * We log that and ignore the route. */
+ if (old->sender != c)
+ {
+ if (!old->generation && !new->generation)
+ bug("Two protocols claim to author a route with the same rte_src in table %s: %N %s/%u:%u",
+ c->table->name, i->addr, old->src->owner->name, old->src->private_id, old->src->global_id);
+
+ log_rl(&table->rl_pipe, L_ERR "Route source collision in table %s: %N %s/%u:%u",
+ c->table->name, i->addr, old->src->owner->name, old->src->private_id, old->src->global_id);
}
+
+ if (new && rte_same(old, &new_stored->rte))
+ {
+ /* No changes, ignore the new route and refresh the old one */
+ old_stored->stale_cycle = new->stale_cycle;
+
+ if (!rte_is_filtered(new))
+ {
+ stats->updates_ignored++;
+ rt_rte_trace_in(D_ROUTES, req, new, "ignored");
+ }
+
+ /* We need to free the already stored route here before returning */
+ rte_free(new_stored, table);
+ return;
+ }
+ }
+
+ if (!old && !new)
+ {
+ stats->withdraws_ignored++;
+ return;
}
+ /* If rejected by import limit, we need to pretend there is no route */
+ if (req->preimport && (req->preimport(req, new, old) == 0))
+ {
+ rte_free(new_stored, table);
+ new_stored = NULL;
+ new = NULL;
+ }
+
+ if (!new && !old)
+ {
+ stats->withdraws_ignored++;
+ return;
+ }
+
+ int new_ok = rte_is_ok(new);
+ int old_ok = rte_is_ok(old);
+
if (new_ok)
- stats->imp_updates_accepted++;
+ stats->updates_accepted++;
else if (old_ok)
- stats->imp_withdraws_accepted++;
+ stats->withdraws_accepted++;
else
- stats->imp_withdraws_ignored++;
+ stats->withdraws_ignored++;
if (old_ok || new_ok)
table->last_rt_change = current_time();
ASSERT(c->channel_state == CS_UP);
- rte_update_lock();
+ /* Storing prefilter routes as an explicit layer */
+ if (new && (c->in_keep & RIK_PREFILTER))
+ new->attrs = ea_lookup_tmp(new->attrs, 0, EALS_PREIMPORT);
+
+#if 0
+ debug("%s.%s -(prefilter)-> %s: %N ", c->proto->name, c->name, c->table->name, n);
+ if (new) ea_dump(new->attrs);
+ else debug("withdraw");
+ debug("\n");
+#endif
+
+ const struct filter *filter = c->in_filter;
+ struct channel_import_stats *stats = &c->import_stats;
+ struct mpls_fec *fec = NULL;
+
if (new)
{
- /* Create a temporary table node */
- nn = alloca(sizeof(net) + n->length);
- memset(nn, 0, sizeof(net) + n->length);
- net_copy(nn->n.addr, n);
+ new->net = n;
+ new->sender = c->in_req.hook;
- new->net = nn;
- new->sender = c;
+ int fr;
- stats->imp_updates_received++;
- if (!rte_validate(new))
+ stats->updates_received++;
+ if ((filter == FILTER_REJECT) ||
+ ((fr = f_run(filter, new, 0)) > F_ACCEPT))
{
- rte_trace_in(D_FILTERS, c, new, "invalid");
- stats->imp_updates_invalid++;
- goto drop;
+ stats->updates_filtered++;
+ channel_rte_trace_in(D_FILTERS, c, new, "filtered out");
+
+ if (c->in_keep & RIK_REJECTED)
+ new->flags |= REF_FILTERED;
+ else
+ new = NULL;
}
- if (filter == FILTER_REJECT)
- {
- stats->imp_updates_filtered++;
- rte_trace_in(D_FILTERS, c, new, "filtered out");
+ if (new && c->proto->mpls_channel)
+ if (mpls_handle_rte(c->proto->mpls_channel, n, new, &fec) < 0)
+ {
+ channel_rte_trace_in(D_FILTERS, c, new, "invalid");
+ stats->updates_invalid++;
+ new = NULL;
+ }
+
+ if (new)
+ {
+ new->attrs = ea_lookup_tmp(new->attrs,
+ (c->in_keep & RIK_PREFILTER) ? BIT32_ALL(EALS_PREIMPORT) : 0, EALS_FILTERED);
- if (! c->in_keep_filtered)
- goto drop;
+ if (net_is_flow(n))
+ rt_flowspec_resolve_rte(new, c);
+ else
+ rt_next_hop_resolve_rte(new);
+ }
- /* new is a private copy, i could modify it */
- new->flags |= REF_FILTERED;
- }
- else if (filter)
+ if (new && !rte_validate(c, new))
{
- int fr = f_run(filter, &new, rte_update_pool, 0);
- if (fr > F_ACCEPT)
- {
- stats->imp_updates_filtered++;
- rte_trace_in(D_FILTERS, c, new, "filtered out");
+ channel_rte_trace_in(D_FILTERS, c, new, "invalid");
+ stats->updates_invalid++;
+ new = NULL;
+ }
+ }
+ else
+ stats->withdraws_received++;
- if (! c->in_keep_filtered)
- goto drop;
+ rte_import(&c->in_req, n, new, src);
- new->flags |= REF_FILTERED;
- }
- }
+ if (fec)
+ {
+ mpls_unlock_fec(fec);
+ DBGL( "Unlock FEC %p (rte_update %N)", fec, n);
+ }
+}
- if (p->mpls_map)
- {
- if (mpls_handle_rte(p->mpls_map, n, new, rte_update_pool, &fec) < 0)
- {
- rte_trace_in(D_FILTERS, c, new, "invalid");
- stats->imp_updates_invalid++;
- goto drop;
- }
- }
+void
+rte_import(struct rt_import_request *req, const net_addr *n, rte *new, struct rte_src *src)
+{
+ struct rt_import_hook *hook = req->hook;
+ if (!hook)
+ {
+ log(L_WARN "%s: Called rte_import without import hook", req->name);
+ return;
+ }
- if (!rta_is_cached(new->attrs)) /* Need to copy attributes */
- new->attrs = rta_lookup(new->attrs);
- new->flags |= REF_COW;
+ RT_LOCKED(hook->table, tab)
+ {
+ u32 bs = atomic_load_explicit(&tab->routes_block_size, memory_order_acquire);
- /* Use the actual struct network, not the dummy one */
- nn = net_get(c->table, n);
- new->net = nn;
- }
- else
+ struct netindex *i;
+ net *routes = atomic_load_explicit(&tab->routes, memory_order_acquire);
+ net *nn;
+ if (new)
{
- stats->imp_withdraws_received++;
+ /* An update */
+ /* Set auxiliary values */
+ new->stale_cycle = hook->stale_set;
+ new->sender = hook;
- if (!(nn = net_find(c->table, n)) || !src)
- {
- stats->imp_withdraws_ignored++;
- rte_update_unlock();
- return;
- }
- }
+ /* Allocate the key structure */
+ i = net_get_index(tab->netindex, n);
+ new->net = i->addr;
- recalc:
- /* And recalculate the best route */
- rte_recalculate(c, nn, new, src);
+ /* Block size update */
+ u32 nbs = bs;
+ while (i->index >= nbs)
+ nbs *= 2;
- if (p->mpls_map)
- mpls_handle_rte_cleanup(p->mpls_map, &fec);
+ if (nbs > bs)
+ {
+ net *nb = mb_alloc(tab->rp, nbs * sizeof *nb);
+ memcpy(&nb[0], routes, bs * sizeof *nb);
+ memset(&nb[bs], 0, (nbs - bs) * sizeof *nb);
+ ASSERT_DIE(atomic_compare_exchange_strong_explicit(
+ &tab->routes, &routes, nb,
+ memory_order_acq_rel, memory_order_relaxed));
+ ASSERT_DIE(atomic_compare_exchange_strong_explicit(
+ &tab->routes_block_size, &bs, nbs,
+ memory_order_acq_rel, memory_order_relaxed));
+ ASSERT_DIE(atomic_compare_exchange_strong_explicit(
+ &tab->export_all.max_feed_index, &bs, nbs,
+ memory_order_acq_rel, memory_order_relaxed));
+ ASSERT_DIE(atomic_compare_exchange_strong_explicit(
+ &tab->export_best.max_feed_index, &bs, nbs,
+ memory_order_acq_rel, memory_order_relaxed));
+
+ synchronize_rcu();
+ mb_free(routes);
+
+ routes = nb;
+ }
- rte_update_unlock();
- return;
+ /* Update table tries */
+ struct f_trie *trie = atomic_load_explicit(&tab->trie, memory_order_relaxed);
+ if (trie)
+ trie_add_prefix(trie, i->addr, i->addr->pxlen, i->addr->pxlen);
+
+ if (tab->trie_new)
+ trie_add_prefix(tab->trie_new, i->addr, i->addr->pxlen, i->addr->pxlen);
+ }
+ else if ((i = net_find_index(tab->netindex, n)) && (i->index < bs))
+ /* Found a block we can withdraw from */
+ ;
+ else
+ {
+ /* No route for this net is present at all. Ignore right now. */
+ req->hook->stats.withdraws_ignored++;
+ if (req->trace_routes & D_ROUTES)
+ log(L_TRACE "%s > ignored %N withdraw", req->name, n);
+ return;
+ }
- drop:
- rte_free(new);
- new = NULL;
- if (nn = net_find(c->table, n))
- goto recalc;
+ /* Resolve the net structure */
+ nn = &routes[i->index];
- rte_update_unlock();
+ /* Recalculate the best route. */
+ rte_recalculate(tab, hook, i, nn, new, src);
+ }
}
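
The routes-block growth inside rte_import() above follows a publish-then-reclaim pattern: allocate the larger copy, publish the new pointer before the new size (so a reader doing an acquire load of the size and then of the pointer can never see the new size together with the old array), and free the old block only after synchronize_rcu() guarantees no reader still holds it. A condensed sketch of just that pattern on hypothetical toy globals, assuming the usual BIRD headers and that the caller holds the table's write lock, as rte_import() does.

/* Editorial sketch, not part of the patch. */
static net * _Atomic toy_routes;
static _Atomic u32 toy_routes_size;

static void
toy_routes_grow(pool *p, u32 need)
{
  u32 bs = atomic_load_explicit(&toy_routes_size, memory_order_acquire);
  if (need < bs)
    return;

  net *old = atomic_load_explicit(&toy_routes, memory_order_acquire);

  u32 nbs = bs;
  while (need >= nbs)
    nbs *= 2;

  net *nb = mb_alloc(p, nbs * sizeof *nb);
  memcpy(nb, old, bs * sizeof *nb);
  memset(&nb[bs], 0, (nbs - bs) * sizeof *nb);

  /* Publish the array first, then the size: a reader that observes the new
   * size via an acquire load is guaranteed to also see the new array. */
  atomic_store_explicit(&toy_routes, nb, memory_order_release);
  atomic_store_explicit(&toy_routes_size, nbs, memory_order_release);

  /* Wait until no RCU reader can still be using the old block */
  synchronize_rcu();
  mb_free(old);
}
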
-/* Independent call to rte_announce(), used from next hop
- recalculation, outside of rte_update(). new must be non-NULL */
-static inline void
-rte_announce_i(rtable *tab, uint type, net *net, rte *new, rte *old,
- rte *new_best, rte *old_best)
+/*
+ * Feeding
+ */
+
+static net *
+rt_net_feed_get_net(struct rtable_reading *tr, uint index)
{
- rte_update_lock();
- rte_announce(tab, type, net, new, old, new_best, old_best);
- rte_update_unlock();
+ /* Get the route block from the table */
+ net *routes = atomic_load_explicit(&tr->t->routes, memory_order_acquire);
+ u32 bs = atomic_load_explicit(&tr->t->routes_block_size, memory_order_acquire);
+
+ /* Nothing to actually feed */
+ if (index >= bs)
+ return NULL;
+
+ /* We have a net to feed! */
+ return &routes[index];
}
-static inline void
-rte_discard(rte *old) /* Non-filtered route deletion, used during garbage collection */
+static const struct rt_pending_export *
+rt_net_feed_validate_first(
+ struct rtable_reading *tr,
+ const struct rt_pending_export *first_in_net,
+ const struct rt_pending_export *last_in_net,
+ const struct rt_pending_export *first)
{
- rte_update_lock();
- rte_recalculate(old->sender, old->net, NULL, old->src);
- rte_update_unlock();
+ /* Inconsistent input */
+ if (!first_in_net != !last_in_net)
+ RT_READ_RETRY(tr);
+
+ if (!first)
+ return first_in_net;
+
+ /* Export item validity check: we must find it between first_in_net and last_in_net */
+ const struct rt_pending_export *rpe = first_in_net;
+ while (rpe)
+ if (rpe == first)
+ return first;
+ else if (rpe == last_in_net)
+ /* Reached the end without finding the requested export */
+ break;
+ else
+ rpe = atomic_load_explicit(&rpe->next, memory_order_acquire);
+
+ /* Not found, inconsistent export, retry */
+ RT_READ_RETRY(tr);
}
-/* Modify existing route by protocol hook, used for long-lived graceful restart */
-static inline void
-rte_modify(rte *old)
+static struct rt_export_feed *
- rt_net_feed_index(struct rtable_reading *tr, net *n, _Bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_pending_export *first)
++rt_net_feed_index(struct rtable_reading *tr, net *n, bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_pending_export *first)
{
- rte_update_lock();
+ /* Get the feed itself. It may change under our hands, though. */
+ struct rt_pending_export *first_in_net, *last_in_net;
+ first_in_net = atomic_load_explicit(&n->all.first, memory_order_acquire);
+ last_in_net = atomic_load_explicit(&n->all.last, memory_order_acquire);
+
+ first = rt_net_feed_validate_first(tr, first_in_net, last_in_net, first);
+
+ /* Count the elements */
+ uint rcnt = rte_feed_count(tr, n);
+ uint ecnt = 0;
+ uint ocnt = 0;
+ for (const struct rt_pending_export *rpe = first; rpe;
+ rpe = atomic_load_explicit(&rpe->next, memory_order_acquire))
+ {
+ ecnt++;
+ if (rpe->it.old)
+ ocnt++;
+ }
+
+ if (ecnt) {
+ const net_addr *a = (first->it.new ?: first->it.old)->net;
+ if (prefilter && !prefilter(f, a))
+ return NULL;
+ }
+
+ struct rt_export_feed *feed = NULL;
- rte *new = old->sender->proto->rte_modify(old, rte_update_pool);
- if (new != old)
+ if (rcnt || ocnt || ecnt)
{
- if (new)
+ if (!ecnt && prefilter && !prefilter(f, NET_READ_BEST_ROUTE(tr, n)->rte.net))
+ return NULL;
+
+ feed = rt_alloc_feed(rcnt+ocnt, ecnt);
+
+ if (rcnt)
+ rte_feed_obtain_copy(tr, n, feed->block, rcnt);
+
+ if (ecnt)
{
- if (!rta_is_cached(new->attrs))
- new->attrs = rta_lookup(new->attrs);
- new->flags = (old->flags & ~REF_MODIFY) | REF_COW;
+ uint e = 0;
+ uint rpos = rcnt;
+ for (const struct rt_pending_export *rpe = first; rpe;
+ rpe = atomic_load_explicit(&rpe->next, memory_order_acquire))
+ if (e >= ecnt)
+ RT_READ_RETRY(tr);
+ else
+ {
+ feed->exports[e++] = rpe->it.seq;
+
+ /* Copy also obsolete routes */
+ if (rpe->it.old)
+ {
+ ASSERT_DIE(rpos < rcnt + ocnt);
+ feed->block[rpos++] = *rpe->it.old;
+ ea_free_later(ea_ref(rpe->it.old->attrs));
+ }
+ }
+
+ ASSERT_DIE(e == ecnt);
}
- rte_recalculate(old->sender, old->net, new, old->src);
+ feed->ni = NET_TO_INDEX(feed->block[0].net);
}
- rte_update_unlock();
+ /* Check that it indeed didn't change and the last export is still the same. */
+ if (
+ (first_in_net != atomic_load_explicit(&n->all.first, memory_order_acquire))
+ || (last_in_net != atomic_load_explicit(&n->all.last, memory_order_acquire)))
+ RT_READ_RETRY(tr);
+
+ return feed;
}
-/* Check rtable for best route to given net whether it would be exported do p */
-int
-rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter)
+static struct rt_export_feed *
- rt_net_feed_internal(struct rtable_reading *tr, u32 index, _Bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_pending_export *first)
++rt_net_feed_internal(struct rtable_reading *tr, u32 index, bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_pending_export *first)
{
- struct proto *p = c->proto;
- net *n = net_find(t, a);
- rte *rt = n ? n->routes : NULL;
+ net *n = rt_net_feed_get_net(tr, index);
+ if (!n)
+ return &rt_feed_index_out_of_range;
- if (!rte_is_valid(rt))
- return 0;
+ return rt_net_feed_index(tr, n, prefilter, f, first);
+}
- rte_update_lock();
+struct rt_export_feed *
+rt_net_feed(rtable *t, const net_addr *a, const struct rt_pending_export *first)
+{
+ RT_READ(t, tr);
+ const struct netindex *ni = net_find_index(tr->t->netindex, a);
+ return ni ? rt_net_feed_internal(tr, ni->index, NULL, NULL, first) : NULL;
+}
- /* Rest is stripped down export_filter() */
- int v = p->preexport ? p->preexport(c, rt) : 0;
- if (v == RIC_PROCESS)
- v = (f_run(filter, &rt, rte_update_pool, FF_SILENT) <= F_ACCEPT);
+static struct rt_export_feed *
- rt_feed_net_all(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, _Bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first)
++rt_feed_net_all(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first)
+{
+ RT_READ_ANCHORED(SKIP_BACK(rtable, export_all, e), tr, u);
+ return rt_net_feed_internal(tr, index, prefilter, f, SKIP_BACK(const struct rt_pending_export, it, _first));
+}
- /* Discard temporary rte */
- if (rt != n->routes)
- rte_free(rt);
+rte
+rt_net_best(rtable *t, const net_addr *a)
+{
+ rte rt = {};
- rte_update_unlock();
+ RT_READ(t, tr);
- return v > 0;
+ struct netindex *i = net_find_index(t->netindex, a);
+ net *n = i ? net_find(tr, i) : NULL;
+ if (!n)
+ return rt;
+
+ struct rte_storage *e = NET_READ_BEST_ROUTE(tr, n);
+ if (!e || !rte_is_valid(&e->rte))
+ return rt;
+
+ ASSERT_DIE(e->rte.net == i->addr);
+ ea_free_later(ea_ref(e->rte.attrs));
+ return RTE_COPY(e);
}
- rt_feed_net_best(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, _Bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first)
+static struct rt_export_feed *
++rt_feed_net_best(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first)
+{
+ SKIP_BACK_DECLARE(rtable, t, export_best, e);
+ SKIP_BACK_DECLARE(const struct rt_pending_export, first, it, _first);
+
+ RT_READ_ANCHORED(t, tr, u);
+
+ net *n = rt_net_feed_get_net(tr, index);
+ if (!n)
+ return &rt_feed_index_out_of_range;
+ /* No more to feed, we are fed up! */
+
+ const struct rt_pending_export *first_in_net, *last_in_net;
+ first_in_net = atomic_load_explicit(&n->best.first, memory_order_acquire);
+ last_in_net = atomic_load_explicit(&n->best.last, memory_order_acquire);
+ first = rt_net_feed_validate_first(tr, first_in_net, last_in_net, first);
+
+ uint ecnt = 0;
+ for (const struct rt_pending_export *rpe = first; rpe;
+ rpe = atomic_load_explicit(&rpe->next, memory_order_acquire))
+ ecnt++;
+
+ if (ecnt) {
+ const net_addr *a = (first->it.new ?: first->it.old)->net;
+ if (prefilter && !prefilter(f, a))
+ return NULL;
+ }
-/**
- * rt_refresh_begin - start a refresh cycle
- * @t: related routing table
- * @c related channel
- *
- * This function starts a refresh cycle for given routing table and announce
- * hook. The refresh cycle is a sequence where the protocol sends all its valid
- * routes to the routing table (by rte_update()). After that, all protocol
- * routes (more precisely routes with @c as @sender) not sent during the
- * refresh cycle but still in the table from the past are pruned. This is
+ struct rte_storage *best = NET_READ_BEST_ROUTE(tr, n);
+
+ if (!ecnt && (!best || prefilter && !prefilter(f, best->rte.net)))
+ return NULL;
+
+ struct rt_export_feed *feed = rt_alloc_feed(!!best, ecnt);
+ if (best)
+ {
+ feed->block[0] = best->rte;
+ feed->ni = NET_TO_INDEX(best->rte.net);
+ }
+ else
+ feed->ni = NET_TO_INDEX((first->it.new ?: first->it.old)->net);
+
+ if (ecnt)
+ {
+ uint e = 0;
+ for (const struct rt_pending_export *rpe = first; rpe;
+ rpe = atomic_load_explicit(&rpe->next, memory_order_acquire))
+ if (e >= ecnt)
+ RT_READ_RETRY(tr);
+ else
+ feed->exports[e++] = rpe->it.seq;
+
+ ASSERT_DIE(e == ecnt);
+ }
+
+ /* Check that it indeed didn't change and the last export is still the same. */
+ if (
+ (first_in_net != atomic_load_explicit(&n->best.first, memory_order_acquire))
+ || (last_in_net != atomic_load_explicit(&n->best.last, memory_order_acquire)))
+ RT_READ_RETRY(tr);
+
+ /* And we're finally done */
+ return feed;
+}
+
+
+/* Check rtable for the best route to the given net and whether it would be exported to protocol p */
+int
+rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter)
+{
+ rte rt = rt_net_best(t, a);
+
+ int v = c->proto->preexport ? c->proto->preexport(c, &rt) : 0;
+ if (v == RIC_PROCESS)
+ v = (f_run(filter, &rt, FF_SILENT) <= F_ACCEPT);
+
+ return v > 0;
+}
+
+static inline void
+rt_set_import_state(struct rt_import_hook *hook, u8 state)
+{
+ hook->last_state_change = current_time();
+ hook->import_state = state;
+
+ CALL(hook->req->log_state_change, hook->req, state);
+}
+
+void
+rt_request_import(rtable *t, struct rt_import_request *req)
+{
+ RT_LOCKED(t, tab)
+ {
+ rt_lock_table(tab);
+
+ struct rt_import_hook *hook = req->hook = mb_allocz(tab->rp, sizeof(struct rt_import_hook));
+
+ DBG("Lock table %s for import %p req=%p uc=%u\n", tab->name, hook, req, tab->use_count);
+
+ hook->req = req;
+ hook->table = t;
+
+ rt_set_import_state(hook, TIS_UP);
+ add_tail(&tab->imports, &hook->n);
+ }
+}
+
+void
+rt_stop_import(struct rt_import_request *req, void (*stopped)(struct rt_import_request *))
+{
+ ASSERT_DIE(req->hook);
+ struct rt_import_hook *hook = req->hook;
+
+ RT_LOCKED(hook->table, tab)
+ {
+ rt_set_import_state(hook, TIS_STOP);
+ hook->stopped = stopped;
+
+ rt_refresh_trace(tab, hook, "stop import");
+
+ /* Cancel table rr_counter */
+ if (hook->stale_set != hook->stale_pruned)
+ tab->rr_counter -= ((int) hook->stale_set - (int) hook->stale_pruned);
+
+ tab->rr_counter++;
+
+ hook->stale_set = hook->stale_pruned = hook->stale_pruning = hook->stale_valid = 0;
+
+ rt_schedule_prune(tab);
+ }
+}
+
+
+/**
+ * rt_refresh_begin - start a refresh cycle
+ * @t: related routing table
+ * @c: related channel
+ *
+ * This function starts a refresh cycle for given routing table and announce
+ * hook. The refresh cycle is a sequence where the protocol sends all its valid
+ * routes to the routing table (by rte_update()). After that, all protocol
+ * routes (more precisely routes with @c as @sender) not sent during the
+ * refresh cycle but still in the table from the past are pruned. This is
* implemented by marking all related routes as stale by REF_STALE flag in
* rt_refresh_begin(), then marking all related stale routes with REF_DISCARD
* flag in rt_refresh_end() and then removing such routes in the prune loop.
void
rt_flowspec_unlink(rtable *src, rtable *dst)
{
- struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst);
+ birdloop_enter(dst->loop);
- _Bool unlock_dst = 0;
- ASSERT(ln && (ln->uc > 0));
++ bool unlock_dst = 0;
- ln->uc--;
-
- if (!ln->uc)
+ struct rt_flowspec_link *ln;
+ RT_LOCKED(src, t)
{
- rem_node(&ln->n);
- mb_free(ln);
+ ln = rt_flowspec_find_link(t, dst);
- rt_unlock_table(src);
- rt_unlock_table(dst);
+ ASSERT(ln && (ln->uc > 0));
+
+ if (!--ln->uc)
+ {
+ rt_flowspec_link_rem_node(&t->flowspec_links, ln);
+ rtex_export_unsubscribe(&ln->req);
+ ev_postpone(&ln->event);
+ mb_free(ln);
+ unlock_dst = 1;
+ }
}
+
+ if (unlock_dst)
+ rt_unlock_table(dst);
+
+ birdloop_leave(dst->loop);
}
static void
rt_init(void)
{
rta_init();
- rt_table_pool = rp_new(&root_pool, "Routing tables");
- rte_update_pool = lp_new_default(rt_table_pool);
- rte_slab = sl_new(rt_table_pool, sizeof(rte));
+ rt_table_pool = rp_new(&root_pool, the_bird_domain.the_bird, "Routing tables");
init_list(&routing_tables);
- static _Bool
+ init_list(&deleted_routing_tables);
+ ev_init_list(&rt_cork.queue, &main_birdloop, "Route cork release");
+ rt_cork.dom = DOMAIN_NEW_RCU_SYNC(resource);
+ idm_init(&rtable_idm, rt_table_pool, 256);
+
+ ea_register_init(&ea_roa_aggregated);
+}
+
- _Bool stale = (s->import_state == TIS_FLUSHING);
++static bool
+rt_prune_net(struct rtable_private *tab, struct network *n)
+{
+ NET_WALK_ROUTES(tab, n, ep, e)
+ {
+ ASSERT_DIE(!(e->flags & REF_OBSOLETE));
+ struct rt_import_hook *s = e->rte.sender;
+
++ bool stale = (s->import_state == TIS_FLUSHING);
+
+ if (!stale)
+ {
+
+ /*
+ * The range of 0..256 is split by s->stale_* like this:
+ *
+ * pruned pruning valid set
+ * | | | |
+ * 0 v v v v 256
+ * |...........................+++++++++++........|
+ *
+ * We want to drop everything outside the marked range, thus
+ * (e->rte.stale_cycle < s->stale_valid) ||
+ * (e->rte.stale_cycle > s->stale_set))
+ * looks right.
+ *
+ * But the counters may wrap around, and in the following situation, the naive check above would prune all the routes:
+ *
+ * set pruned pruning valid
+ * | | | |
+ * 0 v v v v 256
+ * |++++++..................................++++++|
+ *
+ * In that case, we want
+ * (e->rte.stale_cycle > s->stale_valid) ||
+ * (e->rte.stale_cycle < s->stale_set))
+ *
+ * Full logic table:
+ *
+ * permutation | result | (S < V) + (S < SC) + (SC < V)
+ * -----------------+----------+---------------------------------
+ * SC < V <= S | prune | 0 + 0 + 1 = 1
+ * S < SC < V | prune | 1 + 1 + 1 = 3
+ * V <= S < SC | prune | 0 + 1 + 0 = 1
+ * SC <= S < V | keep | 1 + 0 + 1 = 2
+ * V <= SC <= S | keep | 0 + 0 + 0 = 0
+ * S < V <= SC | keep | 1 + 1 + 0 = 2
+ *
+ * Now the following code hopefully makes sense.
+ */
+
+ int sv = (s->stale_set < s->stale_valid);
+ int ssc = (s->stale_set < e->rte.stale_cycle);
+ int scv = (e->rte.stale_cycle < s->stale_valid);
+ stale = (sv + ssc + scv) & 1;
+ }
+
+ /* By the C standard, the comparisons above yield 0 or 1, so either the importer
+ * is flushing and stale is already 1, or the sum (sv + ssc + scv) is between
+ * 0 and 3, where even values say "keep" and odd values say "prune". */
+
+ if (stale)
+ {
+ /* Announce withdrawal */
+ struct netindex *i = RTE_GET_NETINDEX(&e->rte);
+ rte_recalculate(tab, e->rte.sender, i, n, NULL, e->rte.src);
+ return 1;
+ }
+ }
+ return 0;
}
#endif
}
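
The keep/prune decision above reduces to a parity trick over three comparisons of
wrap-around u8 counters. The following standalone check, illustrative only and not
part of the patch, brute-forces all combinations and confirms that the parity
formula agrees with the intuitive "stale_cycle lies in the circular range
[stale_valid, stale_set]" predicate:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Keep iff sc lies in the circular (wrap-around) range [valid, set]. */
static int in_circular_range(uint8_t set, uint8_t valid, uint8_t sc)
{
  return (valid <= set) ? ((valid <= sc) && (sc <= set))
                        : ((sc <= set) || (valid <= sc));
}

int main(void)
{
  for (unsigned set = 0; set < 256; set++)
    for (unsigned valid = 0; valid < 256; valid++)
      for (unsigned sc = 0; sc < 256; sc++)
      {
        int sv  = (set < valid);
        int ssc = (set < sc);
        int scv = (sc < valid);

        /* Odd sum means "prune", exactly as in rt_prune_net() above. */
        assert(((sv + ssc + scv) & 1) == !in_circular_range(set, valid, sc));
      }

  puts("parity formula matches the circular-range check for all inputs");
  return 0;
}
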
+static inline void
+rt_flowspec_resolve_rte(rte *r, struct channel *c)
+{
+#ifdef CONFIG_BGP
+ enum flowspec_valid valid, old = rt_get_flowspec_valid(r);
+ struct bgp_channel *bc = (struct bgp_channel *) c;
-static inline int
-rt_next_hop_update_net(rtable *tab, net *n)
+ if ( (rt_get_source_attr(r) == RTS_BGP)
+ && (c->class == &channel_bgp)
+ && (bc->base_table))
+ {
+ SKIP_BACK_DECLARE(struct bgp_proto, p, p, bc->c.proto);
+ RT_LOCKED(c->in_req.hook->table, tab)
+ valid = rt_flowspec_check(
+ bc->base_table, tab,
+ r->net, r->attrs, p->is_interior);
+ }
+ else
+ valid = FLOWSPEC_UNKNOWN;
+
+ if (valid == old)
+ return;
+
+ if (valid == FLOWSPEC_UNKNOWN)
+ ea_unset_attr(&r->attrs, 0, &ea_gen_flowspec_valid);
+ else
+ ea_set_attr_u32(&r->attrs, &ea_gen_flowspec_valid, 0, valid);
+#endif
+}
+
+static inline void
+rt_next_hop_update_net(struct rtable_private *tab, struct netindex *ni, net *n)
{
- rte **k, *e, *new, *old_best, **new_best;
- int count = 0;
- int free_old_best = 0;
+ uint count = 0;
+ int is_flow = net_val_match(tab->addr_type, NB_FLOW);
- old_best = n->routes;
+ struct rte_storage *old_best = NET_BEST_ROUTE(tab, n);
if (!old_best)
- return 0;
+ return;
+
+ NET_WALK_ROUTES(tab, n, ep, e)
+ count++;
+
+ if (!count)
+ return;
+
+ struct rte_multiupdate {
+ struct rte_storage *old, *new_stored;
+ rte new;
+ } *updates = tmp_allocz(sizeof(struct rte_multiupdate) * (count+1));
+
+ uint pos = 0;
+ NET_WALK_ROUTES(tab, n, ep, e)
+ updates[pos++].old = e;
+
+ uint mod = 0;
+ if (is_flow)
+ for (uint i = 0; i < pos; i++)
+ mod += rt_flowspec_update_rte(tab, &updates[i].old->rte, &updates[i].new);
+
+ else
+ for (uint i = 0; i < pos; i++)
+ mod += rt_next_hop_update_rte(&updates[i].old->rte, &updates[i].new);
+
+ if (!mod)
+ return;
+
+ /* We add a spinlock sentinel to the beginning */
+ struct rte_storage local_sentinel = {
+ .flags = REF_OBSOLETE,
+ .next = old_best,
+ };
+ atomic_store_explicit(&n->routes, &local_sentinel, memory_order_release);
- for (k = &n->routes; e = *k; k = &e->next)
+ /* Now we mark all the old routes obsolete */
+ for (uint i = 0; i < pos; i++)
+ if (updates[i].new.attrs)
+ updates[i].old->flags |= REF_OBSOLETE;
+
+ /* Wait for readers */
+ synchronize_rcu();
+
+ /* And now we go backwards to keep the list properly linked */
+ struct rte_storage *next = NULL;
+ for (int i = pos - 1; i >= 0; i--)
{
- if (!net_is_flow(n->n.addr))
- new = rt_next_hop_update_rte(tab, e);
+ struct rte_storage *this;
+ if (updates[i].new.attrs)
+ {
+ rte *new = &updates[i].new;
+ new->lastmod = current_time();
+ new->id = hmap_first_zero(&tab->id_map);
+ hmap_set(&tab->id_map, new->id);
+ this = updates[i].new_stored = rte_store(new, ni, tab);
+ }
else
- new = rt_flowspec_update_rte(tab, e);
+ this = updates[i].old;
- if (new)
- {
- *k = new;
+ atomic_store_explicit(&this->next, next, memory_order_release);
+ next = this;
+ }
- rte_trace_in(D_ROUTES, new->sender, new, "updated");
- rte_announce_i(tab, RA_ANY, n, new, e, NULL, NULL);
+ /* Add behind the sentinel */
+ atomic_store_explicit(&local_sentinel.next, next, memory_order_release);
- /* Call a pre-comparison hook */
+ /* Call the pre-comparison hooks */
+ for (uint i = 0; i < pos; i++)
+ if (updates[i].new_stored)
+ {
/* Not really an efficient way to compute this */
- if (e->src->proto->rte_recalculate)
- e->src->proto->rte_recalculate(tab, n, new, e, NULL);
+ if (updates[i].old->rte.src->owner->rte_recalculate)
+ updates[i].old->rte.src->owner->rte_recalculate(tab, n, updates[i].new_stored, updates[i].old, old_best);
+ }
- if (e != old_best)
- rte_free_quick(e);
- else /* Freeing of the old best rte is postponed */
- free_old_best = 1;
+ /* Find the new best route */
+ uint best_pos = 0;
+ struct rte_storage *new_best = updates[0].new_stored ?: updates[0].old;
- e = new;
- count++;
- }
+ for (uint i = 1; i < pos; i++)
+ {
+ struct rte_storage *s = updates[i].new_stored ?: updates[i].old;
+ if (rte_better(&s->rte, &new_best->rte))
+ {
+ best_pos = i;
+ new_best = s;
+ }
}
- if (!count)
- return 0;
+ /* Relink the new best route to the first position */
+ struct rte_storage * _Atomic *best_prev;
+ if (best_pos)
+ best_prev = &(updates[best_pos-1].new_stored ?: updates[best_pos-1].old)->next;
+ else
+ best_prev = &local_sentinel.next;
- /* Find the new best route */
- new_best = NULL;
- for (k = &n->routes; e = *k; k = &e->next)
+ /* Unlink from the original place */
+ atomic_store_explicit(best_prev,
+ atomic_load_explicit(&new_best->next, memory_order_relaxed),
+ memory_order_release);
+
+ /* Link out */
+ atomic_store_explicit(&new_best->next,
+ atomic_load_explicit(&local_sentinel.next, memory_order_relaxed),
+ memory_order_release);
+
+ /* Now we have to announce the routes in the right order
+ * to keep the exports consistent. */
+
+ ASSERT_DIE(updates[0].old == old_best);
+
+ /* Find the original position of the new best route */
+ uint nbpos = ~0;
+ for (uint i=0; i<count; i++)
+ if ((updates[i].new_stored == new_best) || (updates[i].old == new_best))
{
- if (!new_best || rte_better(e, *new_best))
- new_best = k;
+ nbpos = i;
+ break;
}
+ ASSERT_DIE(~nbpos);
- /* Relink the new best route to the first position */
- new = *new_best;
- if (new != n->routes)
+ struct rt_pending_export *best_rpe =
+ (new_best != old_best) ?
+ rte_announce_to(&tab->export_best, &n->best, &new_best->rte, &old_best->rte)
+ : NULL;
+
+ uint total = 0;
+ u64 last_seq = 0;
+
+ /* Announce the changes */
+ for (uint i=0; i<count; i++)
+ {
+ /* Not changed at all */
+ if (!updates[i].new_stored)
+ continue;
+
+ struct rt_pending_export *this_rpe =
+ rte_announce_to(&tab->export_all, &n->all,
+ &updates[i].new_stored->rte, &updates[i].old->rte);
+
+ ASSERT_DIE(this_rpe);
- _Bool nb = (new_best->rte.src == updates[i].new.src), ob = (i == 0);
++ bool nb = (new_best->rte.src == updates[i].new.src), ob = (i == 0);
+ char info[96];
+ char best_indicator[2][2] = { { ' ', '+' }, { '-', '=' } };
+ bsnprintf(info, sizeof info, "autoupdated [%cbest]", best_indicator[ob][nb]);
+
+ rt_rte_trace_in(D_ROUTES, updates[i].new.sender->req, &updates[i].new, info);
+
+ /* Double announcement of this specific route */
+ if (ob && best_rpe)
{
- *new_best = new->next;
- new->next = n->routes;
- n->routes = new;
+ ASSERT_DIE(best_rpe->it.old == &updates[i].old->rte);
+ ASSERT_DIE(!best_rpe->seq_all);
+ best_rpe->seq_all = this_rpe->it.seq;
}
+ else
+ last_seq = this_rpe->it.seq;
- /* Announce the new best route */
- if (new != old_best)
- rte_trace_in(D_ROUTES, new->sender, new, "updated [best]");
+ total++;
+ }
- /* Propagate changes */
- rte_announce_i(tab, RA_UNDEF, n, NULL, NULL, n->routes, old_best);
+ if (best_rpe && !best_rpe->seq_all)
+ {
+ ASSERT_DIE(!updates[0].new_stored);
+ best_rpe->seq_all = last_seq;
+ }
- if (free_old_best)
- rte_free_quick(old_best);
+ /* Now we can finally release the changes back into the table */
+ atomic_store_explicit(&n->routes, new_best, memory_order_release);
- return count;
+ return;
+}
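
The rebuild above walks the updates back to front, so each node's next pointer is
already in place before the node is linked in, and a lockless reader that picks up
the published head never sees a half-built chain. A tiny standalone sketch of the
same back-to-front publication idea (illustrative names, not BIRD code):

#include <stdatomic.h>
#include <stddef.h>

struct item { int val; struct item * _Atomic next; };

/* Link n items from a scratch array into the list at *head, newest first,
 * publishing each node only after its next pointer is already valid. */
void publish_backwards(struct item * _Atomic *head,
                       struct item *scratch, size_t n)
{
  struct item *tail = NULL;

  for (size_t i = n; i-- > 0; )
  {
    atomic_store_explicit(&scratch[i].next, tail, memory_order_release);
    tail = &scratch[i];
  }

  /* A single release store makes the whole chain visible at once. */
  atomic_store_explicit(head, tail, memory_order_release);
}

/* A lockless reader just acquires the head and walks the chain. */
int sum_list(struct item * _Atomic *head)
{
  int s = 0;
  for (struct item *it = atomic_load_explicit(head, memory_order_acquire);
       it;
       it = atomic_load_explicit(&it->next, memory_order_acquire))
    s += it->val;
  return s;
}
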
+
+static void
+rt_nhu_uncork(callback *cb)
+{
+ RT_LOCKED(SKIP_BACK(rtable, priv.nhu_uncork.cb, cb), tab)
+ {
+ ASSERT_DIE(tab->nhu_corked);
+ ASSERT_DIE(tab->nhu_state == 0);
+
+ /* Reset the state */
+ tab->nhu_state = tab->nhu_corked;
+ tab->nhu_corked = 0;
+ rt_trace(tab, D_STATES, "Next hop updater uncorked");
+
+ ev_send_loop(tab->loop, tab->nhu_event);
+ rt_unlock_table(tab);
+ }
}
static void
}
}
-void
-rt_prune_sync(rtable *t, int all)
+
+static int
+rt_reconfigure(struct rtable_private *tab, struct rtable_config *new, struct rtable_config *old)
{
- struct fib_iterator fit;
+ if ((new->addr_type != old->addr_type) ||
+ (new->sorted != old->sorted) ||
+ (new->trie_used != old->trie_used))
+ return 0;
- FIB_ITERATE_INIT(&fit, &t->fib);
+ ASSERT_DIE(new->master.setup == old->master.setup);
+ ASSERT_DIE(new->master.stop == old->master.stop);
-again:
- FIB_ITERATE_START(&t->fib, &fit, net, n)
- {
- rte *e, **ee = &n->routes;
+ DBG("\t%s: same\n", new->name);
+ new->table = RT_PUB(tab);
+ tab->name = new->name;
+ tab->config = new;
+ tab->debug = new->debug;
+ tab->export_all.trace_routes = tab->export_best.trace_routes = new->debug;
- while (e = *ee)
- {
- if (all || (e->flags & (REF_STALE | REF_DISCARD)))
- {
- *ee = e->next;
- rte_free_quick(e);
- t->rt_count--;
- }
- else
- ee = &e->next;
- }
+ if (tab->hostcache)
+ tab->hostcache->req.trace_routes = new->debug;
- if (all || !n->routes)
- {
- FIB_ITERATE_PUT(&fit);
- fib_delete(&t->fib, n);
- goto again;
- }
- }
- FIB_ITERATE_END;
-}
+ WALK_TLIST(rt_flowspec_link, ln, &tab->flowspec_links)
+ ln->req.trace_routes = new->debug;
+ tab->cork_threshold = new->cork_threshold;
-/*
- * Export table
- */
+ if (new->cork_threshold.high != old->cork_threshold.high)
+ rt_check_cork_high(tab);
-int
-rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, int refeed)
-{
- struct rtable *tab = c->out_table;
- struct rte_src *src;
- rte *old, **pos;
- net *net;
+ if (new->cork_threshold.low != old->cork_threshold.low)
+ rt_check_cork_low(tab);
- if (new)
- {
- net = net_get(tab, n);
- src = new->src;
+ if (tab->export_digest && (
+ (new->digest_settle.min != tab->export_digest->settle.cf.min)
+ || (new->digest_settle.max != tab->export_digest->settle.cf.max)))
+ tab->export_digest->settle.cf = new->digest_settle;
- if (!rta_is_cached(new->attrs))
- new->attrs = rta_lookup(new->attrs);
- }
- else
- {
- net = net_find(tab, n);
- src = old0->src;
+ return 1;
+}
- if (!net)
- goto drop_withdraw;
- }
+static struct rtable_config *
+rt_find_table_config(struct config *cf, char *name)
+{
+ struct symbol *sym = cf_find_symbol(cf, name);
+ return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL;
+}
+
+/**
+ * rt_commit - commit new routing table configuration
+ * @new: new configuration
+ * @old: original configuration or %NULL if it's boot time config
+ *
+ * Scan differences between @old and @new configuration and modify
+ * the routing tables according to these changes. If @new defines a
+ * previously unknown table, create it, if it omits a table existing
+ * in @old, schedule it for deletion (it gets deleted when all protocols
+ * disconnect from it by calling rt_unlock_table()), if it exists
+ * in both configurations, leave it unchanged.
+ */
+void
+rt_commit(struct config *new, struct config *old)
+{
+ struct rtable_config *o, *r;
- /* Find the old rte */
- for (pos = &net->routes; old = *pos; pos = &old->next)
- if ((c->ra_mode != RA_ANY) || (old->src == src))
+ DBG("rt_commit:\n");
+
+ if (old)
{
- if (new && rte_same(old, new))
+ WALK_LIST(o, old->tables)
{
- _Bool ok;
- /* REF_STALE / REF_DISCARD not used in export table */
- /*
- if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
++ bool ok;
+ RT_LOCKED(o->table, tab)
{
- old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
- return 1;
+ r = OBSREF_GET(tab->deleted) ? NULL : rt_find_table_config(new, o->name);
+ ok = r && !new->shutdown && rt_reconfigure(tab, r, o);
}
- */
- goto drop_update;
- }
+ if (ok)
+ continue;
- /* Remove the old rte */
- *pos = old->next;
- rte_free_quick(old);
- tab->rt_count--;
+ birdloop_enter(o->table->loop);
+ RT_LOCKED(o->table, tab)
+ {
+ DBG("\t%s: deleted\n", o->name);
+ OBSREF_SET(tab->deleted, old);
+ rt_check_cork_low(tab);
+ rt_lock_table(tab);
+ rt_unlock_table(tab);
+ }
- break;
+ CALL(o->table->config->master.stop, o->table);
+ birdloop_leave(o->table->loop);
+ }
}
- if (!new)
- {
- if (!old)
- goto drop_withdraw;
-
- if (!net->routes)
- fib_delete(&tab->fib, net);
-
- return 1;
- }
-
- /* Insert the new rte */
- rte *e = rte_do_cow(new);
- e->flags |= REF_COW;
- e->net = net;
- e->sender = c;
- e->lastmod = current_time();
- e->next = *pos;
- *pos = e;
- tab->rt_count++;
- return 1;
-
-drop_update:
- return refeed;
-
-drop_withdraw:
- return 0;
+ WALK_LIST(r, new->tables)
+ if (!r->table)
+ {
+ r->table = rt_setup(rt_table_pool, r);
+ DBG("\t%s: created\n", r->name);
+ add_tail(&routing_tables, &r->table->n);
+ }
+ DBG("\tdone\n");
}
}
void
-bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px)
+bgp_done_prefix(struct bgp_ptx_private *c, struct bgp_prefix *px, struct bgp_bucket *buck)
{
+ /* BMP hack */
+ if (buck->bmp)
+ return;
+
+ /* Cleanup: We're called from bucket senders. */
+ ASSERT_DIE(px->cur == buck);
rem_node(&px->buck_node);
- HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px);
- if (c->prefix_slab)
- sl_free(px);
- else
- mb_free(px);
+ /* We may want to store the updates */
+ if (c->c->tx_keep)
+ {
+ /* Nothing to be sent right now */
+ px->cur = NULL;
+
+ /* Unref the previously sent version */
+ if (px->last)
+ if (!--px->last->px_uc)
+ bgp_done_bucket(c, px->last);
+
+ /* Ref the currently sent version */
+ if (!IS_WITHDRAW_BUCKET(buck))
+ {
+ px->last = buck;
+ px->last->px_uc++;
+ return;
+ }
+
+ /* Prefixes belonging to the withdraw bucket are always freed */
+ }
+
+ bgp_free_prefix(c, px);
+}
+
+void
+bgp_tx_resend(struct bgp_proto *p, struct bgp_channel *bc)
+{
+ BGP_PTX_LOCK(bc->tx, c);
+
+ ASSERT_DIE(bc->tx_keep);
+ uint seen = 0;
+
+ HASH_WALK(c->prefix_hash, next, px)
+ {
+ if (!px->cur)
+ {
+ ASSERT_DIE(px->last);
+ struct bgp_bucket *last = px->last;
+
+ /* Remove the last reference, we want to resend the route */
+ px->last->px_uc--;
+ px->last = NULL;
+
+ /* And send it once again */
+ seen += bgp_update_prefix(c, px, last);
+ }
+ }
+ HASH_WALK_END;
+
+ if (bc->c.debug & D_EVENTS)
+ log(L_TRACE "%s.%s: TX resending %u routes",
+ bc->c.proto->name, bc->c.name, seen);
+
+ if (seen)
+ bgp_schedule_packet(p->conn, bc, PKT_UPDATE);
+}
+
+/*
+ * Prefix hash table exporter
+ */
+
+static void
+bgp_out_item_done(struct lfjour *j, struct lfjour_item *i)
+{}
+
+static struct rt_export_feed *
- bgp_out_feed_net(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, _Bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first)
++bgp_out_feed_net(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first)
+{
+ ASSERT_DIE(u == NULL);
+ SKIP_BACK_DECLARE(struct bgp_ptx_private, c, exporter, e);
+ ASSERT_DIE(DOMAIN_IS_LOCKED(rtable, c->lock));
+
+ struct netindex *ni = net_resolve_index(c->exporter.netindex, index);
+ if (ni == &net_index_out_of_range)
+ return &rt_feed_index_out_of_range;
+
+ if (ni == NULL)
+ return NULL;
+
+ if (prefilter && !prefilter(f, ni->addr))
+ return NULL;
+
+ struct rt_export_feed *feed = NULL;
+
+ uint count = 0;
+
+ struct bgp_prefix *chain = HASH_FIND_CHAIN(c->prefix_hash, PXH, ni, NULL);
+
+ for (struct bgp_prefix *px = chain; px; px = px->next)
+ if (px->ni == ni)
+ count += !!px->last + !!px->cur;
+
+ if (count)
+ {
+ feed = rt_alloc_feed(count, 0);
+ feed->ni = ni;
+
+ uint pos = 0;
+
+ for (struct bgp_prefix *px = chain; px; px = px->next)
+ if (px->ni == ni)
+ {
+ if (px->cur)
+ feed->block[pos++] = (rte) {
+ .attrs = (px->cur == c->withdraw_bucket) ? NULL : ea_free_later(ea_lookup_slow(px->cur->eattrs, 0, EALS_CUSTOM)),
+ .net = ni->addr,
+ .src = px->src,
+ .lastmod = px->lastmod,
+ .flags = REF_PENDING,
+ };
+
+ if (px->last)
+ feed->block[pos++] = (rte) {
+ .attrs = (px->last == c->withdraw_bucket) ? NULL : ea_free_later(ea_lookup_slow(px->last->eattrs, 0, EALS_CUSTOM)),
+ .net = ni->addr,
+ .src = px->src,
+ .lastmod = px->lastmod,
+ };
+ }
+
+ ASSERT_DIE(pos == count);
+ }
+
+ return feed;
+}
+
+/* TX structures Init and Free */
+
+void
+bgp_init_pending_tx(struct bgp_channel *c)
+{
+ ASSERT_DIE(c->c.out_table == NULL);
+ ASSERT_DIE(c->tx == NULL);
+
+ DOMAIN(rtable) dom = DOMAIN_NEW_RCU_SYNC(rtable);
+ LOCK_DOMAIN(rtable, dom);
+ pool *p = rp_newf(c->pool, dom.rtable, "%s.%s TX", c->c.proto->name, c->c.name);
+
+ struct bgp_ptx_private *bpp = mb_allocz(p, sizeof *bpp);
+
+ bpp->lock = dom;
+ bpp->pool = p;
+ bpp->c = c;
+
+ bgp_init_bucket_table(bpp);
+ bgp_init_prefix_table(bpp);
+
+ bpp->exporter = (struct rt_exporter) {
+ .journal = {
+ .loop = c->c.proto->loop,
+ .item_size = sizeof(struct rt_export_item),
+ .item_done = bgp_out_item_done,
+ },
+ .name = mb_sprintf(c->c.proto->pool, "%s.%s.export", c->c.proto->name, c->c.name),
+ .net_type = c->c.net_type,
+ .max_feed_index = 0,
+ .netindex = c->c.table->netindex,
+ .trace_routes = c->c.debug,
+ .feed_net = bgp_out_feed_net,
+ .domain = dom,
+ };
+
+ rt_exporter_init(&bpp->exporter, &c->cf->ptx_exporter_settle);
+ c->c.out_table = &bpp->exporter;
+
+ c->tx = BGP_PTX_PUB(bpp);
+
+ UNLOCK_DOMAIN(rtable, dom);
+}
+
+void
+bgp_free_pending_tx(struct bgp_channel *bc)
+{
+ if (!bc->tx)
+ return;
+
+ DOMAIN(rtable) dom = bc->tx->lock;
+ LOCK_DOMAIN(rtable, dom);
+ struct bgp_ptx_private *c = &bc->tx->priv;
+
+ bc->c.out_table = NULL;
+ rt_exporter_shutdown(&c->exporter, NULL); /* TODO: actually implement exports */
+
+ /* Move all prefixes to the withdraw bucket to unref the "last" prefixes */
+ struct bgp_bucket *b = bgp_get_withdraw_bucket(c);
+ HASH_WALK(c->prefix_hash, next, px)
+ bgp_update_prefix(c, px, b);
+ HASH_WALK_END;
+
+ /* Flush withdrawals */
+ struct bgp_prefix *px;
+ WALK_LIST_FIRST(px, b->prefixes)
+ bgp_done_prefix(c, px, b);
+
+ /* Flush pending TX */
+ WALK_LIST_FIRST(b, c->bucket_queue)
+ {
+ WALK_LIST_FIRST(px, b->prefixes)
+ bgp_done_prefix(c, px, b);
+ bgp_done_bucket(c, b);
+ }
+
+ /* Consistency and resource leak checks */
+ HASH_WALK(c->prefix_hash, next, n)
+ bug("Stray prefix after cleanup");
+ HASH_WALK_END;
+
+ HASH_FREE(c->prefix_hash);
+ sl_delete(c->prefix_slab);
+ c->prefix_slab = NULL;
+
+ HASH_WALK(c->bucket_hash, next, n)
+ bug("Stray bucket after cleanup");
+ HASH_WALK_END;
+
+ HASH_FREE(c->bucket_hash);
+ sl_delete(c->bucket_slab);
+ c->bucket_slab = NULL;
+
+ rp_free(c->pool);
+
+ UNLOCK_DOMAIN(rtable, dom);
+ DOMAIN_FREE(rtable, dom);
+
+ bc->tx = NULL;
}
long page_size = 0;
#ifdef HAVE_MMAP
-#define KEEP_PAGES_MAIN_MAX 256
-#define KEEP_PAGES_MAIN_MIN 8
-#define CLEANUP_PAGES_BULK 256
+# define KEEP_PAGES_MAX 16384
+# define KEEP_PAGES_MIN 32
+# define KEEP_PAGES_MAX_LOCAL 128
+# define ALLOC_PAGES_AT_ONCE 32
+
+ STATIC_ASSERT(KEEP_PAGES_MIN * 4 < KEEP_PAGES_MAX);
+ STATIC_ASSERT(ALLOC_PAGES_AT_ONCE < KEEP_PAGES_MAX_LOCAL);
+
- static _Bool use_fake = 0;
- static _Bool initialized = 0;
++ static bool use_fake = 0;
++ static bool initialized = 0;
+
+# define PROTECT_PAGE(pg)
+# define UNPROTECT_PAGE(pg)
+
+# if DEBUGGING
+# ifdef ENABLE_EXPENSIVE_CHECKS
+# undef PROTECT_PAGE
+# undef UNPROTECT_PAGE
+# define PROTECT_PAGE(pg) mprotect((pg), page_size, PROT_READ)
+# define UNPROTECT_PAGE(pg) mprotect((pg), page_size, PROT_READ | PROT_WRITE)
+# endif
+
+# define AJSIZE 16384
+
+ static struct alloc_journal {
+ void *fp;
+ void *next;
+ u16 pos;
+ u16 type;
+ uint thread_id;
+ } alloc_journal[AJSIZE];
+
+ _Thread_local int alloc_journal_local_pos = -1;
+ _Atomic int alloc_journal_pos = 0;
+
+# define AJT_ALLOC_LOCAL_HOT 1
+# define AJT_ALLOC_GLOBAL_HOT 2
+# define AJT_ALLOC_COLD_STD 3
+# define AJT_ALLOC_COLD_KEEPER 4
+# define AJT_ALLOC_MMAP 5
+
+# define AJT_FREE_LOCAL_HOT 0x11
+# define AJT_FREE_GLOBAL_HOT 0x12
+
+# define AJT_CLEANUP_NOTHING 0xc0
+# define AJT_CLEANUP_COLD_STD 0xc3
+# define AJT_CLEANUP_COLD_KEEPER 0xc4
+# define AJT_CLEANUP_BEGIN 0xcb
+# define AJT_CLEANUP_END 0xce
+
+# define AJT_FLUSH_LOCAL_BEGIN 0xfb
+# define AJT_FLUSH_LOCAL_END 0xfe
+# define AJT_SCHEDULE_CLEANUP 0xff
+
+ static void
+ ajlog(void *fp, void *next, u16 pos, u16 type)
+ {
+ alloc_journal[(alloc_journal_local_pos = atomic_fetch_add_explicit(&alloc_journal_pos, 1, memory_order_relaxed)) % AJSIZE] = (struct alloc_journal) {
+ .fp = fp,
+ .next = next,
+ .pos = pos,
+ .type = type,
+ .thread_id = THIS_THREAD_ID,
+ };
+ }
-STATIC_ASSERT(KEEP_PAGES_MAIN_MIN * 4 < KEEP_PAGES_MAIN_MAX);
+ struct free_page {
+ node unused[42];
+ struct free_page * _Atomic next;
+ };
+# else /* ! DEBUGGING */
-static bool use_fake = 0;
+# define ajlog(...)
-#if DEBUGGING
-struct free_page {
- node unused[42];
- node n;
-};
-#else
-struct free_page {
- node n;
-};
-#endif
+ struct free_page {
+ struct free_page * _Atomic next;
+ };
-#define EP_POS_MAX ((page_size - OFFSETOF(struct empty_pages, pages)) / sizeof (void *))
+# endif
-struct empty_pages {
- node n;
- uint pos;
- void *pages[0];
-};
+# define WRITE_NEXT(pg, val) do { UNPROTECT_PAGE((pg)); (pg)->next = (val); PROTECT_PAGE((pg)); } while (0)
-struct free_pages {
- list pages; /* List of (struct free_page) keeping free pages without releasing them (hot) */
- list empty; /* List of (struct empty_pages) keeping invalidated pages mapped for us (cold) */
- u16 min, max; /* Minimal and maximal number of free pages kept */
- uint cnt; /* Number of free pages in list */
- event cleanup;
-};
+# define EP_POS_MAX ((page_size - OFFSETOF(struct empty_pages, pages)) / sizeof (void *))
-static void global_free_pages_cleanup_event(void *);
-static void *alloc_cold_page(void);
+ struct empty_pages {
+ struct empty_pages *next;
+ uint pos;
+ void *pages[0];
+ };
-static struct free_pages global_free_pages = {
- .min = KEEP_PAGES_MAIN_MIN,
- .max = KEEP_PAGES_MAIN_MAX,
- .cleanup = { .hook = global_free_pages_cleanup_event },
-};
+ static DOMAIN(resource) empty_pages_domain;
+ static struct empty_pages *empty_pages = NULL;
-uint *pages_kept = &global_free_pages.cnt;
+ static struct free_page * _Atomic page_stack = NULL;
+ static _Thread_local struct free_page * local_page_stack = NULL;
+ static struct free_page page_stack_blocked;
-static void *
-alloc_sys_page(void)
-{
- void *ptr = mmap(NULL, page_size, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ /* Try to replace the page stack head with a cork, until it succeeds. */
+# define PAGE_STACK_GET ({ \
+ struct free_page *fp; \
+ while ((fp = atomic_exchange_explicit(&page_stack, &page_stack_blocked, memory_order_acq_rel)) == &page_stack_blocked) birdloop_yield(); \
+ fp; })
+ /* Reinstate the stack with another value */
+# define PAGE_STACK_PUT(val) ASSERT_DIE(atomic_exchange_explicit(&page_stack, (val), memory_order_acq_rel) == &page_stack_blocked)
- if (ptr == MAP_FAILED)
- die("mmap(%ld) failed: %m", (s64) page_size);
+ static void page_cleanup(void *);
+ static event page_cleanup_event = { .hook = page_cleanup, };
+# define SCHEDULE_CLEANUP do if (initialized && !shutting_down) ev_send(&global_event_list, &page_cleanup_event); while (0)
- return ptr;
-}
+ _Atomic int pages_kept = 0;
+ _Atomic int pages_kept_locally = 0;
+ static _Thread_local int pages_kept_here = 0;
-extern int shutting_down; /* Shutdown requested. */
+ static void *
+ alloc_sys_page(void)
+ {
+ void *ptr = mmap(NULL, page_size * ALLOC_PAGES_AT_ONCE, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+
+ if (ptr == MAP_FAILED)
+ die("mmap(%ld) failed: %m", (s64) page_size);
+
+ return ptr;
+ }
+
+ extern int shutting_down; /* Shutdown requested. */
#else // ! HAVE_MMAP
-#define use_fake 1
+# define use_fake 1
#endif
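
The page stack above is guarded by a cork rather than a mutex: whoever wants to
touch the stack atomically exchanges its head for a dedicated "blocked" sentinel,
works on the detached list in private (so there is no ABA problem), and then puts
a valid head back while other threads spin on the sentinel. A standalone sketch of
that pattern, illustrative only and much simpler than the allocator above:

#include <stdatomic.h>
#include <stddef.h>
#include <sched.h>

struct fp { struct fp * _Atomic next; };

static struct fp * _Atomic stack;
static struct fp blocked;                 /* the cork */

static struct fp *grab(void)              /* in the spirit of PAGE_STACK_GET */
{
  struct fp *head;
  while ((head = atomic_exchange_explicit(&stack, &blocked,
                                          memory_order_acq_rel)) == &blocked)
    sched_yield();                        /* somebody else holds the cork */
  return head;
}

static void put_back(struct fp *val)      /* in the spirit of PAGE_STACK_PUT */
{
  atomic_store_explicit(&stack, val, memory_order_release);
}

void push(struct fp *it)
{
  struct fp *head = grab();
  atomic_store_explicit(&it->next, head, memory_order_relaxed);
  put_back(it);
}

struct fp *pop(void)                      /* NULL if the stack is empty */
{
  struct fp *head = grab();
  if (!head)
  {
    put_back(NULL);
    return NULL;
  }
  put_back(atomic_load_explicit(&head->next, memory_order_relaxed));
  return head;
}
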
void *
--- /dev/null
- _Bool forbidden_when_reading_rcu;
+/*
+ * BIRD Locking
+ *
+ * (c) 2020 Maria Matejka <mq@jmq.cz>
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#undef LOCAL_DEBUG
+
+#undef DEBUG_LOCKING
+
+#include "lib/birdlib.h"
+#include "lib/locking.h"
+#include "lib/resource.h"
+#include "lib/timer.h"
+
+#include "conf/conf.h"
+
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <pthread.h>
+#include <semaphore.h>
+#include <stdatomic.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+/*
+ * Locking subsystem
+ */
+
+#ifdef DEBUGGING
+_Thread_local rw_spinlock *rw_spinlocks_taken[MAX_RWS_AT_ONCE];
+_Thread_local btime rw_spinlocks_time[MAX_RWS_AT_ONCE];
+_Thread_local u32 rw_spinlocks_taken_cnt;
+_Thread_local u32 rw_spinlocks_taken_write;
+#endif
+
+_Thread_local struct lock_order locking_stack = {};
+_Thread_local struct domain_generic **last_locked = NULL;
+
+#define ASSERT_NO_LOCK ASSERT_DIE(last_locked == NULL)
+
+struct domain_generic {
+ pthread_mutex_t mutex;
+ uint order;
- domain_new(uint order, _Bool allow_rcu)
++ bool forbidden_when_reading_rcu;
+ struct domain_generic **prev;
+ struct lock_order *locked_by;
+ const char *name;
+ pool *pool;
+};
+
+#define DOMAIN_INIT(_order, _allow_rcu) { \
+ .mutex = PTHREAD_MUTEX_INITIALIZER, \
+ .order = _order, \
+ .forbidden_when_reading_rcu = !_allow_rcu, \
+}
+
+static struct domain_generic the_bird_domain_gen = DOMAIN_INIT(OFFSETOF(struct lock_order, the_bird), 1);
+
+DOMAIN(the_bird) the_bird_domain = { .the_bird = &the_bird_domain_gen };
+
+struct domain_generic *
++domain_new(uint order, bool allow_rcu)
+{
+ ASSERT_DIE(order < sizeof(struct lock_order));
+ struct domain_generic *dg = xmalloc(sizeof(struct domain_generic));
+ *dg = (struct domain_generic) DOMAIN_INIT(order, allow_rcu);
+ return dg;
+}
+
+void
+domain_free(struct domain_generic *dg)
+{
+ pthread_mutex_destroy(&dg->mutex);
+ xfree(dg);
+}
+
+const char *
+domain_name(struct domain_generic *dg)
+{
+ return dg->name;
+}
+
+uint dg_order(struct domain_generic *dg)
+{
+ return dg->order;
+}
+
+void
+domain_setup(struct domain_generic *dg, const char *name, pool *p)
+{
+ ASSERT_DIE(dg->pool == NULL);
+ dg->pool = p;
+ dg->name = name;
+}
+
+void do_lock(struct domain_generic *dg, struct domain_generic **lsp)
+{
+ struct lock_order stack_copy;
+ memcpy(&stack_copy, &locking_stack, sizeof(stack_copy));
+ struct domain_generic **lll = last_locked;
+
+ if (dg->forbidden_when_reading_rcu)
+ if (rcu_read_active())
+ bug("Locking of this lock forbidden while RCU reader is active");
+ else
+ rcu_blocked++;
+
+ if ((char *) lsp - (char *) &locking_stack != dg->order)
+ bug("Trying to lock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack);
+
+ if (lsp <= last_locked)
+ bug("Trying to lock in a bad order: %p %p", &stack_copy, lll);
+ if (*lsp)
+ bug("Inconsistent locking stack state on lock");
+
+ btime lock_begin = current_time();
+ pthread_mutex_lock(&dg->mutex);
+ btime duration = current_time() - lock_begin;
+ btime wdw = atomic_load_explicit(&global_runtime, memory_order_relaxed)->watchdog_warning;
+ if (wdw && (duration > wdw))
+ log(L_WARN "Locking of %s took %d ms", dg->name, (int) (duration TO_MS));
+
+ if (dg->prev || dg->locked_by)
+ bug("Previous unlock not finished correctly");
+ dg->prev = last_locked;
+ *lsp = dg;
+ last_locked = lsp;
+ dg->locked_by = &locking_stack;
+}
+
+void do_unlock(struct domain_generic *dg, struct domain_generic **lsp)
+{
+ if (dg->forbidden_when_reading_rcu)
+ ASSERT_DIE(rcu_blocked--);
+
+ if ((char *) lsp - (char *) &locking_stack != dg->order)
+ bug("Trying to unlock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack);
+
+ if (dg->locked_by != &locking_stack)
+ bug("Inconsistent domain state on unlock");
+ if ((last_locked != lsp) || (*lsp != dg))
+ bug("Inconsistent locking stack state on unlock");
+ dg->locked_by = NULL;
+ last_locked = dg->prev;
+ *lsp = NULL;
+ dg->prev = NULL;
+ pthread_mutex_unlock(&dg->mutex);
+
+ /* From here on, the dg pointer is invalid! */
+}
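
do_lock()/do_unlock() above enforce a global locking order by giving every domain a
fixed position in struct lock_order and refusing to lock "backwards", which rules
out lock-order-inversion deadlocks by construction. A toy standalone sketch of the
same idea, using a plain per-thread stack of numeric orders (all names invented
here, not BIRD code):

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct ordered_lock {
  pthread_mutex_t mutex;
  unsigned order;                        /* position in the global hierarchy */
  const char *name;
};

static _Thread_local unsigned held[8];   /* orders currently held, innermost last */
static _Thread_local unsigned held_cnt;

static void olock(struct ordered_lock *l)
{
  if (held_cnt >= (sizeof held / sizeof held[0]))
  { fprintf(stderr, "too many nested locks\n"); abort(); }

  if (held_cnt && (held[held_cnt - 1] >= l->order))
  {
    fprintf(stderr, "lock order violation: %s (%u) after %u\n",
            l->name, l->order, held[held_cnt - 1]);
    abort();
  }

  pthread_mutex_lock(&l->mutex);
  held[held_cnt++] = l->order;
}

static void ounlock(struct ordered_lock *l)
{
  /* Strict nesting: only the most recently taken lock may be released. */
  if (!held_cnt || (held[--held_cnt] != l->order))
  { fprintf(stderr, "unbalanced unlock: %s\n", l->name); abort(); }

  pthread_mutex_unlock(&l->mutex);
}

static struct ordered_lock tables  = { .mutex = PTHREAD_MUTEX_INITIALIZER, .order = 10, .name = "tables" };
static struct ordered_lock sockets = { .mutex = PTHREAD_MUTEX_INITIALIZER, .order = 20, .name = "sockets" };

int main(void)
{
  olock(&tables);
  olock(&sockets);   /* fine: strictly increasing order */
  ounlock(&sockets);
  ounlock(&tables);
  /* Swapping the two olock() calls above would be caught and abort(). */
  return 0;
}
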
--- /dev/null
- _Bool
+/*
+ * BIRD -- I/O and event loop
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <pthread.h>
+#include <time.h>
+#include <sys/time.h>
+
+#include "nest/bird.h"
+
+#include "lib/buffer.h"
+#include "lib/defer.h"
+#include "lib/lists.h"
+#include "lib/locking.h"
+#include "lib/resource.h"
+#include "lib/event.h"
+#include "lib/timer.h"
+#include "lib/socket.h"
+
+#include "lib/io-loop.h"
+#include "sysdep/unix/io-loop.h"
+#include "conf/conf.h"
+#include "nest/cli.h"
+
+#define THREAD_STACK_SIZE 65536 /* To be lowered in near future */
+
+static struct birdloop *birdloop_new_no_pickup(pool *pp, uint order, const char *name, ...);
+
+/*
+ * Nanosecond time for accounting purposes
+ *
+ * A fixed point at startup is taken as zero; all other values are relative to it.
+ * Caution: this overflows after roughly 500 years. If you plan to run
+ * BIRD for that long, please implement some means of overflow prevention.
+ */
+
+#if ! HAVE_CLOCK_MONOTONIC_COARSE
+#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC
+#endif
+
+static struct timespec ns_begin;
+
+static void ns_init(void)
+{
+ if (clock_gettime(CLOCK_MONOTONIC_COARSE, &ns_begin))
+ bug("clock_gettime: %m");
+}
+
+#define NSEC_IN_SEC ((u64) (1000 * 1000 * 1000))
+
+u64 ns_now(void)
+{
+ struct timespec ts;
+ if (clock_gettime(CLOCK_MONOTONIC_COARSE, &ts))
+ bug("clock_gettime: %m");
+
+ return (u64) (ts.tv_sec - ns_begin.tv_sec) * NSEC_IN_SEC + ts.tv_nsec - ns_begin.tv_nsec;
+}
+
+#define NSEC_TO_SEC(x) ((x) / NSEC_IN_SEC)
+#define CURRENT_SEC NSEC_TO_SEC(ns_now())
+
+static _Thread_local struct spent_time *account_target_spent_time;
+static _Thread_local u64 *account_target_total;
+static _Thread_local u64 account_last;
+
+static u64 account_finish(void)
+{
+ /* Get current time */
+ u64 now = ns_now();
+ u64 dif = now - account_last;
+
+ /* Update second by second */
+ if (account_target_spent_time)
+ {
+ /* Drop old time information if difference is too large */
+ if (NSEC_TO_SEC(account_last) + TIME_BY_SEC_SIZE - 1 < NSEC_TO_SEC(now))
+ account_last = (NSEC_TO_SEC(now) - TIME_BY_SEC_SIZE + 1) * NSEC_IN_SEC;
+
+ /* Zero new records */
+ if (NSEC_TO_SEC(account_target_spent_time->last_written_ns) + TIME_BY_SEC_SIZE < NSEC_TO_SEC(account_last))
+ memset(account_target_spent_time->by_sec_ns, 0, sizeof(account_target_spent_time->by_sec_ns));
+ else
+ for (u64 fclr = NSEC_TO_SEC(account_target_spent_time->last_written_ns) + 1;
+ fclr <= NSEC_TO_SEC(now);
+ fclr++)
+ account_target_spent_time->by_sec_ns[fclr % TIME_BY_SEC_SIZE] = 0;
+
+ /* Add times second by second */
+ while (NSEC_TO_SEC(account_last) != NSEC_TO_SEC(now))
+ {
+ u64 part = (NSEC_TO_SEC(account_last) + 1) * NSEC_IN_SEC - account_last;
+ account_target_spent_time->by_sec_ns[NSEC_TO_SEC(account_last) % TIME_BY_SEC_SIZE] += part;
+ account_last += part;
+ }
+
+ /* Update the last second */
+ account_target_spent_time->by_sec_ns[NSEC_TO_SEC(account_last) % TIME_BY_SEC_SIZE] += now - account_last;
+
+ /* Store the current time */
+ account_target_spent_time->last_written_ns = now;
+ }
+
+ /* Update the total */
+ if (account_target_total)
+ *account_target_total += dif;
+
+ /* Store current time */
+ account_last = now;
+
+ return dif;
+}
+
+static u64 account_to_spent_time(struct spent_time *st)
+{
+ u64 elapsed = account_finish();
+
+ account_target_spent_time = st;
+ account_target_total = &st->total_ns;
+
+ return elapsed;
+}
+
+static u64 account_to_total(u64 *total)
+{
+ u64 elapsed = account_finish();
+
+ account_target_spent_time = NULL;
+ account_target_total = total;
+
+ return elapsed;
+}
+
+#define account_to(_arg) _Generic((_arg), \
+ struct spent_time *: account_to_spent_time, \
+ u64 *: account_to_total)(_arg)
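
account_to() above uses C11 _Generic to pick the accounting target from the pointer
type at compile time. A trimmed-down standalone demo of that dispatch follows; the
struct and function names below are made up for the example and are not BIRD's:

#include <stdio.h>

struct spent { unsigned long long total_ns; };

static unsigned long long to_spent(struct spent *s) { return s->total_ns; }
static unsigned long long to_total(unsigned long long *t) { return *t; }

#define account_to_demo(arg) _Generic((arg),          \
    struct spent *:        to_spent,                  \
    unsigned long long *:  to_total)(arg)

int main(void)
{
  struct spent s = { .total_ns = 42 };
  unsigned long long t = 7;

  /* The macro expands to a call of the function matching the argument type. */
  printf("%llu %llu\n", account_to_demo(&s), account_to_demo(&t));
  return 0;
}
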
+
+/*
+ * Current thread context
+ */
+
+_Thread_local struct birdloop *birdloop_current;
+static _Thread_local struct birdloop *birdloop_wakeup_masked;
+static _Thread_local uint birdloop_wakeup_masked_count;
+
+#define LOOP_NAME(loop) domain_name((loop)->time.domain)
+#define LATENCY_DEBUG(flags) (atomic_load_explicit(&global_runtime, memory_order_relaxed)->latency_debug & (flags))
+
+#define LOOP_TRACE(loop, flags, fmt, args...) do { if (LATENCY_DEBUG(flags)) log(L_TRACE "%s (%p): " fmt, LOOP_NAME(loop), (loop), ##args); } while (0)
+#define THREAD_TRACE(flags, ...) do { if (LATENCY_DEBUG(flags)) log(L_TRACE "Thread: " __VA_ARGS__); } while (0)
+
+#define LOOP_WARN(loop, fmt, args...) log(L_WARN "%s (%p): " fmt, LOOP_NAME(loop), (loop), ##args)
+
+
+event_list *
+birdloop_event_list(struct birdloop *loop)
+{
+ return &loop->event_list;
+}
+
+struct timeloop *
+birdloop_time_loop(struct birdloop *loop)
+{
+ return &loop->time;
+}
+
+pool *
+birdloop_pool(struct birdloop *loop)
+{
+ return loop->pool;
+}
+
- _Bool
++bool
+birdloop_inside(struct birdloop *loop)
+{
+ for (struct birdloop *c = birdloop_current; c; c = c->prev_loop)
+ if (loop == c)
+ return 1;
+
+ return 0;
+}
+
- static inline _Bool
++bool
+birdloop_in_this_thread(struct birdloop *loop)
+{
+ return pthread_equal(pthread_self(), loop->thread->thread_id);
+}
+
+/*
+ * Wakeup code for birdloop
+ */
+
+void
+pipe_new(struct pipe *p)
+{
+ int rv = pipe(p->fd);
+ if (rv < 0)
+ die("pipe: %m");
+
+ if (fcntl(p->fd[0], F_SETFL, O_NONBLOCK) < 0)
+ die("fcntl(O_NONBLOCK): %m");
+
+ if (fcntl(p->fd[1], F_SETFL, O_NONBLOCK) < 0)
+ die("fcntl(O_NONBLOCK): %m");
+}
+
+void
+pipe_drain(struct pipe *p)
+{
+ while (1) {
+ char buf[64];
+ int rv = read(p->fd[0], buf, sizeof(buf));
+ if ((rv < 0) && (errno == EAGAIN))
+ return;
+
+ if (rv == 0)
+ bug("wakeup read eof");
+ if ((rv < 0) && (errno != EINTR))
+ bug("wakeup read: %m");
+ }
+}
+
+int
+pipe_read_one(struct pipe *p)
+{
+ while (1) {
+ char v;
+ int rv = read(p->fd[0], &v, sizeof(v));
+ if (rv == 1)
+ return 1;
+ if ((rv < 0) && (errno == EAGAIN))
+ return 0;
+ if (rv > 1)
+ bug("wakeup read more bytes than expected: %d", rv);
+ if (rv == 0)
+ bug("wakeup read eof");
+ if (errno != EINTR)
+ bug("wakeup read: %m");
+ }
+}
+
+void
+pipe_kick(struct pipe *p)
+{
+ char v = 1;
+ int rv;
+
+ while (1) {
+ rv = write(p->fd[1], &v, sizeof(v));
+ if ((rv >= 0) || (errno == EAGAIN))
+ return;
+ if (errno != EINTR)
+ bug("wakeup write: %m");
+ }
+}
+
+void
+pipe_pollin(struct pipe *p, struct pfd *pfd)
+{
+ BUFFER_PUSH(pfd->pfd) = (struct pollfd) {
+ .fd = p->fd[0],
+ .events = POLLIN,
+ };
+ BUFFER_PUSH(pfd->loop) = NULL;
+}
+
+void
+pipe_free(struct pipe *p)
+{
+ close(p->fd[0]);
+ close(p->fd[1]);
+}
+
+static inline void
+wakeup_init(struct bird_thread *loop)
+{
+ pipe_new(&loop->wakeup);
+}
+
+static inline void
+wakeup_drain(struct bird_thread *loop)
+{
+ pipe_drain(&loop->wakeup);
+}
+
+static inline void
+wakeup_do_kick(struct bird_thread *loop)
+{
+ pipe_kick(&loop->wakeup);
+}
+
+static inline void
+wakeup_free(struct bird_thread *loop)
+{
+ pipe_free(&loop->wakeup);
+}
+
- sockets_fire(struct birdloop *loop, _Bool read, _Bool write)
++static inline bool
+birdloop_try_ping(struct birdloop *loop, u32 ltt)
+{
+ /* Somebody else is already pinging, be idempotent */
+ if (ltt & LTT_PING)
+ {
+ LOOP_TRACE(loop, DL_PING, "already being pinged");
+ return 0;
+ }
+
+ /* Thread moving is an implicit ping */
+ if (ltt & LTT_MOVE)
+ {
+ LOOP_TRACE(loop, DL_PING, "ping while moving");
+ return 1;
+ }
+
+ /* No more flags allowed */
+ ASSERT_DIE(!ltt);
+
+ /* No ping when not picked up */
+ if (!loop->thread)
+ {
+ LOOP_TRACE(loop, DL_PING, "not picked up yet, can't ping");
+ return 1;
+ }
+
+ /* No ping when masked */
+ if (loop == birdloop_wakeup_masked)
+ {
+ LOOP_TRACE(loop, DL_PING, "wakeup masked, can't ping");
+ birdloop_wakeup_masked_count++;
+ return 1;
+ }
+
+ /* Send meta event to ping */
+ if ((loop != loop->thread->meta) && (loop != &main_birdloop))
+ {
+ LOOP_TRACE(loop, DL_PING, "Ping by meta event to %p", loop->thread->meta);
+ ev_send_loop(loop->thread->meta, &loop->event);
+ return 1;
+ }
+
+ /* Do the real ping of Meta or Main */
+ LOOP_TRACE(loop, DL_WAKEUP, "sending pipe ping");
+ wakeup_do_kick(loop->thread);
+ return 0;
+}
+
+static inline void
+birdloop_do_ping(struct birdloop *loop)
+{
+ /* Register our ping effort */
+ u32 ltt = atomic_fetch_or_explicit(&loop->thread_transition, LTT_PING, memory_order_acq_rel);
+
+ /* Try to ping in multiple ways */
+ if (birdloop_try_ping(loop, ltt))
+ atomic_fetch_and_explicit(&loop->thread_transition, ~LTT_PING, memory_order_acq_rel);
+}
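
The LTT_PING bit set in birdloop_do_ping() coalesces wakeups: many threads may ask
for a ping, but only the caller that flips the bit first pays for the actual kick,
and the bit is dropped again once the ping has been delivered or handled. A minimal
standalone sketch of that flag protocol (names invented here, not BIRD code):

#include <stdatomic.h>
#include <stdbool.h>

#define PING_PENDING 1u

static _Atomic unsigned flags;

/* Called by any thread that wants the loop to wake up. The caller must make
 * its pending work visible before calling this. */
void ping(void (*kick)(void))
{
  unsigned old = atomic_fetch_or_explicit(&flags, PING_PENDING,
                                          memory_order_acq_rel);
  if (old & PING_PENDING)
    return;                  /* somebody else already pinged, piggy-back on it */

  kick();                    /* e.g. write one byte to a wakeup pipe */
}

/* Called by the loop before it processes pending work; returns whether
 * a ping had been requested since the last call. */
bool ping_consume(void)
{
  unsigned old = atomic_fetch_and_explicit(&flags, ~PING_PENDING,
                                           memory_order_acq_rel);
  return old & PING_PENDING;
}
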
+
+void
+birdloop_ping(struct birdloop *loop)
+{
+ if (!birdloop_inside(loop))
+ {
+ LOOP_TRACE(loop, DL_PING, "ping from outside");
+ birdloop_do_ping(loop);
+ }
+ else
+ {
+ LOOP_TRACE(loop, DL_PING, "ping from inside, pending=%d", loop->ping_pending);
+ if (!loop->ping_pending)
+ loop->ping_pending++;
+ }
+}
+
+
+/*
+ * Sockets
+ */
+
+static void
+sockets_init(struct birdloop *loop)
+{
+ init_list(&loop->sock_list);
+ loop->sock_num = 0;
+}
+
+void
+socket_changed(sock *s)
+{
+ struct birdloop *loop = s->loop;
+ ASSERT_DIE(birdloop_inside(loop));
+
+ LOOP_TRACE(loop, DL_SOCKETS, "socket %p changed", s);
+ loop->sock_changed = 1;
+ birdloop_ping(loop);
+}
+
+void
+birdloop_add_socket(struct birdloop *loop, sock *s)
+{
+ ASSERT_DIE(birdloop_inside(loop));
+ ASSERT_DIE(!s->loop);
+
+ LOOP_TRACE(loop, DL_SOCKETS, "adding socket %p (total=%d)", s, loop->sock_num);
+ add_tail(&loop->sock_list, &s->n);
+ loop->sock_num++;
+
+ s->loop = loop;
+ s->index = -1;
+
+ socket_changed(s);
+}
+
+extern sock *stored_sock; /* mainloop hack */
+
+void
+birdloop_remove_socket(struct birdloop *loop, sock *s)
+{
+ ASSERT_DIE(!enlisted(&s->n) == !s->loop);
+
+ if (!s->loop)
+ return;
+
+ ASSERT_DIE(birdloop_inside(loop));
+ ASSERT_DIE(s->loop == loop);
+
+ /* Completely decouple the socket from the loop. */
+ LOOP_TRACE(loop, DL_SOCKETS, "removing socket %p (total=%d)", s, loop->sock_num);
+
+ if (loop->sock_active == s)
+ loop->sock_active = sk_next(s);
+
+ if ((loop == &main_birdloop) && (s == stored_sock))
+ stored_sock = sk_next(s);
+
+ rem_node(&s->n);
+ loop->sock_num--;
+
+ socket_changed(s);
+
+ s->loop = NULL;
+ s->index = -1;
+}
+
+void
+sk_reloop(sock *s, struct birdloop *loop)
+{
+ ASSERT_DIE(birdloop_inside(loop));
+ ASSERT_DIE(birdloop_inside(s->loop));
+
+ if (loop == s->loop)
+ return;
+
+ birdloop_remove_socket(s->loop, s);
+ birdloop_add_socket(loop, s);
+}
+
+void
+sk_pause_rx(struct birdloop *loop, sock *s)
+{
+ ASSERT_DIE(birdloop_inside(loop));
+ s->rx_hook = NULL;
+ socket_changed(s);
+}
+
+void
+sk_resume_rx(struct birdloop *loop, sock *s, int (*hook)(sock *, uint))
+{
+ ASSERT_DIE(birdloop_inside(loop));
+ ASSERT_DIE(hook);
+ s->rx_hook = hook;
+ socket_changed(s);
+}
+
+static inline uint sk_want_events(sock *s)
+{ return (s->rx_hook ? POLLIN : 0) | (sk_tx_pending(s) ? POLLOUT : 0); }
+
+void
+sockets_prepare(struct birdloop *loop, struct pfd *pfd)
+{
+ node *n;
+ WALK_LIST(n, loop->sock_list)
+ {
+ SKIP_BACK_DECLARE(sock, s, n, n);
+ uint w = sk_want_events(s);
+
+ if (!w)
+ {
+ s->index = -1;
+ continue;
+ }
+
+ s->index = pfd->pfd.used;
+ LOOP_TRACE(loop, DL_SOCKETS, "socket %p poll index is %d", s, s->index);
+
+ BUFFER_PUSH(pfd->pfd) = (struct pollfd) {
+ .fd = s->fd,
+ .events = sk_want_events(s),
+ };
+ BUFFER_PUSH(pfd->loop) = loop;
+ }
+}
+
+int sk_read(sock *s, int revents);
+int sk_write(sock *s);
+void sk_err(sock *s, int revents);
+
+static void
- static _Bool
++sockets_fire(struct birdloop *loop, bool read, bool write)
+{
+ if (EMPTY_LIST(loop->sock_list))
+ return;
+
+ times_update();
+
+ struct pollfd *pfd = loop->thread->pfd->pfd.data;
+ loop->sock_active = SKIP_BACK(sock, n, HEAD(loop->sock_list));
+
+ while (loop->sock_active)
+ {
+ sock *s = loop->sock_active;
+
+ int rev;
+ if ((s->index >= 0) && (rev = pfd[s->index].revents) && !(rev & POLLNVAL))
+ {
+ int e = 1;
+
+ if (write && (rev & POLLOUT))
+ {
+ /* Write until task limit is up */
+ while ((s == loop->sock_active) && (e = sk_write(s)) && task_still_in_limit())
+ ;
+
+ if (s != loop->sock_active)
+ continue;
+
+ if (!sk_tx_pending(s))
+ loop->thread->sock_changed = 1;
+ }
+
+ /* Read until task limit is up */
+ if (read && (rev & POLLIN))
+ while ((s == loop->sock_active) && s->rx_hook && sk_read(s, rev) && (s->fast_rx || task_still_in_limit()))
+ ;
+
+ if (s != loop->sock_active)
+ continue;
+
+ if (!(rev & (POLLOUT | POLLIN)) && (rev & POLLERR))
+ sk_err(s, rev);
+
+ if (s != loop->sock_active)
+ continue;
+ }
+
+ loop->sock_active = sk_next(s);
+ }
+}
+
+/*
+ * Threads
+ */
+
+static void bird_thread_start_event(void *_data);
+static void bird_thread_busy_set(struct bird_thread *thr, int val);
+
+struct birdloop_pickup_group {
+ DOMAIN(attrs) domain;
+ list loops;
+ list threads;
+ uint thread_count;
+ uint thread_busy_count;
+ uint loop_count;
+ uint loop_unassigned_count;
+ btime max_latency;
+ event start_threads;
+} pickup_groups[2] = {
+ {
+ /* all zeroes */
+ },
+ {
+ /* FIXME: make this dynamic, now it copies the loop_max_latency value from proto/bfd/config.Y */
+ .max_latency = 10 MS,
+ .start_threads.hook = bird_thread_start_event,
+ .start_threads.data = &pickup_groups[1],
+ },
+};
+
+static _Thread_local struct bird_thread *this_thread;
+
+static void
+birdloop_set_thread(struct birdloop *loop, struct bird_thread *thr, struct birdloop_pickup_group *group)
+{
+ struct bird_thread *old = loop->thread;
+ ASSERT_DIE(!thr != !old);
+
+ /* Signal our moving effort */
+ u32 ltt = atomic_fetch_or_explicit(&loop->thread_transition, LTT_MOVE, memory_order_acq_rel);
+ ASSERT_DIE((ltt & LTT_MOVE) == 0);
+
+ /* Wait until all previously started pings end */
+ while (ltt & LTT_PING)
+ {
+ birdloop_yield();
+ ltt = atomic_load_explicit(&loop->thread_transition, memory_order_acquire);
+ ASSERT_DIE(ltt & LTT_MOVE);
+ }
+ /* Now we are free of running pings */
+
+ if (!thr)
+ {
+ /* Unschedule from Meta */
+ ev_postpone(&loop->event);
+ tm_stop(&loop->timer);
+
+ /* Request local socket reload */
+ this_thread->sock_changed = 1;
+ }
+
+ /* Update the thread value */
+ loop->thread = thr;
+
+ /* Allow pings */
+ atomic_fetch_and_explicit(&loop->thread_transition, ~LTT_MOVE, memory_order_acq_rel);
+
+ /* Put into appropriate lists */
+ if (thr)
+ {
+ thr->loop_count++;
+ add_tail(&thr->loops, &loop->n);
+
+ if (!EMPTY_LIST(loop->sock_list))
+ thr->sock_changed = 1;
+ ev_send_loop(loop->thread->meta, &loop->event);
+ }
+ else
+ {
+ /* Put into pickup list */
+ LOCK_DOMAIN(attrs, group->domain);
+ add_tail(&group->loops, &loop->n);
+ group->loop_unassigned_count++;
+ UNLOCK_DOMAIN(attrs, group->domain);
+ }
+
+ loop->last_transition_ns = ns_now();
+}
+
+static void
+bird_thread_pickup_next(struct birdloop_pickup_group *group)
+{
+ /* This thread goes to the end of the pickup list */
+ rem_node(&this_thread->n);
+ add_tail(&group->threads, &this_thread->n);
+
+ /* If there are more loops to be picked up, wakeup the next thread in order */
+ if (!EMPTY_LIST(group->loops))
+ wakeup_do_kick(SKIP_BACK(struct bird_thread, n, HEAD(group->threads)));
+}
+
- static _Bool
++static bool
+birdloop_hot_potato(struct birdloop *loop)
+{
+ if (!loop)
+ return 0;
+
+ return ns_now() - loop->last_transition_ns < 1 S TO_NS;
+}
+
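+/* Pick up loops from the group's pickup list, or shed some of ours when this
+ * thread got busy. A loop is a "hot potato" for one second after its last
+ * thread transition; the code below backs off while its own meta loop is hot
+ * and never drops a loop that is itself hot, presumably to keep loops from
+ * bouncing between threads. */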
+static void
+birdloop_take(struct birdloop_pickup_group *group)
+{
+ struct birdloop *loop = NULL;
+
+ if (birdloop_hot_potato(this_thread->meta))
+ return;
+
+ LOCK_DOMAIN(attrs, group->domain);
+
+ if (this_thread->busy_active &&
+ (group->thread_busy_count < group->thread_count) &&
+ (this_thread->loop_count > 1) &&
+ !EMPTY_LIST(group->loops) &&
+ birdloop_hot_potato(HEAD(group->loops)))
+ {
+ THREAD_TRACE(DL_SCHEDULING, "Loop drop requested (tbc=%d, tc=%d, lc=%d)",
+ group->thread_busy_count, group->thread_count, this_thread->loop_count);
+ UNLOCK_DOMAIN(attrs, group->domain);
+
+ uint dropped = 0;
+ node *n;
+ WALK_LIST2(loop, n, this_thread->loops, n)
+ {
+ birdloop_enter(loop);
+ if (ev_active(&loop->event) && !loop->stopped && !birdloop_hot_potato(loop))
+ {
+ /* Pass to another thread */
+ rem_node(&loop->n);
+ this_thread->loop_count--;
+ LOOP_TRACE(loop, DL_SCHEDULING, "Dropping from thread, remaining %u loops here", this_thread->loop_count);
+
+ /* This also unschedules the loop from Meta */
+ birdloop_set_thread(loop, NULL, group);
+
+ dropped++;
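+ /* Stop once dropped^2 exceeds the number of loops still kept here, i.e.
+  * shed roughly the square root of the remaining loops per round, then hand
+  * the pickup baton to the next thread. */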
+ if (dropped * dropped > this_thread->loop_count)
+ {
+ birdloop_leave(loop);
+
+ LOCK_DOMAIN(attrs, group->domain);
+ bird_thread_pickup_next(group);
+ UNLOCK_DOMAIN(attrs, group->domain);
+
+ break;
+ }
+ }
+ birdloop_leave(loop);
+ }
+
+ if (dropped)
+ {
+ this_thread->meta->last_transition_ns = ns_now();
+ return;
+ }
+
+ this_thread->busy_counter = 0;
+ bird_thread_busy_set(this_thread, 0);
+ LOCK_DOMAIN(attrs, group->domain);
+ }
+
+ if (!EMPTY_LIST(group->loops))
+ {
+ THREAD_TRACE(DL_SCHEDULING, "Loop take requested");
+
+ /* Take a proportional amount of loops from the pickup list and unlock */
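+ /* assign = 1 + unassigned / (threads + 1, minus the busy ones when they
+  * can be left out): every taker gets at least one loop, the rest is split
+  * roughly evenly among the non-busy threads. */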
+ uint thread_count = group->thread_count + 1;
+ if (group->thread_busy_count < group->thread_count)
+ thread_count -= group->thread_busy_count;
+
+ uint assign = 1 + group->loop_unassigned_count / thread_count;
+ for (uint i=0; !EMPTY_LIST(group->loops) && i<assign; i++)
+ {
+ loop = SKIP_BACK(struct birdloop, n, HEAD(group->loops));
+ rem_node(&loop->n);
+ group->loop_unassigned_count--;
+ UNLOCK_DOMAIN(attrs, group->domain);
+
+ birdloop_enter(loop);
+ birdloop_set_thread(loop, this_thread, group);
+ LOOP_TRACE(loop, DL_SCHEDULING, "Picked up by thread");
+
+ node *n;
+ WALK_LIST(n, loop->sock_list)
+ SKIP_BACK(sock, n, n)->index = -1;
+
+ birdloop_leave(loop);
+
+ LOCK_DOMAIN(attrs, group->domain);
+ }
+
+ bird_thread_pickup_next(group);
+ }
+
+ UNLOCK_DOMAIN(attrs, group->domain);
+ this_thread->meta->last_transition_ns = ns_now();
+}
+
+static int
+poll_timeout(struct birdloop *loop)
+{
+ timer *t = timers_first(&loop->time);
+ if (!t)
+ {
+ THREAD_TRACE(DL_SCHEDULING, "No timers, no events in meta");
+ return -1;
+ }
+
+ btime remains = tm_remains(t);
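+ /* TO_MS presumably truncates; add one millisecond back whenever the
+  * truncated value falls short of the real remainder, so poll() never wakes
+  * up just before the timer is actually due. */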
+ int timeout = remains TO_MS + ((remains TO_MS) MS < remains);
+
+ THREAD_TRACE(DL_SCHEDULING, "Next meta timer in %d ms for %s", timeout,
+ LOOP_NAME(SKIP_BACK(struct birdloop, timer, t)));
+
+ return timeout;
+}
+
+static void
+bird_thread_busy_set(struct bird_thread *thr, int val)
+{
+ LOCK_DOMAIN(attrs, thr->group->domain);
+ if (thr->busy_active = val)
+ thr->group->thread_busy_count++;
+ else
+ thr->group->thread_busy_count--;
+ ASSERT_DIE(thr->group->thread_busy_count <= thr->group->thread_count);
+ UNLOCK_DOMAIN(attrs, thr->group->domain);
+}
+
+static void *
+bird_thread_main(void *arg)
+{
+ struct bird_thread *thr = this_thread = arg;
+
+ rcu_thread_start();
+
+ account_to(&thr->overhead);
+
+ birdloop_enter(thr->meta);
+ this_birdloop = thr->meta;
+
+ THREAD_TRACE(DL_SCHEDULING, "Started");
+
+ tmp_init(thr->pool);
+ init_list(&thr->loops);
+
+ defer_init(lp_new(thr->pool));
+
+ thr->sock_changed = 1;
+
+ struct pfd pfd;
+ BUFFER_INIT(pfd.pfd, thr->pool, 16);
+ BUFFER_INIT(pfd.loop, thr->pool, 16);
+ thr->pfd = &pfd;
+
+ while (1)
+ {
+ u64 thr_loop_start = ns_now();
+ int timeout;
+
+ /* Schedule all loops with timed out timers */
+ timers_fire(&thr->meta->time, 0);
+
+ /* Pickup new loops */
+ birdloop_take(thr->group);
+
+ /* Compute maximal time per loop */
+ u64 thr_before_run = ns_now();
+ if (thr->loop_count > 0)
+ {
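+ /* Spend at most half of the configured latency budget, minus what the meta
+  * bookkeeping above already used, split evenly among the loops; 300 ms per
+  * loop appears to be the hard cap. */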
+ thr->max_loop_time_ns = (thr->max_latency_ns / 2 - (thr_before_run - thr_loop_start)) / (u64) thr->loop_count;
+ if (thr->max_loop_time_ns NS > 300 MS)
+ thr->max_loop_time_ns = 300 MS TO_NS;
+ }
+
+ /* Run all scheduled loops */
+ int more_events = ev_run_list(&thr->meta->event_list);
+ if (more_events)
+ {
+ THREAD_TRACE(DL_SCHEDULING, "More metaevents to run from %s",
+ LOOP_NAME(SKIP_BACK(struct birdloop, event,
+ atomic_load_explicit(&thr->meta->event_list.receiver, memory_order_relaxed)))
+ );
+ timeout = 0;
+ }
+ else
+ timeout = poll_timeout(thr->meta);
+
+ /* Run priority events before sleeping */
+ ev_run_list(&thr->priority_events);
+
+ /* Do we have to refresh sockets? */
+ if (thr->sock_changed)
+ {
+ THREAD_TRACE(DL_SOCKETS, "Recalculating socket poll");
+ thr->sock_changed = 0;
+
+ BUFFER_FLUSH(pfd.pfd);
+ BUFFER_FLUSH(pfd.loop);
+
+ pipe_pollin(&thr->wakeup, &pfd);
+
+ node *nn;
+ struct birdloop *loop;
+ WALK_LIST2(loop, nn, thr->loops, n)
+ {
+ birdloop_enter(loop);
+ sockets_prepare(loop, &pfd);
+ birdloop_leave(loop);
+ }
+
+ ASSERT_DIE(pfd.loop.used == pfd.pfd.used);
+ THREAD_TRACE(DL_SOCKETS, "Total %d sockets", pfd.pfd.used);
+ }
+
+ /* Check thread busy indicator */
+ int idle_force = (timeout < 0) || (timeout > 300);
+ int busy_now = (timeout < 5) && !idle_force;
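+ /* As coded above: idle when there is nothing to do for more than 300 ms (or
+  * at all), busy when the next wakeup is less than 5 ms away; the counters
+  * below debounce the transitions. */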
+
+ /* Nothing to do right now but there may be some loops for pickup */
+ if (idle_force)
+ {
+ LOCK_DOMAIN(attrs, thr->group->domain);
+ if (!EMPTY_LIST(thr->group->loops))
+ timeout = 0;
+ UNLOCK_DOMAIN(attrs, thr->group->domain);
+ }
+
+ if (busy_now && !thr->busy_active && (++thr->busy_counter == 4))
+ bird_thread_busy_set(thr, 1);
+
+ if (!busy_now && thr->busy_active && (idle_force || (--thr->busy_counter == 0)))
+ {
+ thr->busy_counter = 0;
+ bird_thread_busy_set(thr, 0);
+ }
+
+ account_to(&this_thread->idle);
+ birdloop_leave(thr->meta);
+poll_retry:;
+ int rv = poll(pfd.pfd.data, pfd.pfd.used, timeout);
+ if (rv < 0)
+ {
+ if (errno == EINTR || errno == EAGAIN)
+ goto poll_retry;
+ bug("poll in %p: %m", thr);
+ }
+
+ account_to(&this_thread->overhead);
+ birdloop_enter(thr->meta);
+
+ /* Drain wakeup fd */
+ if (pfd.pfd.data[0].revents & POLLIN)
+ {
+ THREAD_TRACE(DL_WAKEUP, "Ping received");
+ ASSERT_DIE(rv > 0);
+ rv--;
+ wakeup_drain(thr);
+ }
+
+ /* Unset ping information for Meta */
+ atomic_fetch_and_explicit(&thr->meta->thread_transition, ~LTT_PING, memory_order_acq_rel);
+
+ /* Schedule loops with active sockets */
+ if (rv)
+ for (uint i = 1; i < pfd.pfd.used; i++)
+ if (pfd.pfd.data[i].revents)
+ {
+ LOOP_TRACE(pfd.loop.data[i], DL_SOCKETS, "socket id %d got revents=0x%x", i, pfd.pfd.data[i].revents);
+ ev_send_loop(thr->meta, &pfd.loop.data[i]->event);
+ }
+ }
+
+ bug("An infinite loop has ended.");
+}
+
+static void
+bird_thread_cleanup(void *_thr)
+{
+ struct bird_thread *thr = _thr;
+ struct birdloop *meta = thr->meta;
+ ASSERT_DIE(birdloop_inside(&main_birdloop));
+
+ /* Wait until the thread actually finishes */
+ ASSERT_DIE(meta);
+ birdloop_enter(meta);
+ birdloop_leave(meta);
+
+ /* No more wakeup */
+ wakeup_free(thr);
+
+ /* Thread attributes no longer needed */
+ pthread_attr_destroy(&thr->thread_attr);
+
+ /* Free the meta loop */
+ thr->meta->thread = NULL;
+ thr->meta = NULL;
+ birdloop_free(meta);
+}
+
+static struct bird_thread *
+bird_thread_start(struct birdloop_pickup_group *group)
+{
+ ASSERT_DIE(birdloop_inside(&main_birdloop));
+
+ struct birdloop *meta = birdloop_new_no_pickup(&root_pool, DOMAIN_ORDER(meta), "Thread Meta");
+ pool *p = birdloop_pool(meta);
+
+ birdloop_enter(meta);
+ LOCK_DOMAIN(attrs, group->domain);
+
+ struct bird_thread *thr = mb_allocz(p, sizeof(*thr));
+ thr->pool = p;
+ thr->cleanup_event = (event) { .hook = bird_thread_cleanup, .data = thr, };
+ thr->group = group;
+ thr->max_latency_ns = (group->max_latency ?: 5 S) TO_NS;
+ thr->meta = meta;
+ thr->meta->thread = thr;
+
+ wakeup_init(thr);
+ ev_init_list(&thr->priority_events, NULL, "Thread direct event list");
+
+ add_tail(&group->threads, &thr->n);
+
+ int e = 0;
+
+ if (e = pthread_attr_init(&thr->thread_attr))
+ die("pthread_attr_init() failed: %M", e);
+
+ /* We don't have to worry about thread stack size so much.
+ if (e = pthread_attr_setstacksize(&thr->thread_attr, THREAD_STACK_SIZE))
+ die("pthread_attr_setstacksize(%u) failed: %M", THREAD_STACK_SIZE, e);
+ */
+
+ if (e = pthread_attr_setdetachstate(&thr->thread_attr, PTHREAD_CREATE_DETACHED))
+ die("pthread_attr_setdetachstate(PTHREAD_CREATE_DETACHED) failed: %M", e);
+
+ if (e = pthread_create(&thr->thread_id, &thr->thread_attr, bird_thread_main, thr))
+ die("pthread_create() failed: %M", e);
+
+ group->thread_count++;
+
+ UNLOCK_DOMAIN(attrs, group->domain);
+ birdloop_leave(meta);
+ return thr;
+}
+
+static void
+bird_thread_start_event(void *_data)
+{
+ struct birdloop_pickup_group *group = _data;
+ bird_thread_start(group);
+}
+
+static struct birdloop *thread_dropper;
+static event *thread_dropper_event;
+static uint thread_dropper_goal;
+
+static void
+bird_thread_dropper_free(void *data)
+{
+ struct birdloop *tdl_stop = data;
+ birdloop_free(tdl_stop);
+}
+
+static void
+bird_thread_shutdown(void * _ UNUSED)
+{
+ struct birdloop_pickup_group *group = this_thread->group;
+ LOCK_DOMAIN(attrs, group->domain);
+ int dif = group->thread_count - thread_dropper_goal;
+ struct birdloop *tdl_stop = NULL;
+
+ if (dif > 0)
+ ev_send_loop(thread_dropper, thread_dropper_event);
+ else
+ {
+ tdl_stop = thread_dropper;
+ thread_dropper = NULL;
+ }
+
+ UNLOCK_DOMAIN(attrs, group->domain);
+
+ THREAD_TRACE(DL_SCHEDULING, "Thread pickup size differs from dropper goal by %d%s", dif, tdl_stop ? ", stopping" : "");
+
+ if (tdl_stop)
+ {
+ birdloop_stop_self(tdl_stop, bird_thread_dropper_free, tdl_stop);
+ return;
+ }
+
+ struct bird_thread *thr = this_thread;
+
+ LOCK_DOMAIN(attrs, group->domain);
+ /* Leave the thread-picker list to get no more loops */
+ rem_node(&thr->n);
+ group->thread_count--;
+
+ /* Fix the busy count */
+ if (thr->busy_active)
+ group->thread_busy_count--;
+
+ UNLOCK_DOMAIN(attrs, group->domain);
+
+ /* Leave the thread-dropper loop as we aren't going to return. */
+ birdloop_leave(thread_dropper);
+
+ /* Last try to run the priority event list; ruin it then to be extra sure */
+ ev_run_list(&this_thread->priority_events);
+ memset(&this_thread->priority_events, 0xa5, sizeof(this_thread->priority_events));
+
+ /* Drop loops including the thread dropper itself */
+ while (!EMPTY_LIST(thr->loops))
+ {
+ struct birdloop *loop = HEAD(thr->loops);
+
+ /* Remove loop from this thread's list */
+ this_thread->loop_count--;
+ rem_node(&loop->n);
+
+ /* Unset loop's thread */
+ birdloop_set_thread(loop, NULL, group);
+ }
+
+ /* Let others know about new loops */
+ LOCK_DOMAIN(attrs, group->domain);
+ if (!EMPTY_LIST(group->loops))
+ wakeup_do_kick(SKIP_BACK(struct bird_thread, n, HEAD(group->threads)));
+ UNLOCK_DOMAIN(attrs, group->domain);
+
+ /* Request thread cleanup from main loop */
+ ev_send_loop(&main_birdloop, &thr->cleanup_event);
+
+ /* Local pages not needed anymore */
+ flush_local_pages();
+
+ /* Unregister from RCU */
+ rcu_thread_stop();
+
+ /* Now we can be cleaned up */
+ birdloop_leave(thr->meta);
+
+ /* Exit! */
+ THREAD_TRACE(DL_SCHEDULING, "Stopped");
+ pthread_exit(NULL);
+}
+
+void
+bird_thread_commit(struct config *new, struct config *old UNUSED)
+{
+ ASSERT_DIE(birdloop_inside(&main_birdloop));
+
+ if (new->shutdown)
+ return;
+
+ if (!new->thread_count)
+ new->thread_count = 1;
+
+ while (1)
+ {
+ struct birdloop_pickup_group *group = &pickup_groups[0];
+ LOCK_DOMAIN(attrs, group->domain);
+
+ int dif = group->thread_count - (thread_dropper_goal = new->thread_count);
- _Bool thread_dropper_running = !!thread_dropper;
++ bool thread_dropper_running = !!thread_dropper;
+
+ UNLOCK_DOMAIN(attrs, group->domain);
+
+ if (dif < 0)
+ {
+ bird_thread_start(group);
+ continue;
+ }
+
+ if ((dif > 0) && !thread_dropper_running)
+ {
+ struct birdloop *tdl = birdloop_new(&root_pool, DOMAIN_ORDER(control), group->max_latency, "Thread dropper");
+ birdloop_enter(tdl);
+ event *tde = ev_new_init(tdl->pool, bird_thread_shutdown, NULL);
+
+ LOCK_DOMAIN(attrs, group->domain);
+ thread_dropper = tdl;
+ thread_dropper_event = tde;
+ UNLOCK_DOMAIN(attrs, group->domain);
+
+ ev_send_loop(thread_dropper, thread_dropper_event);
+ birdloop_leave(tdl);
+ }
+
+ return;
+ }
+}
+
+/* Cleanup after last thread */
+static void
+bird_thread_sync_finish(void *_sync)
+{
+ ASSERT_THE_BIRD_LOCKED;
+ struct bird_thread_syncer *sync = _sync;
+
+ /* Keep necessary pointers locally */
+ pool *p = sync->pool;
+ DOMAIN(control) lock = sync->lock;
+ LOCK_DOMAIN(control, lock);
+
+ /* This invalidates the `sync` pointer */
+ CALL(sync->finish, sync);
+
+ /* Free pool and domain */
+ rp_free(p);
+ UNLOCK_DOMAIN(control, lock);
+ DOMAIN_FREE(control, lock);
+}
+
+/* Process regular one thread hook */
+static void
+bird_thread_sync_one(void *_sync)
+{
+ struct bird_thread_syncer *sync = _sync;
+
+ LOCK_DOMAIN(control, sync->lock);
+ CALL(sync->hook, sync);
+ sync->done++;
+ if (sync->done == sync->total)
+ ev_send_loop(&main_birdloop, ev_new_init(sync->pool, bird_thread_sync_finish, sync));
+ UNLOCK_DOMAIN(control, sync->lock);
+}
+
+void
+bird_thread_sync_all(struct bird_thread_syncer *sync,
+ void (*hook)(struct bird_thread_syncer *),
+ void (*done)(struct bird_thread_syncer *), const char *name)
+{
+ sync->lock = DOMAIN_NEW(control);
+ LOCK_DOMAIN(control, sync->lock);
+
+ sync->pool = rp_new(&root_pool, sync->lock.control, name);
+ sync->hook = hook;
+ sync->finish = done;
+
+ for (int i=0; i<2; i++)
+ {
+ struct birdloop_pickup_group *group = &pickup_groups[i];
+
+ LOCK_DOMAIN(attrs, group->domain);
+
+ struct bird_thread *thr;
+ WALK_LIST(thr, group->threads)
+ {
+ sync->total++;
+ ev_send(&thr->priority_events, ev_new_init(sync->pool, bird_thread_sync_one, sync));
+ wakeup_do_kick(thr);
+ }
+
+ UNLOCK_DOMAIN(attrs, group->domain);
+ }
+
+ UNLOCK_DOMAIN(control, sync->lock);
+}
+
+
+struct bird_thread_show_data {
+ struct bird_thread_syncer sync;
+ cli *cli;
+ linpool *lp;
+ u8 show_loops;
+ uint line_pos;
+ uint line_max;
+ const char **lines;
+};
+
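+/* Note: the lines array is allocated lazily on the first append and doubled
+ * whenever line_pos reaches line_max, so line_max only sets the initial
+ * capacity (64 in cmd_show_threads). */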
+#define tsd_append(...) do { \
+ if (!tsd->lines) \
+ tsd->lines = mb_allocz(tsd->sync.pool, sizeof(const char *) * tsd->line_max); \
+ if (tsd->line_pos >= tsd->line_max) \
+ tsd->lines = mb_realloc(tsd->lines, sizeof (const char *) * (tsd->line_max *= 2)); \
+ tsd->lines[tsd->line_pos++] = lp_sprintf(tsd->lp, __VA_ARGS__); \
+} while (0)
+
+static void
+bird_thread_show_cli_cont(struct cli *c UNUSED)
+{
+ /* Explicitly do nothing to prevent CLI from trying to parse another command. */
+}
+
+static int
+bird_thread_show_cli_cleanup(struct cli *c UNUSED)
+{
+ return 1; /* Defer the cleanup until the writeout is finished. */
+}
+
+static void
+bird_thread_show_spent_time(struct bird_thread_show_data *tsd, const char *name, struct spent_time *st)
+{
+ char b[TIME_BY_SEC_SIZE * sizeof("1234567890, ")], *bptr = b, *bend = b + sizeof(b);
+ uint cs = CURRENT_SEC;
+ uint fs = NSEC_TO_SEC(st->last_written_ns);
+
+ for (uint i = 0; i <= cs && i < TIME_BY_SEC_SIZE; i++)
+ bptr += bsnprintf(bptr, bend - bptr, "% 10lu ",
+ (cs - i > fs) ? 0 : st->by_sec_ns[(cs - i) % TIME_BY_SEC_SIZE]);
+ bptr[-1] = 0; /* Drop the trailing space */
+
+ tsd_append(" %s total time: % 9t s; last %d secs [ns]: %s", name, st->total_ns NS, MIN(CURRENT_SEC+1, TIME_BY_SEC_SIZE), b);
+}
+
+static void
+bird_thread_show_loop(struct bird_thread_show_data *tsd, struct birdloop *loop)
+{
+ tsd_append(" Loop %s", domain_name(loop->time.domain));
+ bird_thread_show_spent_time(tsd, "Working ", &loop->working);
+ bird_thread_show_spent_time(tsd, "Locking ", &loop->locking);
+}
+
+static void
+bird_thread_show(struct bird_thread_syncer *sync)
+{
+ SKIP_BACK_DECLARE(struct bird_thread_show_data, tsd, sync, sync);
+
+ if (!tsd->lp)
+ tsd->lp = lp_new(tsd->sync.pool);
+
+ if (tsd->show_loops)
+ tsd_append("Thread %p%s (busy counter %d)", this_thread, this_thread->busy_active ? " [busy]" : "", this_thread->busy_counter);
+
+ u64 total_time_ns = 0;
+ struct birdloop *loop;
+ WALK_LIST(loop, this_thread->loops)
+ {
+ if (tsd->show_loops)
+ bird_thread_show_loop(tsd, loop);
+
+ total_time_ns += loop->working.total_ns + loop->locking.total_ns;
+ }
+
+ if (tsd->show_loops)
+ {
+ tsd_append(" Total working time: %t", total_time_ns NS);
+ bird_thread_show_spent_time(tsd, "Overhead", &this_thread->overhead);
+ bird_thread_show_spent_time(tsd, "Idle ", &this_thread->idle);
+ }
+ else
+ tsd_append("Thread %p working %t s overhead %t s",
+ this_thread, total_time_ns NS, this_thread->overhead.total_ns NS);
+}
+
+static void
+cmd_show_threads_done(struct bird_thread_syncer *sync)
+{
+ SKIP_BACK_DECLARE(struct bird_thread_show_data, tsd, sync, sync);
+ ASSERT_DIE(birdloop_inside(&main_birdloop));
+
+ tsd->cli->cont = NULL;
+ tsd->cli->cleanup = NULL;
+
+ for (int i=0; i<2; i++)
+ {
+ struct birdloop_pickup_group *group = &pickup_groups[i];
+
+ LOCK_DOMAIN(attrs, group->domain);
+ uint count = 0;
+ u64 total_time_ns = 0;
+ if (!EMPTY_LIST(group->loops))
+ {
+ if (tsd->show_loops)
+ tsd_append("Unassigned loops in group %d:", i);
+
+ struct birdloop *loop;
+ WALK_LIST(loop, group->loops)
+ {
+ if (tsd->show_loops)
+ bird_thread_show_loop(tsd, loop);
+
+ total_time_ns += loop->working.total_ns + loop->locking.total_ns;
+ count++;
+ }
+
+ if (tsd->show_loops)
+ tsd_append(" Total working time: %t", total_time_ns NS);
+ else
+ tsd_append("Unassigned %d loops in group %d, total time %t", count, i, total_time_ns NS);
+ }
+ else
+ tsd_append("All loops in group %d are assigned.", i);
+
+ UNLOCK_DOMAIN(attrs, group->domain);
+ }
+
+ for (uint i = 0; i < tsd->line_pos - 1; i++)
+ cli_printf(tsd->cli, -1027, "%s", tsd->lines[i]);
+
+ cli_printf(tsd->cli, 1027, "%s", tsd->lines[tsd->line_pos-1]);
+ cli_write_trigger(tsd->cli);
+ mb_free(tsd);
+}
+
+void
+cmd_show_threads(int show_loops)
+{
+ struct bird_thread_show_data *tsd = mb_allocz(&root_pool, sizeof(struct bird_thread_show_data));
+ tsd->cli = this_cli;
+ tsd->show_loops = show_loops;
+ tsd->line_pos = 0;
+ tsd->line_max = 64;
+
+ this_cli->cont = bird_thread_show_cli_cont;
+ this_cli->cleanup = bird_thread_show_cli_cleanup;
+
+ bird_thread_sync_all(&tsd->sync, bird_thread_show, cmd_show_threads_done, "Show Threads");
+}
+
- _Bool task_still_in_limit(void)
++bool task_still_in_limit(void)
+{
+ static u64 main_counter = 0;
+ if (this_birdloop == &main_birdloop)
+ return (++main_counter % 2048); /* This is a hack because of no accounting in mainloop */
+ else
+ return ns_now() < account_last + this_thread->max_loop_time_ns;
+}
+
- _Bool task_before_halftime(void)
++bool task_before_halftime(void)
+{
+ return ns_now() < account_last + this_thread->max_loop_time_ns / 2;
+}
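+
+/* Illustrative use of the limit check in a deferrable worker -- made-up names
+ * (my_worker_*, w->target_list, w->event), not part of this patch: process
+ * chunks until the per-loop time budget is spent, then re-queue the event and
+ * return.
+ *
+ *   static void
+ *   my_worker_hook(void *data)
+ *   {
+ *     struct my_worker *w = data;
+ *     while (my_worker_has_more(w))
+ *     {
+ *       if (!task_still_in_limit())
+ *         return ev_send(w->target_list, &w->event);
+ *       my_worker_do_chunk(w);
+ *     }
+ *   }
+ */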
+
+
+/*
+ * Birdloop
+ */
+
+static struct bird_thread main_thread;
+struct birdloop main_birdloop = { .thread = &main_thread, };
+_Thread_local struct birdloop *this_birdloop;
+
+static void birdloop_enter_locked(struct birdloop *loop);
+
+void
+birdloop_init(void)
+{
+ ns_init();
+
+ for (int i=0; i<2; i++)
+ {
+ struct birdloop_pickup_group *group = &pickup_groups[i];
+
+ group->domain = DOMAIN_NEW(attrs);
+ DOMAIN_SETUP(attrs, group->domain, "Loop Pickup", NULL);
+ init_list(&group->loops);
+ init_list(&group->threads);
+ }
+
+ wakeup_init(main_birdloop.thread);
+
+ main_birdloop.time.domain = the_bird_domain.the_bird;
+ main_birdloop.time.loop = &main_birdloop;
+
+ times_update();
+ timers_init(&main_birdloop.time, &root_pool);
+
+ birdloop_enter_locked(&main_birdloop);
+ this_birdloop = &main_birdloop;
+ this_thread = &main_thread;
+
+ defer_init(lp_new(&root_pool));
+}
+
+static void
+birdloop_stop_internal(struct birdloop *loop)
+{
+ LOOP_TRACE(loop, DL_SCHEDULING, "Stopping");
+
+ /* Block incoming pings */
+ u32 ltt = atomic_load_explicit(&loop->thread_transition, memory_order_acquire);
+ while (!atomic_compare_exchange_strong_explicit(
+ &loop->thread_transition, &ltt, LTT_PING,
+ memory_order_acq_rel, memory_order_acquire))
+ ;
+
+ /* Flush remaining events */
+ ASSERT_DIE(!ev_run_list(&loop->event_list));
+
+ /* Drop timers */
+ timer *t;
+ while (t = timers_first(&loop->time))
+ tm_stop(t);
+
+ /* Drop sockets */
+ sock *s;
+ WALK_LIST_FIRST2(s, n, loop->sock_list)
+ birdloop_remove_socket(loop, s);
+
+ /* Unschedule from Meta */
+ ev_postpone(&loop->event);
+ tm_stop(&loop->timer);
+
+ /* Remove from thread loop list */
+ ASSERT_DIE(loop->thread == this_thread);
+ rem_node(&loop->n);
+ loop->thread = NULL;
+
+ /* Uncount from thread group */
+ LOCK_DOMAIN(attrs, this_thread->group->domain);
+ this_thread->group->loop_count--;
+ UNLOCK_DOMAIN(attrs, this_thread->group->domain);
+
+ /* Leave the loop context without causing any other fuss */
+ ASSERT_DIE(!ev_active(&loop->event));
+ loop->ping_pending = 0;
+ account_to(&this_thread->overhead);
+ this_birdloop = this_thread->meta;
+ birdloop_leave(loop);
+
+ /* Request local socket reload */
+ this_thread->sock_changed = 1;
+
+ /* Call the stopped hook from the main loop */
+ loop->event.hook = loop->stopped;
+ loop->event.data = loop->stop_data;
+ ev_send_loop(&main_birdloop, &loop->event);
+}
+
+static void
+birdloop_run(void *_loop)
+{
+ /* Run priority events before the loop is executed */
+ ev_run_list(&this_thread->priority_events);
+
+ struct birdloop *loop = _loop;
+ account_to(&loop->locking);
+ birdloop_enter(loop);
+ this_birdloop = loop;
+
+ /* Wait until pingers end to wait for all events to actually arrive */
+ for (u32 ltt;
+ ltt = atomic_load_explicit(&loop->thread_transition, memory_order_acquire);
+ )
+ {
+ ASSERT_DIE(ltt == LTT_PING);
+ birdloop_yield();
+ }
+
+ /* Now we can actually do some work */
+ u64 dif = account_to(&loop->working);
+
+ struct global_runtime *gr = atomic_load_explicit(&global_runtime, memory_order_relaxed);
+ if (dif > this_thread->max_loop_time_ns + gr->latency_limit TO_NS)
+ LOOP_WARN(loop, "locked %lu us after its scheduled end time", dif NS TO_US);
+
+ uint repeat, loop_runs = 0;
+ do {
+ LOOP_TRACE(loop, DL_SCHEDULING, "Regular run (%d)", loop_runs);
+ loop_runs++;
+
+ if (loop->stopped)
+ /* Birdloop left inside the helper function */
+ return birdloop_stop_internal(loop);
+
+ /* Process socket TX */
+ sockets_fire(loop, 0, 1);
+
+ /* Run timers */
+ timers_fire(&loop->time, 0);
+
+ /* Run events */
+ repeat = ev_run_list(&loop->event_list);
+
+ /* Process socket RX */
+ sockets_fire(loop, 1, 0);
+
+ /* Check end time */
+ } while (repeat && task_still_in_limit());
+
+ /* Request meta timer */
+ timer *t = timers_first(&loop->time);
+ if (t)
+ tm_start_in(&loop->timer, tm_remains(t), this_thread->meta);
+ else
+ tm_stop(&loop->timer);
+
+ /* Request re-run if needed */
+ if (repeat)
+ ev_send_loop(this_thread->meta, &loop->event);
+
+ /* Collect socket change requests */
+ this_thread->sock_changed |= loop->sock_changed;
+ loop->sock_changed = 0;
+
+ account_to(&this_thread->overhead);
+ this_birdloop = this_thread->meta;
+ birdloop_leave(loop);
+}
+
+static void
+birdloop_run_timer(timer *tm)
+{
+ struct birdloop *loop = tm->data;
+ LOOP_TRACE(loop, DL_TIMERS, "Meta timer ready, requesting run");
+ ev_send_loop(loop->thread->meta, &loop->event);
+}
+
+static struct birdloop *
+birdloop_vnew_internal(pool *pp, uint order, struct birdloop_pickup_group *group, const char *name, va_list args)
+{
+ struct domain_generic *dg = domain_new(order, 1);
+ DG_LOCK(dg);
+
+ pool *p = rp_vnewf(pp, dg, name, args);
+ struct birdloop *loop = mb_allocz(p, sizeof(struct birdloop));
+ loop->pool = p;
+
+ loop->time.domain = dg;
+ loop->time.loop = loop;
+
+ atomic_store_explicit(&loop->thread_transition, 0, memory_order_relaxed);
+
+ birdloop_enter_locked(loop);
+
+ ev_init_list(&loop->event_list, loop, p->name);
+ timers_init(&loop->time, p);
+ sockets_init(loop);
+
+ loop->event = (event) { .hook = birdloop_run, .data = loop, };
+ loop->timer = (timer) { .hook = birdloop_run_timer, .data = loop, };
+
+ LOOP_TRACE(loop, DL_SCHEDULING, "New loop: %s", p->name);
+
+ if (group)
+ {
+ LOCK_DOMAIN(attrs, group->domain);
+ group->loop_count++;
+ group->loop_unassigned_count++;
+ add_tail(&group->loops, &loop->n);
+ if (EMPTY_LIST(group->threads))
+ ev_send(&global_event_list, &group->start_threads);
+ else
+ wakeup_do_kick(SKIP_BACK(struct bird_thread, n, HEAD(group->threads)));
+ UNLOCK_DOMAIN(attrs, group->domain);
+ }
+ else
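+ /* No pickup group: self-link the node, presumably so that generic list
+  * primitives such as rem_node() stay harmless on a loop that never enters
+  * a pickup list. */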
+ loop->n.next = loop->n.prev = &loop->n;
+
+ birdloop_leave(loop);
+
+ return loop;
+}
+
+static struct birdloop *
+birdloop_new_no_pickup(pool *pp, uint order, const char *name, ...)
+{
+ va_list args;
+ va_start(args, name);
+ struct birdloop *loop = birdloop_vnew_internal(pp, order, NULL, name, args);
+ va_end(args);
+ return loop;
+}
+
+struct birdloop *
+birdloop_new(pool *pp, uint order, btime max_latency, const char *name, ...)
+{
+ va_list args;
+ va_start(args, name);
+ struct birdloop *loop = birdloop_vnew_internal(pp, order, max_latency ? &pickup_groups[1] : &pickup_groups[0], name, args);
+ va_end(args);
+ return loop;
+}
+
+static void
+birdloop_do_stop(struct birdloop *loop, void (*stopped)(void *data), void *data)
+{
+ LOOP_TRACE(loop, DL_SCHEDULING, "Stop requested");
+
+ loop->stopped = stopped;
+ loop->stop_data = data;
+
+ birdloop_do_ping(loop);
+}
+
+void
+birdloop_stop(struct birdloop *loop, void (*stopped)(void *data), void *data)
+{
+ DG_LOCK(loop->time.domain);
+ birdloop_do_stop(loop, stopped, data);
+ DG_UNLOCK(loop->time.domain);
+}
+
+void
+birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *data)
+{
+ ASSERT_DIE(loop == birdloop_current);
+ ASSERT_DIE(DG_IS_LOCKED(loop->time.domain));
+
+ birdloop_do_stop(loop, stopped, data);
+}
+
+void
+birdloop_free(struct birdloop *loop)
+{
+ ASSERT_DIE(loop->thread == NULL);
+
+ struct domain_generic *dg = loop->time.domain;
+ DG_LOCK(dg);
+ rp_free(loop->pool);
+ DG_UNLOCK(dg);
+ domain_free(dg);
+}
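+
+/* Illustrative teardown (made-up names my_loop / my_loop_stopped, not part of
+ * this patch), mirroring what the thread dropper above does: stop the loop
+ * and free it from the stopped callback, which runs once the loop has been
+ * detached from its thread.
+ *
+ *   static void
+ *   my_loop_stopped(void *data)
+ *   {
+ *     struct birdloop *loop = data;
+ *     birdloop_free(loop);
+ *   }
+ *
+ *   birdloop_stop(my_loop, my_loop_stopped, my_loop);
+ */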
+
+static void
+birdloop_enter_locked(struct birdloop *loop)
+{
+ ASSERT_DIE(DG_IS_LOCKED(loop->time.domain));
+ ASSERT_DIE(!birdloop_inside(loop));
+
+ /* Store the old context */
+ loop->prev_loop = birdloop_current;
+
+ /* Put the new context */
+ birdloop_current = loop;
+}
+
+void
+birdloop_enter(struct birdloop *loop)
+{
+ DG_LOCK(loop->time.domain);
+ return birdloop_enter_locked(loop);
+}
+
+static void
+birdloop_leave_locked(struct birdloop *loop)
+{
+ /* Check the current context */
+ ASSERT_DIE(birdloop_current == loop);
+
+ /* Send pending pings */
+ if (loop->ping_pending)
+ {
+ LOOP_TRACE(loop, DL_PING, "sending pings on leave");
+ loop->ping_pending = 0;
+ birdloop_do_ping(loop);
+ }
+
+ /* Restore the old context */
+ birdloop_current = loop->prev_loop;
+}
+
+void
+birdloop_leave(struct birdloop *loop)
+{
+ birdloop_leave_locked(loop);
+ DG_UNLOCK(loop->time.domain);
+}
+
+void
+birdloop_mask_wakeups(struct birdloop *loop)
+{
+ ASSERT_DIE(birdloop_wakeup_masked == NULL);
+ birdloop_wakeup_masked = loop;
+}
+
+void
+birdloop_unmask_wakeups(struct birdloop *loop)
+{
+ ASSERT_DIE(birdloop_wakeup_masked == loop);
+ birdloop_wakeup_masked = NULL;
+ if (birdloop_wakeup_masked_count)
+ wakeup_do_kick(loop->thread);
+
+ birdloop_wakeup_masked_count = 0;
+}
+
+void
+birdloop_yield(void)
+{
+ usleep(100);
+}
+
+void
+ev_send_this_thread(event *e)
+{
+ if (this_thread == &main_thread)
+ ev_send_loop(&main_birdloop, e);
+ else
+ ev_send(&this_thread->priority_events, e);
+}
static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; }
- _Bool
++bool
+sk_tx_pending(sock *s)
+{
+ return s->ttx != s->tpos;
+}
+
+
static int
sk_maybe_write(sock *s)
{
lp_flush(krt_filter_lp);
}
- static _Bool
-static void
++static bool
krt_init_scan(struct krt_proto *p)
{
- bmap_reset(&p->seen_map, 1024);
+ switch (p->sync_state)
+ {
+ case KPS_IDLE:
+ rt_refresh_begin(&p->p.main_channel->in_req);
+ bmap_reset(&p->seen_map, 1024);
+ p->sync_state = KPS_SCANNING;
+ return 1;
+
+ case KPS_SCANNING:
+ bug("Kernel scan double-init");
+
+ case KPS_PRUNING:
+ log(L_WARN "%s: Can't scan, still pruning", p->p.name);
+ return 0;
+
+ case KPS_FLUSHING:
+ bug("Can't scan, flushing");
+ }
+
+ bug("Bad kernel sync state");
}
static void