From: Maria Matejka Date: Wed, 26 Jun 2024 15:19:24 +0000 (+0200) Subject: Merge commit 'b95dc8f29f18eb177f91fdc4bf0716fac9b15366' into mq-config-ref X-Git-Tag: v3.0.0~108 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=b797444e94700afe8f8b842da833dbb4b4c5461b;p=thirdparty%2Fbird.git Merge commit 'b95dc8f29f18eb177f91fdc4bf0716fac9b15366' into mq-config-ref Also converted all _Bool's to bool. --- b797444e94700afe8f8b842da833dbb4b4c5461b diff --cc conf/confbase.Y index d1d3604be,ed3c1e6e1..56dd15e7f --- a/conf/confbase.Y +++ b/conf/confbase.Y @@@ -28,8 -27,6 +28,8 @@@ CF_HD CF_DEFINES - static _Bool this_sadr_from_hack_active; ++static bool this_sadr_from_hack_active; + static void check_u16(uint val) { diff --cc lib/io-loop.h index 03fe25292,000000000..b893aa453 mode 100644,000000..100644 --- a/lib/io-loop.h +++ b/lib/io-loop.h @@@ -1,73 -1,0 +1,73 @@@ +/* + * BIRD -- I/O and event loop + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_IO_LOOP_H_ +#define _BIRD_IO_LOOP_H_ + +#include "nest/bird.h" +#include "lib/lists.h" +#include "lib/locking.h" +#include "lib/resource.h" +#include "lib/event.h" +#include "lib/socket.h" + +extern struct birdloop main_birdloop; + +/* Currently running birdloop */ +extern _Thread_local struct birdloop *this_birdloop; + +/* Check that the task has enough time to do a bit more */ - _Bool task_still_in_limit(void); - _Bool task_before_halftime(void); ++bool task_still_in_limit(void); ++bool task_before_halftime(void); + +#define MAYBE_DEFER_TASK(target, event, fmt, args...) do { \ + if (!task_still_in_limit()) { \ + if (atomic_load_explicit(&global_runtime, memory_order_relaxed)->latency_debug & DL_SCHEDULING) \ + log(L_TRACE "Deferring " fmt, ##args); \ + return ev_send(target, event); \ + } } while (0) + +/* Start a new birdloop owned by given pool and domain */ +struct birdloop *birdloop_new(pool *p, uint order, btime max_latency, const char *fmt, ...); + +/* Stop the loop. At the end, the @stopped callback is called unlocked in tail + * position to finish cleanup. Run birdloop_free() from that callback to free + * the loop itself. 
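+ *
+ * A minimal sketch of such a callback (illustrative only; the wrapper
+ * struct and the mb_free() cleanup are assumptions, not part of this change):
+ *
+ *   static void my_loop_stopped(void *data)
+ *   {
+ *     struct my_wrapper *w = data;   // hypothetical owner of the loop
+ *     birdloop_free(w->loop);        // the loop is already stopped here
+ *     mb_free(w);                    // release whatever else we own
+ *   }
+ *
+ *   // requested earlier by: birdloop_stop(w->loop, my_loop_stopped, w);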
*/ +void birdloop_stop(struct birdloop *loop, void (*stopped)(void *data), void *data); +void birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *data); +void birdloop_free(struct birdloop *loop); + +/* Run this event in this thread's priority event list */ +void ev_send_this_thread(event *e); + +/* Get birdloop's time heap */ +struct timeloop *birdloop_time_loop(struct birdloop *loop); +#define birdloop_domain(l) (birdloop_time_loop((l))->domain) + +/* Get birdloop's pool */ +pool *birdloop_pool(struct birdloop *loop); + +/* Enter and exit the birdloop */ +void birdloop_enter(struct birdloop *loop); +void birdloop_leave(struct birdloop *loop); + - _Bool birdloop_inside(struct birdloop *loop); ++bool birdloop_inside(struct birdloop *loop); + +void birdloop_mask_wakeups(struct birdloop *loop); +void birdloop_unmask_wakeups(struct birdloop *loop); + +void birdloop_link(struct birdloop *loop); +void birdloop_unlink(struct birdloop *loop); + +void birdloop_ping(struct birdloop *loop); + +/* Setup sockets */ +void birdloop_add_socket(struct birdloop *, struct birdsock *); +void birdloop_remove_socket(struct birdloop *, struct birdsock *); + +void birdloop_init(void); + +#endif /* _BIRD_IO_LOOP_H_ */ diff --cc lib/lists.h index 86ff59c9c,7e6d54670..b106687f0 --- a/lib/lists.h +++ b/lib/lists.h @@@ -69,18 -69,6 +69,18 @@@ typedef union list { /* In fact two o #define EMPTY_LIST(list) (!(list).head->next) - static inline _Bool ++static inline bool +enlisted(node *n) +{ + switch ((!!n->next) + (!!n->prev)) + { + case 0: return 0; + case 2: return 1; + case 1: bug("Garbled event list node"); + } + + bug("Maths is broken. And you should see a new heaven and a new earth: for the first heaven and the first earth had been passed away."); +} #ifndef _BIRD_LISTS_C_ #define LIST_INLINE static inline diff --cc lib/lockfree.c index 17c17d189,000000000..2d57b46a2 mode 100644,000000..100644 --- a/lib/lockfree.c +++ b/lib/lockfree.c @@@ -1,455 -1,0 +1,455 @@@ +/* + * BIRD Library -- Generic lock-free structures + * + * (c) 2023--2024 Maria Matejka + * (c) 2023--2024 CZ.NIC, z.s.p.o. + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#include "lib/birdlib.h" +#include "lib/lockfree.h" + +#define LOCAL_DEBUG + +void lfuc_unlock_deferred(struct deferred_call *dc) +{ + SKIP_BACK_DECLARE(struct lfuc_unlock_queue_item, luqi, dc, dc); + lfuc_unlock_immediately(luqi->c, luqi->el, luqi->ev); +} + +#if 0 +#define lfjour_debug(...) log(L_TRACE __VA_ARGS__) +#define lfjour_debug_detailed(...) log(L_TRACE __VA_ARGS__) +#elif 0 +#define lfjour_debug(...) log(L_TRACE __VA_ARGS__) +#define lfjour_debug_detailed(...) +#else +#define lfjour_debug(...) +#define lfjour_debug_detailed(...) 
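+/* (tracing disabled; change one of the 0s above to 1 to enable detailed or basic journal tracing) */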
+#endif + +#define LBI(j, b, p) ((struct lfjour_item *)(((void *) (b)->_block) + ((j)->item_size * (p)))) +#define LBP(j, b, i) ({ \ + off_t off = ((void *) (i)) - ((void *) (b)->_block); \ + u32 s = (j)->item_size; \ + ASSERT_DIE(off < page_size); \ + ASSERT_DIE((off % s) == 0); \ + off / s; \ + }) + +struct lfjour_item * +lfjour_push_prepare(struct lfjour *j) +{ + ASSERT_DIE(!j->domain || DG_IS_LOCKED(j->domain)); + ASSERT_DIE(!j->open); + + if (EMPTY_TLIST(lfjour_block, &j->pending) && + EMPTY_TLIST(lfjour_recipient, &j->recipients)) + return NULL; + + struct lfjour_block *block = NULL; + u32 end = 0; + + if (!EMPTY_TLIST(lfjour_block, &j->pending)) + { + block = j->pending.last; + end = atomic_load_explicit(&block->end, memory_order_relaxed); + if (end >= j->item_count) + { + ASSERT_DIE(end == j->item_count); + block = NULL; + end = 0; + } + } + + if (!block) + { + block = alloc_page(); + lfjour_debug("lfjour(%p)_push_prepare: allocating block %p", j, block); + *block = (struct lfjour_block) {}; + lfjour_block_add_tail(&j->pending, block); + } + + struct lfjour_item *i = LBI(j, block, end); + *i = (struct lfjour_item) { + .seq = j->next_seq++, + }; + + return j->open = i; +} + +void +lfjour_push_commit(struct lfjour *j) +{ + ASSERT_DIE(!j->domain || DG_IS_LOCKED(j->domain)); + ASSERT_DIE(j->open); + struct lfjour_block *b = PAGE_HEAD(j->open); + ASSERT_DIE(b == j->pending.last); + + lfjour_debug("lfjour(%p)_push_commit of %p, seq=%lu", j, j->open, j->open->seq); + + u32 end = atomic_fetch_add_explicit(&b->end, 1, memory_order_release); + ASSERT_DIE(j->open == LBI(j, b, end)); + + if (end == 0) + { + struct lfjour_block *prev = b->n.prev; - _Bool f = 0; ++ bool f = 0; + if (prev) + ASSERT_DIE(atomic_compare_exchange_strong_explicit(&prev->not_last, &f, 1, + memory_order_release, memory_order_relaxed)); + } + + /* Store the first item to announce (only if this is actually the first one). */ + struct lfjour_item *null_item = NULL; + if (atomic_compare_exchange_strong_explicit( + &j->first, &null_item, j->open, + memory_order_acq_rel, memory_order_relaxed)) + { + lfjour_debug("lfjour(%p) first set", j); + } + + j->open = NULL; + + if (!ev_active(&j->announce_kick_event)) + ev_send_loop(j->loop, &j->announce_kick_event); +} + +static struct lfjour_item * +lfjour_get_next(struct lfjour *j, const struct lfjour_item *last) +{ + /* This is lockless, no domain checks. */ + if (!last) + { + struct lfjour_item *first = atomic_load_explicit(&j->first, memory_order_acquire); + return first; + } + + struct lfjour_block *block = PAGE_HEAD(last); + ASSERT_DIE(block); + u32 end = atomic_load_explicit(&block->end, memory_order_acquire); + u32 pos = LBP(j, block, last); + ASSERT_DIE(pos < end); + + /* Next is in the same block. */ + if (++pos < end) + return LBI(j, block, pos); + + /* There is another block. */ + if (atomic_load_explicit(&block->not_last, memory_order_acquire)) + { + /* To avoid rare race conditions, we shall check the current block end once again */ + u32 new_end = atomic_load_explicit(&block->end, memory_order_acquire); + ASSERT_DIE(new_end >= end); + if (new_end > end) + return LBI(j, block, pos); + + /* Nothing in the previous one, let's move to the next block. + * This is OK to do non-atomically because of the not_last flag. */ + block = block->n.next; + return LBI(j, block, 0); + } + + /* There is nothing more. 
*/ + return NULL; +} + +struct lfjour_item * +lfjour_get(struct lfjour_recipient *r) +{ + struct lfjour *j = lfjour_of_recipient(r); + + const struct lfjour_item *last = r->cur; + struct lfjour_item *next = NULL; + + if (last) + next = lfjour_get_next(j, r->cur); + else + { + /* The last pointer may get cleaned up under our hands. + * Indicating that we're using it, by RCU read. */ + + rcu_read_lock(); + last = atomic_load_explicit(&r->last, memory_order_acquire); + next = lfjour_get_next(j, last); + rcu_read_unlock(); + } + + if (last) + { + lfjour_debug_detailed("lfjour(%p)_get(recipient=%p) returns %p, seq=%lu, last %p", + j, r, next, next ? next->seq : 0ULL, last); + } + else + { + lfjour_debug("lfjour(%p)_get(recipient=%p) returns %p, seq=%lu, clean", + j, r, next, next ? next->seq : 0ULL); + } + + if (!next) + return NULL; + + if (!r->first_holding_seq) + r->first_holding_seq = next->seq; + + return r->cur = next; +} + +void lfjour_release(struct lfjour_recipient *r, const struct lfjour_item *it) +{ + /* Find out what we actually released last */ + rcu_read_lock(); + const struct lfjour_item *last = atomic_load_explicit(&r->last, memory_order_acquire); + struct lfjour_block *last_block = last ? PAGE_HEAD(last) : NULL; + rcu_read_unlock(); + + /* This is lockless, no domain checks. */ + ASSERT_DIE(r->cur); + + /* Partial or full release? */ + ASSERT_DIE(r->first_holding_seq); + ASSERT_DIE(it->seq >= r->first_holding_seq); + if (it->seq < r->cur->seq) + { + lfjour_debug("lfjour(%p)_release(recipient=%p) of %p, partial upto seq=%lu", + j, r, it, it->seq); + r->first_holding_seq = it->seq + 1; + atomic_store_explicit(&r->last, it, memory_order_release); + return; + } + + struct lfjour_block *block = PAGE_HEAD(r->cur); + u32 end = atomic_load_explicit(&block->end, memory_order_acquire); + + struct lfjour *j = lfjour_of_recipient(r); + u32 pos = LBP(j, block, r->cur); + ASSERT_DIE(pos < end); + + /* Releasing this export for cleanup routine */ + if (pos + 1 == end) + { + lfjour_debug("lfjour(%p)_release(recipient=%p) of %p, seq=%lu (end)", + j, r, r->cur, r->cur->seq); + } + else + { + lfjour_debug_detailed("lfjour(%p)_release(recipient=%p) of %p, seq=%lu (mid)", + j, r, r->cur, r->cur->seq); + } + + atomic_store_explicit(&r->last, r->cur, memory_order_release); + + /* The last block may be available to free */ + if ((pos + 1 == end) || last && (last_block != block)) + lfjour_schedule_cleanup(j); + + r->first_holding_seq = 0; + r->cur = NULL; +} + +void +lfjour_announce_now(struct lfjour *j) +{ + ASSERT_DIE(birdloop_inside(j->loop)); + settle_cancel(&j->announce_timer); + ev_postpone(&j->announce_kick_event); + + if (EMPTY_TLIST(lfjour_recipient, &j->recipients)) + return lfjour_schedule_cleanup(j); + + WALK_TLIST(lfjour_recipient, r, &j->recipients) + if (r->event) + ev_send(r->target, r->event); +} + +static void +lfjour_announce_settle_hook(struct settle *s) +{ + return lfjour_announce_now(SKIP_BACK(struct lfjour, announce_timer, s)); +} + +static void +lfjour_announce_kick_hook(void *_j) +{ + struct lfjour *j = _j; + settle_kick(&j->announce_timer, j->loop); +} + +u64 +lfjour_pending_items(struct lfjour *j) +{ + ASSERT_DIE(!j->domain || DG_IS_LOCKED(j->domain)); + + struct lfjour_item *first = atomic_load_explicit(&j->first, memory_order_relaxed); + if (!first) + return 0; + + ASSERT_DIE(j->next_seq > first->seq); + return j->next_seq - first->seq; +} + +void +lfjour_register(struct lfjour *j, struct lfjour_recipient *r) +{ + ASSERT_DIE(!j->domain || DG_IS_LOCKED(j->domain)); + 
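+  /* A recipient must supply either both the notification event and its
+   * target list, or neither of them. */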
ASSERT_DIE(!r->event == !r->target); + + atomic_store_explicit(&r->last, NULL, memory_order_relaxed); + ASSERT_DIE(!r->cur); + + lfjour_recipient_add_tail(&j->recipients, r); +} + +void +lfjour_unregister(struct lfjour_recipient *r) +{ + struct lfjour *j = lfjour_of_recipient(r); + ASSERT_DIE(!j->domain || DG_IS_LOCKED(j->domain)); + + if (r->cur) + lfjour_release(r, r->cur); + + lfjour_recipient_rem_node(&j->recipients, r); + lfjour_schedule_cleanup(j); +} + +static inline void lfjour_cleanup_unlock_helper(struct domain_generic **dg) +{ + if (!*dg) return; + DG_UNLOCK(*dg); +} + +static void +lfjour_cleanup_hook(void *_j) +{ + struct lfjour *j = _j; + + CLEANUP(lfjour_cleanup_unlock_helper) struct domain_generic *_locked = j->domain; + if (_locked) DG_LOCK(_locked); + + u64 min_seq = ~((u64) 0); + const struct lfjour_item *last_item_to_free = NULL; + struct lfjour_item *first = atomic_load_explicit(&j->first, memory_order_acquire); + + if (!first) + { + /* Nothing to cleanup, actually, just call the done callback */ + ASSERT_DIE(EMPTY_TLIST(lfjour_block, &j->pending)); + CALL(j->cleanup_done, j, 0, ~((u64) 0)); + return; + } + + WALK_TLIST(lfjour_recipient, r, &j->recipients) + { + const struct lfjour_item *last = atomic_load_explicit(&r->last, memory_order_acquire); + + if (!last) + /* No last export means that the channel has exported nothing since last cleanup */ + return; + + else if (min_seq > last->seq) + { + min_seq = last->seq; + last_item_to_free = last; + } + } + + /* Here we're sure that no receiver is going to use the first pointer soon. + * It is only used when the receiver's last pointer is NULL, which is avoided by the code above. + * Thus, we can just move the journal's first pointer forward. */ + struct lfjour_item *next = last_item_to_free ? lfjour_get_next(j, last_item_to_free) : NULL; + atomic_store_explicit(&j->first, next, memory_order_release); + + lfjour_debug("lfjour(%p) set first=%p (was %p)", j, next, first); + + WALK_TLIST(lfjour_recipient, r, &j->recipients) + { + const struct lfjour_item *last = last_item_to_free; + /* This either succeeds if this item is the most-behind-one, + * or fails and gives us the actual last for debug output. */ + if (atomic_compare_exchange_strong_explicit( + &r->last, &last, NULL, + memory_order_acq_rel, memory_order_acquire)) + { + lfjour_debug("lfjour(%p)_cleanup(recipient=%p): store last=NULL", j, r); + } + else + { + lfjour_debug("lfjour(%p)_cleanup(recipient=%p): keep last=%p", j, r, last); + } + } + + /* Now some recipients may have old last-pointers. We have to wait + * until they finish their routine, before we start cleaning up. 
*/ + synchronize_rcu(); + + u64 orig_first_seq = first->seq; + + /* Now we do the actual cleanup */ + while (first && (first->seq <= min_seq)) + { + j->item_done(j, first); + + /* Find next journal item */ + struct lfjour_item *next = lfjour_get_next(j, first); + if (PAGE_HEAD(next) != PAGE_HEAD(first)) + { + /* This was the last one in its block */ + struct lfjour_block *block = PAGE_HEAD(first); + lfjour_debug("lfjour(%p)_cleanup: freeing block %p", j, block); + ASSERT_DIE(block == j->pending.first); + + /* Free this block */ + lfjour_block_rem_node(&j->pending, block); + + /* Wait for possible pending readers of the block */ + synchronize_rcu(); + + /* Now we can finally drop the block */ +#ifdef LOCAL_DEBUG + memset(block, 0xbe, page_size); +#endif + free_page(block); + + /* If no more blocks are remaining, we shall reset + * the sequence numbers */ + + if (EMPTY_TLIST(lfjour_block, &j->pending)) + { + lfjour_debug("lfjour(%p)_cleanup: seq reset", j); + WALK_TLIST(lfjour_recipient, r, &j->recipients) + atomic_fetch_or_explicit(&r->recipient_flags, LFJOUR_R_SEQ_RESET, memory_order_acq_rel); + + j->next_seq = 1; + } + } + + /* And now move on to the next item */ + first = next; + } + + CALL(j->cleanup_done, j, orig_first_seq, first ? first->seq : ~((u64) 0)); +} + +void +lfjour_init(struct lfjour *j, struct settle_config *scf) +{ + /* Expecting all other fields to be initialized to zeroes by the caller */ + ASSERT_DIE(j->loop); + ASSERT_DIE(j->item_size >= sizeof(struct lfjour_item)); + + j->item_size = BIRD_CPU_ALIGN(j->item_size); + j->item_count = (page_size - sizeof(struct lfjour_block)) / j->item_size; + + j->next_seq = 1; + j->announce_kick_event = (event) { + .hook = lfjour_announce_kick_hook, + .data = j, + }; + j->announce_timer = SETTLE_INIT(scf, lfjour_announce_settle_hook, j); + j->cleanup_event = (event) { + .hook = lfjour_cleanup_hook, + .data = j, + }; +} diff --cc lib/lockfree.h index f99704b36,000000000..ab7f7d0e2 mode 100644,000000..100644 --- a/lib/lockfree.h +++ b/lib/lockfree.h @@@ -1,284 -1,0 +1,284 @@@ +/* + * BIRD Library -- Generic lock-free structures + * + * (c) 2023--2024 Maria Matejka + * (c) 2023--2024 CZ.NIC, z.s.p.o. + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_LOCKFREE_H_ +#define _BIRD_LOCKFREE_H_ + +#include "lib/defer.h" +#include "lib/event.h" +#include "lib/rcu.h" +#include "lib/settle.h" +#include "lib/tlists.h" +#include "lib/io-loop.h" + +#include + +/** + * Lock-free usecounts. + */ + +struct lfuc { + _Atomic u64 uc; +}; + +#define LFUC_PU_SHIFT 44 +#define LFUC_IN_PROGRESS (1ULL << LFUC_PU_SHIFT) + +/** + * lfuc_lock - increase an atomic usecount + * @c: the usecount structure + */ +static inline u64 lfuc_lock(struct lfuc *c) +{ + /* Locking is trivial; somebody already holds the underlying data structure + * so we just increase the use count. Nothing can be freed underneath our hands. */ + u64 uc = atomic_fetch_add_explicit(&c->uc, 1, memory_order_acq_rel); + ASSERT_DIE(uc > 0); + return uc & (LFUC_IN_PROGRESS - 1); +} + +/** + * lfuc_lock_revive - increase an atomic usecount even if it's zero + * @c: the usecount structure + * + * If the caller is sure that they can't collide with the prune routine, + * they can call this even on structures with already zeroed usecount. + * Handy for situations with flapping routes. Use only from the same + * loop as which runs the prune routine. 
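+ *
+ * A minimal usage sketch (illustrative only; obj and its prune machinery
+ * are hypothetical) -- the same two-call pattern appears in
+ * net_lock_revive_unlock() in lib/netindex.c later in this diff:
+ *
+ *   lfuc_lock_revive(&obj->uc);                     // take a ref even if the count is 0
+ *   lfuc_unlock(&obj->uc, prune_list, prune_event); // hand the ref back later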
+ */ +static inline u64 lfuc_lock_revive(struct lfuc *c) +{ + u64 uc = atomic_fetch_add_explicit(&c->uc, 1, memory_order_acq_rel); + return uc & (LFUC_IN_PROGRESS - 1); +} + +/** + * lfuc_unlock_immediately - decrease an atomic usecount + * @c: the usecount structure + * @el: prune event list + * @ev: prune event itself + * + * If the usecount reaches zero, a prune event is run to possibly free the object. + * The prune event MUST use lfuc_finished() to check the object state. + */ +static inline void lfuc_unlock_immediately(struct lfuc *c, event_list *el, event *ev) +{ + /* Unlocking is tricky. We do it lockless so at the same time, the prune + * event may be running, therefore if the unlock gets us to zero, it must be + * the last thing in this routine, otherwise the prune routine may find the + * source's usecount zeroed, freeing it prematurely. + * + * The usecount is split into two parts: + * the top 20 bits are an in-progress indicator + * the bottom 44 bits keep the actual usecount. + * + * Therefore at most 1 million of writers can simultaneously unlock the same + * structure, while at most ~17T different places can reference it. Both limits + * are insanely high from the 2022 point of view. Let's suppose that when 17T + * routes or 1M peers/tables get real, we get also 128bit atomic variables in the + * C norm. */ + + /* First, we push the in-progress indicator */ + u64 uc = atomic_fetch_add_explicit(&c->uc, LFUC_IN_PROGRESS, memory_order_acq_rel); + + /* Then we split the indicator to its parts. Remember, we got the value + * before the operation happened so we're re-doing the operation locally + * to get a view how the indicator _would_ look if nobody else was interacting. + */ + u64 pending = (uc >> LFUC_PU_SHIFT) + 1; + uc &= LFUC_IN_PROGRESS - 1; + + /* Obviously, there can't be more pending unlocks than the usecount itself */ + if (uc == pending) + /* If we're the last unlocker (every owner is already unlocking), schedule + * the owner's prune event */ + ev_send(el, ev); + else + ASSERT_DIE(uc > pending); + + /* And now, finally, simultaneously pop the in-progress indicator and the + * usecount, possibly allowing the pruning routine to free this structure */ + uc = atomic_fetch_sub_explicit(&c->uc, LFUC_IN_PROGRESS + 1, memory_order_acq_rel); + +// return uc - LFUC_IN_PROGRESS - 1; +} + +struct lfuc_unlock_queue_item { + struct deferred_call dc; + struct lfuc *c; + event_list *el; + event *ev; +}; + +void lfuc_unlock_deferred(struct deferred_call *dc); + +static inline void lfuc_unlock(struct lfuc *c, event_list *el, event *ev) +{ + struct lfuc_unlock_queue_item luqi = { + .dc.hook = lfuc_unlock_deferred, + .c = c, + .el = el, + .ev = ev, + }; + + defer_call(&luqi.dc, sizeof luqi); +} + +/** + * lfuc_finished - auxiliary routine for prune event + * @c: usecount structure + * + * This routine simply waits until all unlockers finish their job and leave + * the critical section of lfuc_unlock(). Then we decide whether the usecount + * is indeed zero or not, and therefore whether the structure is free to be freed. + */ - static inline _Bool ++static inline bool +lfuc_finished(struct lfuc *c) +{ + u64 uc; + /* Wait until all unlockers finish */ + while ((uc = atomic_load_explicit(&c->uc, memory_order_acquire)) >> LFUC_PU_SHIFT) + birdloop_yield(); + + /* All of them are now done and if the usecount is now zero, then we're + * the last place to reference the object and we can call it finished. 
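+ *
+ * A sketch of the typical shape of such a prune hook (illustrative only;
+ * obj and free_obj() are hypothetical) -- netindex_hash_cleanup() in
+ * lib/netindex.c later in this diff follows the same pattern:
+ *
+ *   if (!lfuc_finished(&obj->uc))
+ *     return;        // still referenced, keep it and retry on the next prune
+ *   free_obj(obj);   // nobody else can reach the object now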
*/ + return (uc == 0); +} + +/** + * lfuc_init - auxiliary routine for usecount initialization + * @c: usecount structure + * + * Called on object initialization, sets the usecount to an initial one to make + * sure that the prune routine doesn't free it before somebody else references it. + */ +static inline void +lfuc_init(struct lfuc *c) +{ + atomic_store_explicit(&c->uc, 1, memory_order_release); +} + + +/** + * Lock-free journal. + */ + +/* Journal item. Put LFJOUR_ITEM_INHERIT(name) into your structure + * to inherit lfjour_item */ +#define LFJOUR_ITEM \ + u64 seq; \ + +struct lfjour_item { + LFJOUR_ITEM; +}; + +#define LFJOUR_ITEM_INHERIT(name) union { \ + struct lfjour_item name; \ + struct { LFJOUR_ITEM; }; \ +} + +/* Journal item block. Internal structure, no need to check out. */ +#define TLIST_PREFIX lfjour_block +#define TLIST_TYPE struct lfjour_block +#define TLIST_ITEM n +#define TLIST_WANT_ADD_TAIL + +struct lfjour_block { + TLIST_DEFAULT_NODE; + _Atomic u32 end; - _Atomic _Bool not_last; ++ _Atomic bool not_last; + + struct lfjour_item _block[0]; +}; + +/* Defines lfjour_block_list */ +#include "lib/tlists.h" + +/* Journal recipient. Inherit this in your implementation. */ +#define TLIST_PREFIX lfjour_recipient +#define TLIST_TYPE struct lfjour_recipient +#define TLIST_ITEM n +#define TLIST_WANT_ADD_TAIL +#define TLIST_WANT_WALK + +struct lfjour_recipient { + TLIST_DEFAULT_NODE; + event *event; /* Event running when something is in the journal */ + event_list *target; /* Event target */ + const struct lfjour_item * _Atomic last; /* Last item processed */ + u64 first_holding_seq; /* First item not released yet */ + struct lfjour_item *cur; /* Processing this now */ + _Atomic u64 recipient_flags; /* LFJOUR_R_* */ +}; + +enum lfjour_recipient_flags { + LFJOUR_R_SEQ_RESET = 1, /* Signalling of sequence number reset */ +}; + +/* Defines lfjour_recipient_list */ +#include "lib/tlists.h" + +/* Journal base structure. Include this. 
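+ * A rough sketch of the intended embedding (illustrative only; the names
+ * my_update and my_feed are hypothetical):
+ *
+ *   struct my_update { LFJOUR_ITEM_INHERIT(li); u32 payload; };
+ *   struct my_feed   { struct lfjour j; ... };
+ *
+ *   // zero-init, then set j.loop, j.item_size = sizeof(struct my_update),
+ *   // the item_done / cleanup_done hooks, and call lfjour_init(&j, &scf).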
*/ +struct lfjour { + struct domain_generic *domain; /* The journal itself belongs to this domain (if different from the loop) */ + struct birdloop *loop; /* Cleanup loop */ + u32 item_size, item_count; /* Allocation parameters */ + struct lfjour_block_list pending; /* List of packed journal blocks */ + struct lfjour_item * _Atomic first; /* First journal item to announce */ + struct lfjour_item *open; /* Journal item in progress */ + u64 next_seq; /* Next export to push has this ID */ + struct lfjour_recipient_list recipients; /* Announce updates to these */ + event announce_kick_event; /* Kicks announce_timer */ + struct settle announce_timer; /* Announces changes to recipients */ + event cleanup_event; /* Runs the journal cleanup routine */ + + /* Callback on item removal from journal */ + void (*item_done)(struct lfjour *, struct lfjour_item *); + + /* Callback when the cleanup routine is ending */ + void (*cleanup_done)(struct lfjour *, u64 begin_seq, u64 end_seq); +}; + +struct lfjour_item *lfjour_push_prepare(struct lfjour *); +void lfjour_push_commit(struct lfjour *); + +struct lfjour_item *lfjour_get(struct lfjour_recipient *); +void lfjour_release(struct lfjour_recipient *, const struct lfjour_item *); - static inline _Bool lfjour_reset_seqno(struct lfjour_recipient *r) ++static inline bool lfjour_reset_seqno(struct lfjour_recipient *r) +{ + return atomic_fetch_and_explicit(&r->recipient_flags, ~LFJOUR_R_SEQ_RESET, memory_order_acq_rel) & LFJOUR_R_SEQ_RESET; +} + +void lfjour_announce_now(struct lfjour *); +u64 lfjour_pending_items(struct lfjour *); + +static inline void lfjour_schedule_cleanup(struct lfjour *j) +{ ev_send_loop(j->loop, &j->cleanup_event); } + +static inline void lfjour_do_cleanup_now(struct lfjour *j) +{ + /* This requires the caller to own the cleanup event loop */ + ev_postpone(&j->cleanup_event); + j->cleanup_event.hook(j->cleanup_event.data); +} + +void lfjour_register(struct lfjour *, struct lfjour_recipient *); +void lfjour_unregister(struct lfjour_recipient *); +static inline uint lfjour_count_recipients(struct lfjour *j) +{ return TLIST_LENGTH(lfjour_recipient, &j->recipients); } + +void lfjour_init(struct lfjour *, struct settle_config *); + + +static inline struct lfjour *lfjour_of_recipient(struct lfjour_recipient *r) +{ + struct lfjour_recipient_list *list = lfjour_recipient_enlisted(r); + return list ? SKIP_BACK(struct lfjour, recipients, list) : NULL; +} +#endif diff --cc lib/locking.h index 30450535f,000000000..0251f8769 mode 100644,000000..100644 --- a/lib/locking.h +++ b/lib/locking.h @@@ -1,531 -1,0 +1,531 @@@ +/* + * BIRD Library -- Locking + * + * (c) 2020--2021 Maria Matejka + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_LOCKING_H_ +#define _BIRD_LOCKING_H_ + +#include "lib/birdlib.h" +#include "lib/macro.h" +#include "lib/rcu.h" + +struct domain_generic; +struct pool; + +#define LOCK_ORDER \ + the_bird, \ + meta, \ + control, \ + proto, \ + service, \ + rtable, \ + attrs, \ + logging, \ + resource, \ + +/* Here define the global lock order; first to last. 
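+ *
+ * Domains are created at one of these orders and locked through the
+ * type-checked macros below, for instance (mirroring netindex_hash_new()
+ * in lib/netindex.c later in this diff):
+ *
+ *   DOMAIN(attrs) dom = DOMAIN_NEW_RCU_SYNC(attrs);
+ *   LOCK_DOMAIN(attrs, dom);
+ *   ...                         // attrs-order critical section
+ *   UNLOCK_DOMAIN(attrs, dom);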
*/ +struct lock_order { +#define LOCK_ORDER_EXPAND(p) struct domain_generic *p; + MACRO_FOREACH(LOCK_ORDER_EXPAND, LOCK_ORDER) +#undef LOCK_ORDER_EXPAND +}; + +#define LOCK_ORDER_EXPAND(p) struct domain__##p { struct domain_generic *p; }; + MACRO_FOREACH(LOCK_ORDER_EXPAND, LOCK_ORDER) +#undef LOCK_ORDER_EXPAND + +extern _Thread_local struct lock_order locking_stack; +extern _Thread_local struct domain_generic **last_locked; + +#define DOMAIN(type) struct domain__##type +#define DOMAIN_ORDER(type) OFFSETOF(struct lock_order, type) + +#define DOMAIN_NEW(type) (DOMAIN(type)) { .type = domain_new(DOMAIN_ORDER(type), 1) } +#define DOMAIN_NEW_RCU_SYNC(type) (DOMAIN(type)) { .type = domain_new(DOMAIN_ORDER(type), 0) } - struct domain_generic *domain_new(uint order, _Bool allow_rcu); ++struct domain_generic *domain_new(uint order, bool allow_rcu); + +#define DOMAIN_FREE(type, d) domain_free((d).type) +void domain_free(struct domain_generic *); + +#define DOMAIN_NAME(type, d) domain_name((d).type) +const char *domain_name(struct domain_generic *); + +#define DOMAIN_SETUP(type, d, n, p) domain_setup((d).type, n, p) +void domain_setup(struct domain_generic *, const char *name, struct pool *); + +#define DOMAIN_NULL(type) (DOMAIN(type)) {} + +#define LOCK_DOMAIN(type, d) do_lock(((d).type), &(locking_stack.type)) +#define UNLOCK_DOMAIN(type, d) do_unlock(((d).type), &(locking_stack.type)) + +#define DOMAIN_IS_LOCKED(type, d) (((d).type) == (locking_stack.type)) +#define DG_IS_LOCKED(d) ((d) == *(DG_LSP(d))) + +/* Internal for locking */ +void do_lock(struct domain_generic *dg, struct domain_generic **lsp); +void do_unlock(struct domain_generic *dg, struct domain_generic **lsp); + +uint dg_order(struct domain_generic *dg); + +#define DG_LSP(d) ((struct domain_generic **) (((void *) &locking_stack) + dg_order(d))) +#define DG_LOCK(d) do_lock(d, DG_LSP(d)) +#define DG_UNLOCK(d) do_unlock(d, DG_LSP(d)) + +/* Use with care. To be removed in near future. 
*/ +extern DOMAIN(the_bird) the_bird_domain; + +#define the_bird_lock() LOCK_DOMAIN(the_bird, the_bird_domain) +#define the_bird_unlock() UNLOCK_DOMAIN(the_bird, the_bird_domain) +#define the_bird_locked() DOMAIN_IS_LOCKED(the_bird, the_bird_domain) + +#define ASSERT_THE_BIRD_LOCKED ({ if (!the_bird_locked()) bug("The BIRD lock must be locked here: %s:%d", __FILE__, __LINE__); }) + +/* + * RW spinlocks + */ + +#define RWS_READ_PENDING_POS 0 +#define RWS_READ_ACTIVE_POS 20 +#define RWS_WRITE_PENDING_POS 40 +#define RWS_WRITE_ACTIVE_POS 56 + +#define RWS_READ_PENDING (1ULL << RWS_READ_PENDING_POS) +#define RWS_READ_ACTIVE (1ULL << RWS_READ_ACTIVE_POS) +#define RWS_WRITE_PENDING (1ULL << RWS_WRITE_PENDING_POS) +#define RWS_WRITE_ACTIVE (1ULL << RWS_WRITE_ACTIVE_POS) + +#define RWS_READ_PENDING_MASK (RWS_READ_ACTIVE - 1) +#define RWS_READ_ACTIVE_MASK ((RWS_WRITE_PENDING - 1) & ~(RWS_READ_ACTIVE - 1)) +#define RWS_WRITE_PENDING_MASK ((RWS_WRITE_ACTIVE - 1) & ~(RWS_WRITE_PENDING - 1)) +#define RWS_WRITE_ACTIVE_MASK (~(RWS_WRITE_ACTIVE - 1)) + +typedef struct { + u64 _Atomic spin; +} rw_spinlock; + +#ifdef DEBUGGING +#define MAX_RWS_AT_ONCE 32 +extern _Thread_local rw_spinlock *rw_spinlocks_taken[MAX_RWS_AT_ONCE]; +extern _Thread_local btime rw_spinlocks_time[MAX_RWS_AT_ONCE]; +extern _Thread_local u32 rw_spinlocks_taken_cnt; +extern _Thread_local u32 rw_spinlocks_taken_write; + +/* Borrowed from lib/timer.h */ +btime current_time_now(void); + - static inline void rws_mark(rw_spinlock *p, _Bool write, _Bool lock) ++static inline void rws_mark(rw_spinlock *p, bool write, bool lock) +{ + if (lock) { + ASSERT_DIE(rw_spinlocks_taken_cnt < MAX_RWS_AT_ONCE); + if (write) + rw_spinlocks_taken_write |= (1 << rw_spinlocks_taken_cnt); + else + rw_spinlocks_taken_write &= ~(1 << rw_spinlocks_taken_cnt); + rw_spinlocks_time[rw_spinlocks_taken_cnt] = current_time_now(); + rw_spinlocks_taken[rw_spinlocks_taken_cnt++] = p; + + } + else { + ASSERT_DIE(rw_spinlocks_taken_cnt > 0); + ASSERT_DIE(rw_spinlocks_taken[--rw_spinlocks_taken_cnt] == p); + ASSERT_DIE(!(rw_spinlocks_taken_write & (1 << rw_spinlocks_taken_cnt)) == !write); + btime tdif = current_time_now() - rw_spinlocks_time[rw_spinlocks_taken_cnt]; + if (tdif > 1 S_) + log(L_WARN "Spent an alarming time %t s in spinlock %p (%s); " + "if this happens often to you, please contact the developers.", + tdif, p, write ? "write" : "read"); + } +} +#else +#define rws_mark(...) 
+#endif + +static inline void rws_init(rw_spinlock *p) +{ + atomic_store_explicit(&p->spin, 0, memory_order_relaxed); +} + +static inline void rws_read_lock(rw_spinlock *p) +{ + u64 old = atomic_fetch_add_explicit(&p->spin, RWS_READ_PENDING, memory_order_acquire); + + while (1) + { + /* Wait until all writers end */ + while (old & (RWS_WRITE_PENDING_MASK | RWS_WRITE_ACTIVE_MASK)) + { + birdloop_yield(); + old = atomic_load_explicit(&p->spin, memory_order_acquire); + } + + /* Convert to active */ + old = atomic_fetch_add_explicit(&p->spin, RWS_READ_ACTIVE - RWS_READ_PENDING, memory_order_acq_rel); + + if (old & RWS_WRITE_ACTIVE_MASK) + /* Oh but some writer was faster */ + old = atomic_fetch_sub_explicit(&p->spin, RWS_READ_ACTIVE - RWS_READ_PENDING, memory_order_acq_rel); + else + /* No writers, approved */ + break; + } + + rws_mark(p, 0, 1); +} + +static inline void rws_read_unlock(rw_spinlock *p) +{ + rws_mark(p, 0, 0); + u64 old = atomic_fetch_sub_explicit(&p->spin, RWS_READ_ACTIVE, memory_order_release); + ASSERT_DIE(old & RWS_READ_ACTIVE_MASK); +} + +static inline void rws_write_lock(rw_spinlock *p) +{ + u64 old = atomic_fetch_add_explicit(&p->spin, RWS_WRITE_PENDING, memory_order_acquire); + + /* Wait until all active readers end */ + while (1) + { + while (old & (RWS_READ_ACTIVE_MASK | RWS_WRITE_ACTIVE_MASK)) + { + birdloop_yield(); + old = atomic_load_explicit(&p->spin, memory_order_acquire); + } + + /* Mark self as active */ + u64 updated = atomic_fetch_or_explicit(&p->spin, RWS_WRITE_ACTIVE, memory_order_acquire); + + /* And it's us */ + if (!(updated & RWS_WRITE_ACTIVE)) + { + if (updated & RWS_READ_ACTIVE_MASK) + /* But some reader was faster */ + atomic_fetch_and_explicit(&p->spin, ~RWS_WRITE_ACTIVE, memory_order_release); + else + /* No readers, approved */ + break; + } + } + + /* It's us, then we aren't actually pending */ + u64 updated = atomic_fetch_sub_explicit(&p->spin, RWS_WRITE_PENDING, memory_order_acquire); + ASSERT_DIE(updated & RWS_WRITE_PENDING_MASK); + rws_mark(p, 1, 1); +} + +static inline void rws_write_unlock(rw_spinlock *p) +{ + rws_mark(p, 1, 0); + u64 old = atomic_fetch_and_explicit(&p->spin, ~RWS_WRITE_ACTIVE, memory_order_release); + ASSERT_DIE(old & RWS_WRITE_ACTIVE); +} + + +/* + * Unwind stored lock state helpers + */ +struct locking_unwind_status { + struct lock_order *desired; + enum { + LOCKING_UNWIND_SAME, + LOCKING_UNWIND_UNLOCK, + } state; +}; + +static inline struct locking_unwind_status locking_unwind_helper(struct locking_unwind_status status, uint order) +{ + struct domain_generic **lsp = ((void *) &locking_stack) + order; + struct domain_generic **dp = ((void *) status.desired) + order; + + if (!status.state) + { + /* Just checking that the rest of the stack is consistent */ + if (*lsp != *dp) + bug("Mangled lock unwind state at order %d", order); + } + else if (*dp) + /* Stored state expects locked */ + if (*lsp == *dp) + /* Indeed is locked, switch to check mode */ + status.state = 0; + else + /* Not locked or locked elsewhere */ + bug("Mangled lock unwind state at order %d", order); + else if (*lsp) + /* Stored state expects unlocked but we're locked */ + DG_UNLOCK(*lsp); + + return status; +} + +static inline void locking_unwind(struct lock_order *desired) +{ + struct locking_unwind_status status = { + .desired = desired, + .state = LOCKING_UNWIND_UNLOCK, + }; + +#define LOCK_ORDER_POS_HELPER(x) DOMAIN_ORDER(x), +#define LOCK_ORDER_POS MACRO_FOREACH(LOCK_ORDER_POS_HELPER, LOCK_ORDER) + MACRO_RPACK(locking_unwind_helper, status, 
LOCK_ORDER_POS); +#undef LOCK_ORDER_POS_HELPER +} + +/** + * Objects bound with domains + * + * First, we need some object to have its locked and unlocked part. + * This is accomplished typically by the following pattern: + * + * struct foo_public { + * ... // Public fields + * DOMAIN(bar) lock; // The assigned domain + * }; + * + * struct foo_private { + * struct foo_public; // Importing public fields + * struct foo_private **locked_at; // Auxiliary field for locking routines + * ... // Private fields + * }; + * + * typedef union foo { + * struct foo_public; + * struct foo_private priv; + * } foo; + * + * All persistently stored object pointers MUST point to the public parts. + * If accessing the locked object from embedded objects, great care must + * be applied to always SKIP_BACK to the public object version, not the + * private one. + * + * To access the private object parts, either the private object pointer + * is explicitly given to us, therefore assuming somewhere else the domain + * has been locked, or we have to lock the domain ourselves. To do that, + * there are some handy macros. + */ + +#define LOBJ_LOCK_SIMPLE(_obj, _level) \ + ({ LOCK_DOMAIN(_level, (_obj)->lock); &(_obj)->priv; }) + +#define LOBJ_UNLOCK_SIMPLE(_obj, _level) \ + UNLOCK_DOMAIN(_level, (_obj)->lock) + +/* + * These macros can be used to define specific macros for given class. + * + * #define FOO_LOCK_SIMPLE(foo) LOBJ_LOCK_SIMPLE(foo, bar) + * #define FOO_UNLOCK_SIMPLE(foo) LOBJ_UNLOCK_SIMPLE(foo, bar) + * + * Then these can be used like this: + * + * void foo_frobnicate(foo *f) + * { + * // Unlocked context + * ... + * struct foo_private *fp = FOO_LOCK_SIMPLE(f); + * // Locked context + * ... + * FOO_UNLOCK_SIMPLE(f); + * // Unlocked context + * ... + * } + * + * These simple calls have two major drawbacks. First, if you return + * from locked context, you don't unlock, which may lock you dead. + * And second, the foo_private pointer is still syntactically valid + * even after unlocking. + * + * To fight this, we need more magic and the switch should stay in that + * position. + * + * First, we need an auxiliary _function_ for unlocking. This function + * is intended to be called in a local variable cleanup context. + */ + +#define LOBJ_UNLOCK_CLEANUP_NAME(_stem) _lobj__##_stem##_unlock_cleanup + +#define LOBJ_UNLOCK_CLEANUP(_stem, _level) \ + static inline void LOBJ_UNLOCK_CLEANUP_NAME(_stem)(struct _stem##_private **obj) { \ + if (!*obj) return; \ + ASSERT_DIE(LOBJ_IS_LOCKED((*obj), _level)); \ + ASSERT_DIE((*obj)->locked_at == obj); \ + (*obj)->locked_at = NULL; \ + UNLOCK_DOMAIN(_level, (*obj)->lock); \ + } + +#define LOBJ_LOCK(_obj, _pobj, _stem, _level) \ + CLEANUP(LOBJ_UNLOCK_CLEANUP_NAME(_stem)) struct _stem##_private *_pobj = LOBJ_LOCK_SIMPLE(_obj, _level); _pobj->locked_at = &_pobj; + +/* + * And now the usage of these macros. You first need to declare the auxiliary + * cleanup function. + * + * LOBJ_UNLOCK_CLEANUP(foo, bar); + * + * And then declare the lock-local macro: + * + * #define FOO_LOCK(foo, fpp) LOBJ_LOCK(foo, fpp, foo, bar) + * + * This construction then allows you to lock much more safely: + * + * void foo_frobnicate_safer(foo *f) + * { + * // Unlocked context + * ... + * do { + * FOO_LOCK(foo, fpp); + * // Locked context, fpp is valid here + * + * if (something) return; // This implicitly unlocks + * if (whatever) break; // This unlocks too + * + * // Finishing context with no unlock at all + * } while (0); + * + * // Here is fpp invalid and the object is back unlocked. + * ... 
+ * } + * + * There is no explicit unlock statement. To unlock, simply leave the block + * with locked context. + * + * This may be made even nicer to use by employing a for-cycle. + */ + +#define LOBJ_LOCKED(_obj, _pobj, _stem, _level) \ + for (CLEANUP(LOBJ_UNLOCK_CLEANUP_NAME(_stem)) struct _stem##_private *_pobj = LOBJ_LOCK_SIMPLE(_obj, _level); \ + _pobj ? (_pobj->locked_at = &_pobj) : NULL; \ + LOBJ_UNLOCK_CLEANUP_NAME(_stem)(&_pobj), _pobj = NULL) + +/* + * This for-cycle employs heavy magic to hide as much of the boilerplate + * from the user as possibly needed. Here is how it works. + * + * First, the for-1 clause is executed, setting up _pobj, to the private + * object pointer. It has a cleanup hook set. + * + * Then, the for-2 clause is checked. As _pobj is non-NULL, _pobj->locked_at + * is initialized to the _pobj address to ensure that the cleanup hook unlocks + * the right object. + * + * Now the user block is executed. If it ends by break or return, the cleanup + * hook fires for _pobj, triggering object unlock. + * + * If the user block executed completely, the for-3 clause is run, executing + * the cleanup hook directly and then deactivating it by setting _pobj to NULL. + * + * Finally, the for-2 clause is checked again but now with _pobj being NULL, + * causing the loop to end. As the object has already been unlocked, nothing + * happens after leaving the context. + * + * #define FOO_LOCKED(foo, fpp) LOBJ_LOCKED(foo, fpp, foo, bar) + * + * Then the previous code can be modified like this: + * + * void foo_frobnicate_safer(foo *f) + * { + * // Unlocked context + * ... + * FOO_LOCKED(foo, fpp) + * { + * // Locked context, fpp is valid here + * + * if (something) return; // This implicitly unlocks + * if (whatever) break; // This unlocks too + * + * // Finishing context with no unlock at all + * } + * + * // Unlocked context + * ... + * + * // Locking once again without an explicit block + * FOO_LOCKED(foo, fpp) + * do_something(fpp); + * + * // Here is fpp invalid and the object is back unlocked. + * ... + * } + * + * + * For many reasons, a lock-check macro is handy. + * + * #define FOO_IS_LOCKED(foo) LOBJ_IS_LOCKED(foo, bar) + */ + +#define LOBJ_IS_LOCKED(_obj, _level) DOMAIN_IS_LOCKED(_level, (_obj)->lock) + +/* + * An example implementation is available in lib/locking_test.c + */ + + +/* + * Please don't use this macro unless you at least try to prove that + * it's completely safe. It's a can of worms. + * + * NEVER RETURN OR BREAK FROM THIS MACRO, it will crash. + */ + +#define LOBJ_UNLOCKED_TEMPORARILY(_obj, _pobj, _stem, _level) \ + for (union _stem *_obj = SKIP_BACK(union _stem, priv, _pobj), **_lataux = (union _stem **) _pobj->locked_at; \ + _obj ? (_pobj->locked_at = NULL, LOBJ_UNLOCK_SIMPLE(_obj, _level), _obj) : NULL; \ + LOBJ_LOCK_SIMPLE(_obj, _level), _pobj->locked_at = (struct _stem##_private **) _lataux, _obj = NULL) + +/* + * Get the locked object when the lock is already taken + */ + +#define LOBJ_PRIV(_obj, _level) \ + ({ ASSERT_DIE(DOMAIN_IS_LOCKED(_level, (_obj)->lock)); &(_obj)->priv; }) + + +/* + * RCU retry unwinder + * + * Start a retriable operation with RCU_ANCHOR() and pass the _i object along + * with the code which may then call RCU_RETRY() to return back to RCU_ANCHOR + * and try again. 
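+ *
+ * A rough usage sketch (illustrative only; every name below is a
+ * hypothetical stand-in -- net_find_index() in lib/netindex.c later in
+ * this diff uses the same anchor):
+ *
+ *   RCU_ANCHOR(u);              // enters the RCU read-side section
+ *   struct thing *t = lookup_fragile();
+ *   if (went_wrong(t))
+ *     RCU_RETRY(u);             // jumps back to RCU_ANCHOR and retries
+ *   return use(t);              // read-side section left via the cleanup hook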
+ */ + +struct rcu_unwinder { + struct lock_order locking_stack; + u32 retry; + u8 fast; + jmp_buf buf; +}; + +static inline void _rcu_unwinder_unlock_(struct rcu_unwinder *o UNUSED) +{ + rcu_read_unlock(); +} + +#define RCU_UNWIND_WARN 4096 + +#define RCU_ANCHOR(_i) \ + CLEANUP(_rcu_unwinder_unlock_) struct rcu_unwinder _s##_i = {}; \ + struct rcu_unwinder *_i = &_s##_i; \ + if (setjmp(_i->buf)) { \ + rcu_read_unlock(); \ + locking_unwind(&_i->locking_stack); \ + if (_i->fast) _i->fast = 0; \ + else { \ + birdloop_yield(); \ + if (!(++_i->retry % RCU_UNWIND_WARN)) \ + log(L_WARN "Suspiciously many RCU_ANCHORs retried (%lu)" \ + " at %s:%d", _i->retry, __FILE__, __LINE__); \ + } \ + } \ + _i->locking_stack = locking_stack; \ + rcu_read_lock(); \ + +#define RCU_RETRY(_i) do { if (_i) longjmp(_i->buf, 1); else bug("No rcu retry allowed here"); } while (0) + +#define RCU_RETRY_FAST(_i) do { (_i)->fast++; RCU_RETRY(_i); } while (0) + +#define RCU_WONT_RETRY ((struct rcu_unwinder *) NULL) +#endif diff --cc lib/locking_test.c index 38faed61b,000000000..8792194bc mode 100644,000000..100644 --- a/lib/locking_test.c +++ b/lib/locking_test.c @@@ -1,180 -1,0 +1,180 @@@ +#include "test/birdtest.h" +#include "test/bt-utils.h" + +#include "lib/locking.h" +#include +#include + +#define FOO_PUBLIC \ + const char *name; \ + _Atomic uint counter; \ + DOMAIN(proto) lock; \ + +struct foo_private { + struct { FOO_PUBLIC; }; + struct foo_private **locked_at; + uint private_counter; +}; + +typedef union foo { + struct { FOO_PUBLIC; }; + struct foo_private priv; +} foo; + +LOBJ_UNLOCK_CLEANUP(foo, proto); +#define FOO_LOCK(_foo, _fpp) LOBJ_LOCK(_foo, _fpp, foo, proto) +#define FOO_LOCKED(_foo, _fpp) LOBJ_LOCKED(_foo, _fpp, foo, proto) +#define FOO_IS_LOCKED(_foo) LOBJ_IS_LOCKED(_foo, proto) + +static uint +inc_public(foo *f) +{ + return atomic_fetch_add_explicit(&f->counter, 1, memory_order_relaxed) + 1; +} + +static uint +inc_private(foo *f) +{ + FOO_LOCKED(f, fp) return ++fp->private_counter; + bug("Returning always"); +} + +#define BLOCKCOUNT 4096 +#define THREADS 16 +#define REPEATS 128 + +static void * +thread_run(void *_foo) +{ + foo *f = _foo; + + for (int i=0; irws[i]); + rws_read_lock(&d->rws[i+1]); + + ASSERT_DIE(d->data[i] >= 0); + ASSERT_DIE(d->data[i+1] >= 0); + if (d->data[i] > d->data[i+1]) + sorted = 0; + + rws_read_unlock(&d->rws[i+1]); + rws_read_unlock(&d->rws[i]); + } + + for (int i=0; (irws[i]); + rws_write_lock(&d->rws[i+1]); + + int first = d->data[i]; + int second = d->data[i+1]; + + ASSERT_DIE(first >= 0); + ASSERT_DIE(second >= 0); + + d->data[i] = d->data[i+1] = -1; + + if (first > second) + { + d->data[i] = second; + d->data[i+1] = first; + } + else + { + d->data[i] = first; + d->data[i+1] = second; + } + + rws_write_unlock(&d->rws[i+1]); + rws_write_unlock(&d->rws[i]); + } + } + + return NULL; +} + +static int +t_rwspin(void) +{ + struct rws_test_data d; + + /* Setup an array to sort */ + for (int i=0; i + * + * Can be freely distributed and used under the terms of the GNU GPL. 
+ */ + +#include "lib/birdlib.h" +#include "lib/netindex_private.h" + +#define NETINDEX_INIT_BLOCK_SIZE 128 + +#define NETINDEX_KEY(n) (n)->hash, (n)->addr +#define NETINDEX_NEXT(n) (n)->next +#define NETINDEX_EQ(h,n,i,o) ((h == i) && net_equal(n,o)) +#define NETINDEX_FN(h,n) (h) +#define NETINDEX_ORDER 12 /* Initial */ + +#define NETINDEX_REHASH netindex_rehash +#define NETINDEX_PARAMS /8, *2, 2, 2, 12, 28 + +static void NETINDEX_REHASH(void *_v) { + log(L_TRACE "Netindex rehash: begin"); + netindex_spinhash *v = _v; + int step; + SPINHASH_REHASH_PREPARE(v,NETINDEX,struct netindex,step); + + log(L_TRACE "Netindex rehash: step=%d", step); + if (!step) return; + + if (step > 0) SPINHASH_REHASH_UP(v,NETINDEX,struct netindex,step); + if (step < 0) SPINHASH_REHASH_DOWN(v,NETINDEX,struct netindex,-step); + + log(L_TRACE "Netindex rehash: time to finish"); + SPINHASH_REHASH_FINISH(v,NETINDEX); + log(L_TRACE "Netindex rehash: done"); +} + +static void netindex_hash_cleanup(void *netindex_hash); + +static struct netindex * +net_lock_revive_unlock(netindex_hash *h, struct netindex *i) +{ + if (!i) + return NULL; + + lfuc_lock_revive(&i->uc); + lfuc_unlock(&i->uc, h->cleanup_list, &h->cleanup_event); + return i; +} + +/* + * Index initialization + */ +netindex_hash * +netindex_hash_new(pool *sp, event_list *cleanup_target, u8 type) +{ + DOMAIN(attrs) dom = DOMAIN_NEW_RCU_SYNC(attrs); + LOCK_DOMAIN(attrs, dom); + + pool *p = rp_new(sp, dom.attrs, "Network index"); + + struct netindex_hash_private *nh = mb_allocz(p, sizeof *nh); + nh->lock = dom; + nh->pool = p; + nh->net_type = type; + + nh->slab = net_addr_length[type] ? sl_new(nh->pool, sizeof (struct netindex) + net_addr_length[type]) : NULL; + + SPINHASH_INIT(nh->hash, NETINDEX, nh->pool, cleanup_target); + atomic_store_explicit(&nh->block_size, NETINDEX_INIT_BLOCK_SIZE, memory_order_release); + atomic_store_explicit(&nh->block, + mb_allocz(nh->pool, NETINDEX_INIT_BLOCK_SIZE * sizeof *nh->block), + memory_order_release); + + hmap_init(&nh->id_map, nh->pool, 128); + + nh->cleanup_list = cleanup_target; + nh->cleanup_event = (event) { .hook = netindex_hash_cleanup, nh }; + + UNLOCK_DOMAIN(attrs, dom); + return SKIP_BACK(netindex_hash, priv, nh); +} + +static uint +netindex_hash_cleanup_removed(struct netindex_hash_private *nh, struct netindex * _Atomic *block, struct netindex **removed, uint cnt) +{ + synchronize_rcu(); + + uint kept = 0; + for (uint q = 0; q < cnt; q++) + { + struct netindex *ni = removed[q]; + + /* Now no reader can possibly still have the old pointer, + * unless somebody found it inbetween and ref'd it. */ + if (!lfuc_finished(&ni->uc)) + { + /* Collision, return the netindex back. 
*/ + ASSERT_DIE(NULL == atomic_exchange_explicit(&block[ni->index], ni, memory_order_acq_rel)); + SPINHASH_INSERT(nh->hash, NETINDEX, ni); + kept++; + continue; + } + + /* Now the netindex is definitely obsolete, we can free it */ + hmap_clear(&nh->id_map, ni->index); + + if (nh->slab) + sl_free(ni); + else + mb_free(ni); + } + + return kept; +} + +static void +netindex_hash_cleanup(void *_nh) +{ + struct netindex_hash_private *nh = _nh; + + DOMAIN(attrs) dom = nh->lock; + LOCK_DOMAIN(attrs, dom); + + uint kept = 0; + + uint bs = atomic_load_explicit(&nh->block_size, memory_order_relaxed); + struct netindex * _Atomic *block = atomic_load_explicit(&nh->block, memory_order_relaxed); + +#define REMOVED_MAX 256 + struct netindex *removed[REMOVED_MAX]; + uint removed_cnt = 0; + + for (uint i = 0; i < bs; i++) + { + struct netindex *ni = atomic_load_explicit(&block[i], memory_order_acquire); + if (!ni) + continue; + + /* We may use the acquired netindex pointer as we are + * the only process which deletes them */ + ASSERT_DIE(i == ni->index); + + /* Check finished */ + if (!lfuc_finished(&ni->uc)) + { + kept++; + continue; + } + + /* Looks finished, try dropping */ + ASSERT_DIE(ni == atomic_exchange_explicit(&block[i], NULL, memory_order_acq_rel)); + SPINHASH_REMOVE(nh->hash, NETINDEX, ni); + + /* Store into the removed-block */ + removed[removed_cnt++] = ni; + + /* If removed-block is full, flush it */ + if (removed_cnt == REMOVED_MAX) + { + kept += netindex_hash_cleanup_removed(nh, block, removed, removed_cnt); + removed_cnt = 0; + } + } + + /* Flush remaining netindexes */ + if (removed_cnt) + kept += netindex_hash_cleanup_removed(nh, block, removed, removed_cnt); + + /* Return now unless we're deleted */ + if (kept || !nh->deleted_event) + { + UNLOCK_DOMAIN(attrs, dom); + return; + } + + ev_postpone(&nh->cleanup_event); + + event *e = nh->deleted_event; + event_list *t = nh->deleted_target; + + /* Check cleanliness */ + SPINHASH_WALK(nh->hash, NETINDEX, i) + bug("Stray netindex in deleted hash"); + SPINHASH_WALK_END; + + /* Cleanup the spinhash itself */ + SPINHASH_FREE(nh->hash); + + /* Pool free is enough to drop everything else */ + rp_free(nh->pool); + + /* And only the lock remains */ + UNLOCK_DOMAIN(attrs, dom); + DOMAIN_FREE(attrs, dom); + + /* Notify the requestor */ + ev_send(t, e); +} + +void +netindex_hash_delete(netindex_hash *h, event *e, event_list *t) +{ + NH_LOCK(h, hp); + + hp->deleted_event = e; + hp->deleted_target = t; + + ev_send(hp->cleanup_list, &hp->cleanup_event); +} + +/* + * Private index manipulation + */ +static struct netindex * +net_find_index_fragile(netindex_hash *nh, const net_addr *n) +{ + ASSERT_DIE(n->type == nh->net_type); + + u32 h = net_hash(n); + return SPINHASH_FIND(nh->hash, NETINDEX, h, n); +} + - static _Bool ++static bool +net_validate_index(netindex_hash *h, struct netindex *ni) +{ + struct netindex * _Atomic *block = atomic_load_explicit(&h->block, memory_order_relaxed); + u32 bs = atomic_load_explicit(&h->block_size, memory_order_relaxed); + + ASSERT_DIE(ni->index < bs); + struct netindex *bni = atomic_load_explicit(&block[ni->index], memory_order_acquire); + return (bni == ni); +} + +static struct netindex * +net_new_index_locked(struct netindex_hash_private *hp, const net_addr *n) +{ + ASSERT_DIE(!hp->deleted_event); + + u32 i = hmap_first_zero(&hp->id_map); + hmap_set(&hp->id_map, i); + + struct netindex *ni = hp->slab ? 
+ sl_alloc(hp->slab) : + mb_alloc(hp->pool, n->length + sizeof *ni); + + *ni = (struct netindex) { + .hash = net_hash(n), + .index = i, + }; + net_copy(ni->addr, n); + + SPINHASH_INSERT(hp->hash, NETINDEX, ni); + + struct netindex * _Atomic *block = atomic_load_explicit(&hp->block, memory_order_relaxed); + u32 bs = atomic_load_explicit(&hp->block_size, memory_order_relaxed); + u32 nbs = bs; + while (nbs <= i) + nbs *= 2; + + if (nbs > bs) + { + struct netindex * _Atomic *nb = mb_alloc(hp->pool, bs * 2 * sizeof *nb); + memcpy(nb, block, bs * sizeof *nb); + memset(&nb[bs], 0, (nbs - bs) * sizeof *nb); + + ASSERT_DIE(block == atomic_exchange_explicit(&hp->block, nb, memory_order_acq_rel)); + ASSERT_DIE(bs == atomic_exchange_explicit(&hp->block_size, nbs, memory_order_acq_rel)); + synchronize_rcu(); + + mb_free(block); + block = nb; + + hp->block_epoch++; + } + + ASSERT_DIE(i < nbs); + atomic_store_explicit(&block[i], ni, memory_order_release); + + return ni; +} + + +/* + * Public entry points + */ + +void net_lock_index(netindex_hash *h UNUSED, struct netindex *i) +{ +// log(L_TRACE "Lock index %p", i); + lfuc_lock(&i->uc); +} + +void net_unlock_index(netindex_hash *h, struct netindex *i) +{ +// log(L_TRACE "Unlock index %p", i); + lfuc_unlock(&i->uc, h->cleanup_list, &h->cleanup_event); +} + +struct netindex * +net_find_index(netindex_hash *h, const net_addr *n) +{ + RCU_ANCHOR(u); + struct netindex *ni = net_find_index_fragile(h, n); + return (ni && net_validate_index(h, ni)) ? net_lock_revive_unlock(h, ni) : NULL; +} + +struct netindex * +net_get_index(netindex_hash *h, const net_addr *n) +{ + struct netindex *ni = net_find_index(h, n); + if (ni) return ni; + + NH_LOCK(h, hp); + + /* Somebody may have added one inbetween */ + return net_lock_revive_unlock(h, + (net_find_index_fragile(h, n) ?: + net_new_index_locked(hp, n))); +} + +struct netindex net_index_out_of_range; + +struct netindex * +net_resolve_index(netindex_hash *h, u32 i) +{ + RCU_ANCHOR(u); + + struct netindex * _Atomic *block = atomic_load_explicit(&h->block, memory_order_relaxed); + u32 bs = atomic_load_explicit(&h->block_size, memory_order_relaxed); + + if (i >= bs) + return &net_index_out_of_range; + + struct netindex *ni = atomic_load_explicit(&block[i], memory_order_acquire); + if (ni == NULL) + return NULL; + + return net_lock_revive_unlock(h, ni); +} diff --cc lib/rcu.c index 212d166a8,000000000..25c575f1f mode 100644,000000..100644 --- a/lib/rcu.c +++ b/lib/rcu.c @@@ -1,110 -1,0 +1,110 @@@ +/* + * BIRD Library -- Read-Copy-Update Basic Operations + * + * (c) 2021 Maria Matejka + * (c) 2021 CZ.NIC z.s.p.o. + * + * Can be freely distributed and used under the terms of the GNU GPL. + * Note: all the relevant patents shall be expired. + * + * Using the Supplementary Material for User-Level Implementations of Read-Copy-Update + * by Matthieu Desnoyers, Paul E. McKenney, Alan S. Stern, Michel R. 
Dagenais and Jonathan Walpole + * obtained from https://www.efficios.com/pub/rcu/urcu-supp-accepted.pdf + */ + +#include "lib/rcu.h" +#include "lib/io-loop.h" +#include "lib/locking.h" + +_Atomic u64 rcu_global_phase = RCU_GP_PHASE; +_Thread_local struct rcu_thread this_rcu_thread; +_Thread_local uint rcu_blocked; + +static struct rcu_thread * _Atomic rcu_thread_list = NULL; + +static _Atomic uint rcu_thread_spinlock = 0; + +static int +rcu_critical(struct rcu_thread *t, u64 phase) +{ + uint val = atomic_load_explicit(&t->ctl, memory_order_acquire); + return + (val & RCU_NEST_MASK) /* Active */ + && ((val & ~RCU_NEST_MASK) <= phase); /* In an older phase */ +} + +void +synchronize_rcu(void) +{ + if (!rcu_blocked && (last_locked > &locking_stack.meta)) + bug("Forbidden to synchronize RCU unless an appropriate lock is taken"); + + /* Increment phase */ + u64 phase = atomic_fetch_add_explicit(&rcu_global_phase, RCU_GP_PHASE, memory_order_acq_rel); + + while (1) { + /* Spinlock */ + while (atomic_exchange_explicit(&rcu_thread_spinlock, 1, memory_order_acq_rel)) + birdloop_yield(); + + /* Check all threads */ - _Bool critical = 0; ++ bool critical = 0; + for (struct rcu_thread * _Atomic *tp = &rcu_thread_list, *t; + t = atomic_load_explicit(tp, memory_order_acquire); + tp = &t->next) + /* Found a critical */ + if (critical = rcu_critical(t, phase)) + break; + + /* Unlock */ + ASSERT_DIE(atomic_exchange_explicit(&rcu_thread_spinlock, 0, memory_order_acq_rel)); + + /* Done if no critical */ + if (!critical) + return; + + /* Wait and retry if critical */ + birdloop_yield(); + } +} + +void +rcu_thread_start(void) +{ + /* Insert this thread to the thread list, no spinlock is needed */ + struct rcu_thread *next = atomic_load_explicit(&rcu_thread_list, memory_order_acquire); + do atomic_store_explicit(&this_rcu_thread.next, next, memory_order_relaxed); + while (!atomic_compare_exchange_strong_explicit( + &rcu_thread_list, &next, &this_rcu_thread, + memory_order_acq_rel, memory_order_acquire)); +} + +void +rcu_thread_stop(void) +{ + /* Spinlock */ + while (atomic_exchange_explicit(&rcu_thread_spinlock, 1, memory_order_acq_rel)) + birdloop_yield(); + + /* Find this thread */ + for (struct rcu_thread * _Atomic *tp = &rcu_thread_list, *t; + t = atomic_load_explicit(tp, memory_order_acquire); + tp = &t->next) + if (t == &this_rcu_thread) + { + /* Remove this thread */ + atomic_store_explicit(tp, atomic_load_explicit(&t->next, memory_order_acquire), memory_order_release); + + /* Unlock and go */ + ASSERT_DIE(atomic_exchange_explicit(&rcu_thread_spinlock, 0, memory_order_acq_rel)); + return; + } + + bug("Failed to find a stopped rcu thread"); +} + +void +rcu_init(void) +{ + rcu_thread_start(); +} diff --cc lib/rcu.h index 214a568b1,000000000..a92440771 mode 100644,000000..100644 --- a/lib/rcu.h +++ b/lib/rcu.h @@@ -1,72 -1,0 +1,72 @@@ +/* + * BIRD Library -- Read-Copy-Update Basic Operations + * + * (c) 2021 Maria Matejka + * (c) 2021 CZ.NIC z.s.p.o. + * + * Can be freely distributed and used under the terms of the GNU GPL. + * Note: all the relevant patents shall be expired. 
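+ *
+ * The read side is used as a plain nestable bracket (illustrative sketch;
+ * see lfjour_get() in lib/lockfree.c in this diff for a real caller):
+ *
+ *   rcu_read_lock();
+ *   ...                  // read shared data; no waiting, no blocking here
+ *   rcu_read_unlock();
+ *
+ *   // writers retire old data only after synchronize_rcu() returns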
+ */ + +#ifndef _BIRD_RCU_H_ +#define _BIRD_RCU_H_ + +#include "lib/birdlib.h" +#include "lib/lists.h" +#include + +#define RCU_GP_PHASE 0x100 +#define RCU_NEST_MASK (RCU_GP_PHASE-1) +#define RCU_NEST_CNT 1 + +extern _Atomic u64 rcu_global_phase; + +struct rcu_thread { + struct rcu_thread * _Atomic next; + u64 local_ctl; + _Atomic u64 ctl; +}; + +extern _Thread_local struct rcu_thread this_rcu_thread; +extern _Thread_local uint rcu_blocked; + +static inline void rcu_read_lock(void) +{ + /* Increment the nesting counter */ + atomic_store_explicit(&this_rcu_thread.ctl, (this_rcu_thread.local_ctl += RCU_NEST_CNT), memory_order_release); + + /* Just nested */ + u64 local_nest = this_rcu_thread.local_ctl & RCU_NEST_MASK; + if (local_nest > RCU_NEST_CNT) + return; + + ASSUME(local_nest == RCU_NEST_CNT); + + /* Update the phase */ + u64 new = atomic_load_explicit(&rcu_global_phase, memory_order_acquire) + RCU_NEST_CNT; + atomic_store_explicit(&this_rcu_thread.ctl, new, memory_order_release); + this_rcu_thread.local_ctl = new; +} + +static inline void rcu_read_unlock(void) +{ + /* Just decrement the nesting counter; when unlocked, nobody cares */ + atomic_fetch_sub_explicit(&this_rcu_thread.ctl, RCU_NEST_CNT, memory_order_acq_rel); + this_rcu_thread.local_ctl--; +} + - static inline _Bool rcu_read_active(void) ++static inline bool rcu_read_active(void) +{ + return !!(this_rcu_thread.local_ctl & RCU_NEST_MASK); +} + +void synchronize_rcu(void); + +/* Registering and unregistering a birdloop. To be called from birdloop implementation */ +void rcu_thread_start(void); +void rcu_thread_stop(void); + +/* Run this from resource init */ +void rcu_init(void); + +#endif diff --cc lib/rcu_test.c index 7cc697104,000000000..5b89c7831 mode 100644,000000..100644 --- a/lib/rcu_test.c +++ b/lib/rcu_test.c @@@ -1,202 -1,0 +1,202 @@@ +/* + * BIRD Library -- Auto storage attribute cleanup test + * + * (c) 2023 Maria Matejka + * (c) 2023 CZ.NIC z.s.p.o. + * + * Can be freely distributed and used under the terms of the GNU GPL. 
+ */ + +#include "test/birdtest.h" + +#include "lib/rcu.h" +#include "lib/io-loop.h" + +#include + +#define WRITERS 3 +#define READERS 28 + +#define WRITER_ROUNDS 20 + +static struct block { + struct block * _Atomic next; + u64 value; +} ball[WRITERS][WRITER_ROUNDS]; + +static struct block *_Atomic bin; +static _Atomic uint seen = 0; + +static void * +t_rcu_basic_reader(void *_ UNUSED) +{ + rcu_thread_start(); + + while (atomic_load_explicit(&bin, memory_order_acquire) == NULL) + birdloop_yield(); + + atomic_fetch_add_explicit(&seen, 1, memory_order_release); + + while (atomic_load_explicit(&bin, memory_order_acquire)) + { + rcu_read_lock(); + + uint mod = 0; + for (struct block * _Atomic *bp = &bin, *b; + b = atomic_load_explicit(bp, memory_order_acquire); + bp = &b->next) + { + uint val = b->value % WRITERS + 1; + ASSERT_DIE(val > mod); + mod = val; + } + + ASSERT_DIE(mod <= WRITERS); + + rcu_read_unlock(); + } + + rcu_thread_stop(); + return NULL; +} + +static _Atomic uint spinlock = 0; + +static inline void +spin_lock(void) +{ + while (atomic_exchange_explicit(&spinlock, 1, memory_order_acq_rel)) + birdloop_yield(); +} + +static inline void +spin_unlock(void) +{ + ASSERT_DIE(atomic_exchange_explicit(&spinlock, 0, memory_order_acq_rel)); +} + +static void * +t_rcu_basic_writer(void *order_ptr) +{ + rcu_thread_start(); + + uint order = (uintptr_t) order_ptr; + struct block *cur = &ball[order][0]; + + /* Insert the object */ + spin_lock(); + for (struct block * _Atomic *bp = &bin; bp; ) + { + struct block *b = atomic_load_explicit(bp, memory_order_acquire); + if (b && ((b->value % WRITERS) < order)) + bp = &b->next; + else + { + ASSERT_DIE(cur->value == 0xbabababababababa); + cur->value = order; + atomic_store_explicit(&cur->next, b, memory_order_relaxed); + atomic_store_explicit(bp, cur, memory_order_release); + break; + } + } + spin_unlock(); + + /* Wait for readers */ + while (atomic_load_explicit(&seen, memory_order_acquire) != READERS) + birdloop_yield(); + + /* Update the object */ + for (uint i=1; ivalue == 0xbabababababababa); + next->value = order + i*WRITERS; + + spin_lock(); - _Bool seen = 0; ++ bool seen = 0; + for (struct block * _Atomic *bp = &bin, *b; + b = atomic_load_explicit(bp, memory_order_acquire); + bp = &b->next) + if (b == cur) + { + struct block *link = atomic_load_explicit(&b->next, memory_order_relaxed); + atomic_store_explicit(&next->next, link, memory_order_relaxed); + atomic_store_explicit(bp, next, memory_order_release); + seen = 1; + break; + } + ASSERT_DIE(seen); + spin_unlock(); + + synchronize_rcu(); + + ASSERT_DIE(cur->value + WRITERS == next->value); + cur->value = 0xd4d4d4d4d4d4d4d4; + atomic_store_explicit(&cur->next, ((void *) 0xd8d8d8d8d8d8d8d8), memory_order_relaxed); + + cur = next; + } + + /* Remove the object */ + spin_lock(); - _Bool seen = 0; ++ bool seen = 0; + for (struct block * _Atomic *bp = &bin, *b; + b = atomic_load_explicit(bp, memory_order_acquire); + bp = &b->next) + if (b == cur) + { + struct block *link = atomic_load_explicit(&b->next, memory_order_relaxed); + atomic_store_explicit(bp, link, memory_order_relaxed); + seen = 1; + break; + } + ASSERT_DIE(seen); + spin_unlock(); + + synchronize_rcu(); + + cur->value = 0xd4d4d4d4d4d4d4d4; + atomic_store_explicit(&cur->next, ((void *) 0xd8d8d8d8d8d8d8d8), memory_order_relaxed); + + rcu_thread_stop(); + return NULL; +} + +static int +t_rcu_basic(void) +{ + memset(ball, 0xba, sizeof ball); + + pthread_t readers[READERS]; + pthread_t writers[WRITERS]; + + for (uint i=0; i + * (c) 2022 
Maria Matejka + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _BIRD_LIB_ROUTE_H_ +#define _BIRD_LIB_ROUTE_H_ + +#undef RT_SOURCE_DEBUG + +#include "lib/type.h" +#include "lib/rcu.h" +#include "lib/hash.h" +#include "lib/event.h" +#include "lib/lockfree.h" + +struct network; +struct proto; +struct cli; +struct rtable_private; +struct rte_storage; + +#define RTE_IN_TABLE_WRITABLE \ + byte pflags; /* Protocol-specific flags; may change in-table (!) */ \ + byte flags; /* Table-specific flags */ \ + u8 stale_cycle; /* Auxiliary value for route refresh; may change in-table (!) */ \ + +typedef struct rte { + RTE_IN_TABLE_WRITABLE; + u8 generation; /* If this route import is based on other previously exported route, + this value should be 1 + MAX(generation of the parent routes). + Otherwise the route is independent and this value is zero. */ + u32 id; /* Table specific route id */ + struct ea_list *attrs; /* Attributes of this route */ + const net_addr *net; /* Network this RTE belongs to */ + struct rte_src *src; /* Route source that created the route */ + struct rt_import_hook *sender; /* Import hook used to send the route to the routing table */ + btime lastmod; /* Last modified (set by table) */ +} rte; + +#define REF_FILTERED 2 /* Route is rejected by import filter */ +#define REF_OBSOLETE 16 /* Route is obsolete, pending propagation */ +#define REF_PENDING 32 /* Route has not propagated completely yet */ + +/* Route is valid for propagation (may depend on other flags in the future), accepts NULL */ +static inline int rte_is_valid(const rte *r) { return r && !(r->flags & REF_FILTERED); } + +/* Route just has REF_FILTERED flag */ +static inline int rte_is_filtered(const rte *r) { return !!(r->flags & REF_FILTERED); } + +/* Strip the route of the table-specific values */ +static inline rte rte_init_from(const rte *r) +{ + return (rte) { + .attrs = r->attrs, + .net = r->net, + .src = r->src, + }; +} + +int rte_same(const rte *, const rte *); + +struct rte_src { + struct rte_src *next; /* Hash chain */ + struct rte_owner *owner; /* Route source owner */ + u64 private_id; /* Private ID, assigned by the protocol */ + u32 global_id; /* Globally unique ID of the source */ + struct lfuc uc; /* Use count */ +}; + +struct rte_owner_class { + void (*get_route_info)(const rte *, byte *buf); /* Get route information (for `show route' command) */ + int (*rte_better)(const rte *, const rte *); + int (*rte_mergable)(const rte *, const rte *); + u32 (*rte_igp_metric)(const rte *); +}; + +struct rte_owner { + struct rte_owner_class *class; + int (*rte_recalculate)(struct rtable_private *, struct network *, struct rte_storage *new, struct rte_storage *, struct rte_storage *); + HASH(struct rte_src) hash; + const char *name; + u32 hash_key; + u32 uc; + u32 debug; + event_list *list; + event *prune; + event *stop; +}; + +extern DOMAIN(attrs) attrs_domain; + +#define RTA_LOCK LOCK_DOMAIN(attrs, attrs_domain) +#define RTA_UNLOCK UNLOCK_DOMAIN(attrs, attrs_domain) + +#define RTE_SRC_PU_SHIFT 44 +#define RTE_SRC_IN_PROGRESS (1ULL << RTE_SRC_PU_SHIFT) + +/* Get a route source. This also locks the source, therefore the caller has to + * unlock the source after the route has been propagated. 
*/ +struct rte_src *rt_get_source_o(struct rte_owner *o, u32 id); +#define rt_get_source(p, id) rt_get_source_o(&(p)->sources, (id)) + +struct rte_src *rt_find_source_global(u32 id); + +#ifdef RT_SOURCE_DEBUG +#define rt_lock_source _rt_lock_source_internal +#define rt_unlock_source _rt_unlock_source_internal +#endif + +static inline void rt_lock_source(struct rte_src *src) +{ + lfuc_lock(&src->uc); +} + +static inline void rt_unlock_source(struct rte_src *src) +{ + lfuc_unlock(&src->uc, src->owner->list, src->owner->prune); +} + +#ifdef RT_SOURCE_DEBUG +#undef rt_lock_source +#undef rt_unlock_source + +#define rt_lock_source(x) ( log(L_INFO "Lock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_lock_source_internal(x) ) +#define rt_unlock_source(x) ( log(L_INFO "Unlock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_unlock_source_internal(x) ) +#endif + +void rt_init_sources(struct rte_owner *, const char *name, event_list *list); +void rt_destroy_sources(struct rte_owner *, event *); + +void rt_dump_sources(struct rte_owner *); + +/* + * Route Attributes + * + * Beware: All standard BGP attributes must be represented here instead + * of making them local to the route. This is needed to ensure proper + * construction of BGP route attribute lists. + */ + +/* Nexthop structure */ +struct nexthop { + ip_addr gw; /* Next hop */ + struct iface *iface; /* Outgoing interface */ + byte flags; + byte weight; + byte labels; /* Number of all labels */ + u32 label[0]; +}; + +/* For packing one into eattrs */ +struct nexthop_adata { + struct adata ad; + /* There is either a set of nexthops or a special destination (RTD_*) */ + union { + struct nexthop nh; + uint dest; + }; +}; + +/* For MPLS label stack generation */ +struct nexthop_adata_mpls { + struct nexthop_adata nhad; + u32 label_space[MPLS_MAX_LABEL_STACK]; +}; + +#define NEXTHOP_DEST_SIZE (OFFSETOF(struct nexthop_adata, dest) + sizeof(uint) - OFFSETOF(struct adata, data)) +#define NEXTHOP_DEST_LITERAL(x) ((struct nexthop_adata) { \ + .ad.length = NEXTHOP_DEST_SIZE, .dest = (x), }) + +#define RNF_ONLINK 0x1 /* Gateway is onlink regardless of IP ranges */ + + +#define RTS_STATIC 1 /* Normal static route */ +#define RTS_INHERIT 2 /* Route inherited from kernel */ +#define RTS_DEVICE 3 /* Device route */ +#define RTS_STATIC_DEVICE 4 /* Static device route */ +#define RTS_REDIRECT 5 /* Learned via redirect */ +#define RTS_RIP 6 /* RIP route */ +#define RTS_OSPF 7 /* OSPF route */ +#define RTS_OSPF_IA 8 /* OSPF inter-area route */ +#define RTS_OSPF_EXT1 9 /* OSPF external route type 1 */ +#define RTS_OSPF_EXT2 10 /* OSPF external route type 2 */ +#define RTS_BGP 11 /* BGP route */ +#define RTS_PIPE 12 /* Inter-table wormhole */ +#define RTS_BABEL 13 /* Babel route */ +#define RTS_RPKI 14 /* Route Origin Authorization */ +#define RTS_PERF 15 /* Perf checker */ +#define RTS_L3VPN 16 /* MPLS L3VPN */ +#define RTS_AGGREGATED 17 /* Aggregated route */ +#define RTS_MAX 18 + +#define RTD_NONE 0 /* Undefined next hop */ +#define RTD_UNICAST 1 /* A standard next hop */ +#define RTD_BLACKHOLE 2 /* Silently drop packets */ +#define RTD_UNREACHABLE 3 /* Reject as unreachable */ +#define RTD_PROHIBIT 4 /* Administratively prohibited */ +#define RTD_MAX 5 + +extern const char * rta_dest_names[RTD_MAX]; + +static inline const char *rta_dest_name(uint n) +{ return (n < RTD_MAX) ? 
rta_dest_names[n] : "???"; } + + +/* + * Extended Route Attributes + */ + +typedef struct eattr { + word id; /* EA_CODE(PROTOCOL_..., protocol-dependent ID) */ + byte flags; /* Protocol-dependent flags */ + byte type; /* Attribute type */ + byte rfu:5; + byte originated:1; /* The attribute has originated locally */ + byte fresh:1; /* An uncached attribute (e.g. modified in export filter) */ + byte undef:1; /* Explicitly undefined */ + + PADDING(unused, 3, 3); + + union bval u; +} eattr; + + +#define EA_CODE_MASK 0xffff +#define EA_ALLOW_UNDEF 0x10000 /* ea_find: allow EAF_TYPE_UNDEF */ +#define EA_BIT(n) ((n) << 24) /* Used in bitfield accessors */ +#define EA_BIT_GET(ea) ((ea) >> 24) + +typedef struct ea_list { + struct ea_list *next; /* In case we have an override list */ + byte flags; /* Flags: EALF_... */ + byte stored:5; /* enum ea_stored */ + byte rfu:3; + word count; /* Number of attributes */ + eattr attrs[0]; /* Attribute definitions themselves */ +} ea_list; + +enum ea_stored { + EALS_NONE = 0, /* This is a temporary ea_list */ + EALS_PREIMPORT = 1, /* State when route entered rte_update() */ + EALS_FILTERED = 2, /* State after filters */ + EALS_IN_TABLE = 3, /* State in table */ + EALS_KEY = 4, /* EA list used as key */ + EALS_CUSTOM = 0x10, /* OR this with custom values */ + EALS_MAX = 0x20, +}; + +struct ea_storage { + struct ea_storage *next_hash; /* Next in hash chain */ + _Atomic u64 uc; /* Use count */ + u32 hash_key; /* List hash */ + PADDING(unused, 0, 4); /* Sorry, we need u64 for the usecount */ + ea_list l[0]; /* The list itself */ +}; + +#define EALF_SORTED 1 /* Attributes are sorted by code */ +#define EALF_BISECT 2 /* Use interval bisection for searching */ +#define EALF_HUGE 8 /* List is too big to fit into slab */ + +struct ea_class { +#define EA_CLASS_INSIDE \ + const char *name; /* Name (both print and filter) */ \ + struct symbol *sym; /* Symbol to export to configs */ \ + uint id; /* Autoassigned attribute ID */ \ + uint uc; /* Reference count */ \ + btype type; /* Data type ID */ \ + u16 flags; /* Protocol-dependent flags */ \ + uint readonly:1; /* This attribute can't be changed by filters */ \ + uint conf:1; /* Requested by config */ \ + uint hidden:1; /* Technical attribute, do not show, do not expose to filters */ \ + void (*format)(const eattr *ea, byte *buf, uint size); \ + void (*stored)(const eattr *ea); /* When stored into global hash */ \ + void (*freed)(const eattr *ea); /* When released from global hash */ \ + + EA_CLASS_INSIDE; +}; + +struct ea_class_ref { + resource r; + struct ea_class *class; +}; + +void ea_register_init(struct ea_class *); +struct ea_class_ref *ea_register_alloc(pool *, struct ea_class); +struct ea_class_ref *ea_ref_class(pool *, struct ea_class *); /* Reference for an attribute alias */ + +#define EA_REGISTER_ALL_HELPER(x) ea_register_init(x); +#define EA_REGISTER_ALL(...) 
MACRO_FOREACH(EA_REGISTER_ALL_HELPER, __VA_ARGS__) + +struct ea_class *ea_class_find_by_id(uint id); +struct ea_class *ea_class_find_by_name(const char *name); +static inline struct ea_class *ea_class_self(struct ea_class *self) { return self; } +#define ea_class_find(_arg) _Generic((_arg), \ + uint: ea_class_find_by_id, \ + word: ea_class_find_by_id, \ + char *: ea_class_find_by_name, \ + const char *: ea_class_find_by_name, \ + struct ea_class *: ea_class_self)(_arg) + +struct ea_walk_state { + ea_list *eattrs; /* Ccurrent ea_list, initially set by caller */ + eattr *ea; /* Current eattr, initially NULL */ + u32 visited[4]; /* Bitfield, limiting max to 128 */ +}; + +#define ea_find(_l, _arg) _Generic((_arg), uint: ea_find_by_id, struct ea_class *: ea_find_by_class, char *: ea_find_by_name)(_l, _arg) +eattr *ea_find_by_id(ea_list *, unsigned ea); +static inline eattr *ea_find_by_class(ea_list *l, const struct ea_class *def) +{ return ea_find_by_id(l, def->id); } +static inline eattr *ea_find_by_name(ea_list *l, const char *name) +{ + const struct ea_class *def = ea_class_find_by_name(name); + return def ? ea_find_by_class(l, def) : NULL; +} + +#define ea_get_int(_l, _ident, _def) ({ \ + struct ea_class *cls = ea_class_find((_ident)); \ + ASSERT_DIE(cls->type & EAF_EMBEDDED); \ + const eattr *ea = ea_find((_l), cls->id); \ + (ea ? ea->u.data : (_def)); \ + }) + +#define ea_get_ip(_l, _ident, _def) ({ \ + struct ea_class *cls = ea_class_find((_ident)); \ + ASSERT_DIE(cls->type == T_IP); \ + const eattr *ea = ea_find((_l), cls->id); \ + (ea ? *((const ip_addr *) ea->u.ptr->data) : (_def)); \ + }) + +#define ea_get_adata(_l, _ident) ({ \ + struct ea_class *cls = ea_class_find((_ident)); \ + ASSERT_DIE(!(cls->type & EAF_EMBEDDED)); \ + const eattr *ea = ea_find((_l), cls->id); \ + (ea ? ea->u.ptr : &null_adata); \ + }) + +eattr *ea_walk(struct ea_walk_state *s, uint id, uint max); +void ea_dump(ea_list *); +int ea_same(ea_list *x, ea_list *y); /* Test whether two ea_lists are identical */ +uint ea_hash(ea_list *e); /* Calculate attributes hash value */ +ea_list *ea_append(ea_list *to, ea_list *what); +void ea_format_bitfield(const struct eattr *a, byte *buf, int bufsize, const char **names, int min, int max); + +/* Normalize ea_list; allocates the result from tmp_linpool */ +ea_list *ea_normalize(ea_list *e, u32 upto); + +uint ea_list_size(ea_list *); +void ea_list_copy(ea_list *dest, ea_list *src, uint size); + +#define EA_LOCAL_LIST(N) struct { ea_list l; eattr a[N]; } + +#define EA_LITERAL_EMBEDDED(_class, _flags, _val) ({ \ + btype _type = (_class)->type; \ + ASSERT_DIE(_type & EAF_EMBEDDED); \ + EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.i = _val); \ + }) + +#define EA_LITERAL_STORE_ADATA(_class, _flags, _buf, _len) ({ \ + btype _type = (_class)->type; \ + ASSERT_DIE(!(_type & EAF_EMBEDDED)); \ + EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.ad = tmp_store_adata((_buf), (_len))); \ + }) + +#define EA_LITERAL_DIRECT_ADATA(_class, _flags, _adata) ({ \ + btype _type = (_class)->type; \ + ASSERT_DIE(!(_type & EAF_EMBEDDED)); \ + EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.ad = _adata); \ + }) + +#define EA_LITERAL_GENERIC(_id, _type, _flags, ...) 
\ + ((eattr) { .id = _id, .type = _type, .flags = _flags, __VA_ARGS__ }) + +static inline eattr * +ea_set_attr(ea_list **to, eattr a) +{ + if (!a.id) + bug("You have forgotten to register your EA class"); + + EA_LOCAL_LIST(1) *ea = tmp_alloc(sizeof(*ea)); + *ea = (typeof(*ea)) { + .l.flags = EALF_SORTED, + .l.count = 1, + .l.next = *to, + .a[0] = a, + }; + + *to = &ea->l; + return &ea->a[0]; +} + +static inline void - ea_unset_attr(ea_list **to, _Bool local, const struct ea_class *def) ++ea_unset_attr(ea_list **to, bool local, const struct ea_class *def) +{ + ea_set_attr(to, EA_LITERAL_GENERIC(def->id, 0, 0, + .fresh = local, .originated = local, .undef = 1)); +} + +static inline void +ea_set_attr_u32(ea_list **to, const struct ea_class *def, uint flags, u64 data) +{ ea_set_attr(to, EA_LITERAL_EMBEDDED(def, flags, data)); } + +static inline void +ea_set_attr_data(ea_list **to, const struct ea_class *def, uint flags, const void *data, uint len) +{ ea_set_attr(to, EA_LITERAL_STORE_ADATA(def, flags, data, len)); } + +static inline void +ea_copy_attr(ea_list **to, ea_list *from, const struct ea_class *def) +{ + eattr *e = ea_find_by_class(from, def); + if (e) + if (e->type & EAF_EMBEDDED) + ea_set_attr_u32(to, def, e->flags, e->u.data); + else + ea_set_attr_data(to, def, e->flags, e->u.ptr->data, e->u.ptr->length); + else + ea_unset_attr(to, 0, def); +} + +/* + * Common route attributes + */ + +/* Preference: first-order comparison */ +extern struct ea_class ea_gen_preference; +static inline u32 rt_get_preference(const rte *rt) +{ return ea_get_int(rt->attrs, &ea_gen_preference, 0); } + +/* IGP metric: second-order comparison */ +extern struct ea_class ea_gen_igp_metric; +u32 rt_get_igp_metric(const rte *rt); +#define IGP_METRIC_UNKNOWN 0x80000000 /* Default igp_metric used when no other + protocol-specific metric is availabe */ + +/* From: Advertising router */ +extern struct ea_class ea_gen_from; + + +/* MPLS Label, Policy and Class */ +extern struct ea_class ea_gen_mpls_label, + ea_gen_mpls_policy, ea_gen_mpls_class; + + +/* Source: An old method to devise the route source protocol and kind. + * To be superseded in a near future by something more informative. */ +extern struct ea_class ea_gen_source; +static inline u32 rt_get_source_attr(const rte *rt) +{ return ea_get_int(rt->attrs, &ea_gen_source, 0); } + +/* Flowspec validation result */ +enum flowspec_valid { + FLOWSPEC_UNKNOWN = 0, + FLOWSPEC_VALID = 1, + FLOWSPEC_INVALID = 2, + FLOWSPEC__MAX, +}; + +extern const char * flowspec_valid_names[FLOWSPEC__MAX]; +static inline const char *flowspec_valid_name(enum flowspec_valid v) +{ return (v < FLOWSPEC__MAX) ? 
flowspec_valid_names[v] : "???"; } + +extern struct ea_class ea_gen_flowspec_valid; +static inline enum flowspec_valid rt_get_flowspec_valid(const rte *rt) +{ return ea_get_int(rt->attrs, &ea_gen_flowspec_valid, FLOWSPEC_UNKNOWN); } + +/* Next hop: For now, stored as adata */ +extern struct ea_class ea_gen_nexthop; + +static inline void ea_set_dest(struct ea_list **to, uint flags, uint dest) +{ + struct nexthop_adata nhad = NEXTHOP_DEST_LITERAL(dest); + ea_set_attr_data(to, &ea_gen_nexthop, flags, &nhad.ad.data, nhad.ad.length); +} + +/* Next hop structures */ + +#define NEXTHOP_ALIGNMENT (_Alignof(struct nexthop)) +#define NEXTHOP_MAX_SIZE (sizeof(struct nexthop) + sizeof(u32)*MPLS_MAX_LABEL_STACK) +#define NEXTHOP_SIZE(_nh) NEXTHOP_SIZE_CNT(((_nh)->labels)) +#define NEXTHOP_SIZE_CNT(cnt) BIRD_ALIGN((sizeof(struct nexthop) + sizeof(u32) * (cnt)), NEXTHOP_ALIGNMENT) +#define nexthop_size(nh) NEXTHOP_SIZE((nh)) + +#define NEXTHOP_NEXT(_nh) ((void *) (_nh) + NEXTHOP_SIZE(_nh)) +#define NEXTHOP_END(_nhad) ((_nhad)->ad.data + (_nhad)->ad.length) +#define NEXTHOP_VALID(_nh, _nhad) ((void *) (_nh) < (void *) NEXTHOP_END(_nhad)) +#define NEXTHOP_ONE(_nhad) (NEXTHOP_NEXT(&(_nhad)->nh) == NEXTHOP_END(_nhad)) + +#define NEXTHOP_WALK(_iter, _nhad) for ( \ + struct nexthop *_iter = &(_nhad)->nh; \ + (void *) _iter < (void *) NEXTHOP_END(_nhad); \ + _iter = NEXTHOP_NEXT(_iter)) + + +static inline int nexthop_same(struct nexthop_adata *x, struct nexthop_adata *y) +{ return adata_same(&x->ad, &y->ad); } +struct nexthop_adata *nexthop_merge(struct nexthop_adata *x, struct nexthop_adata *y, int max, linpool *lp); +struct nexthop_adata *nexthop_sort(struct nexthop_adata *x, linpool *lp); +int nexthop_is_sorted(struct nexthop_adata *x); + +#define NEXTHOP_IS_REACHABLE(nhad) ((nhad)->ad.length > NEXTHOP_DEST_SIZE) + +static inline struct nexthop_adata * +rte_get_nexthops(rte *r) +{ + eattr *nhea = ea_find(r->attrs, &ea_gen_nexthop); + return nhea ? SKIP_BACK(struct nexthop_adata, ad, nhea->u.ptr) : NULL; +} + +/* Route has regular, reachable nexthop (i.e. not RTD_UNREACHABLE and like) */ +static inline int rte_is_reachable(rte *r) +{ + struct nexthop_adata *nhad = rte_get_nexthops(r); + return nhad && NEXTHOP_IS_REACHABLE(nhad); +} + +static inline int nhea_dest(eattr *nhea) +{ + if (!nhea) + return RTD_NONE; + + struct nexthop_adata *nhad = nhea ? 
(struct nexthop_adata *) nhea->u.ptr : NULL; + if (NEXTHOP_IS_REACHABLE(nhad)) + return RTD_UNICAST; + else + return nhad->dest; +} + +static inline int rte_dest(const rte *r) +{ + return nhea_dest(ea_find(r->attrs, &ea_gen_nexthop)); +} + +void rta_init(void); + +ea_list *ea_lookup_slow(ea_list *r, u32 squash_upto, enum ea_stored oid); + +static inline struct ea_storage *ea_get_storage(ea_list *r) +{ + ASSERT_DIE(r->stored); + return SKIP_BACK(struct ea_storage, l[0], r); +} + +static inline ea_list *ea_ref(ea_list *r) +{ + ASSERT_DIE(0 < atomic_fetch_add_explicit(&ea_get_storage(r)->uc, 1, memory_order_acq_rel)); + return r; +} + +static inline ea_list *ea_lookup(ea_list *r, u32 squash_upto, enum ea_stored oid) +{ + ASSERT_DIE(oid); + if ((r->stored == oid) || BIT32_TEST(&squash_upto, r->stored)) + return ea_ref(r); + else + return ea_lookup_slow(r, squash_upto, oid); +} + +struct ea_free_deferred { + struct deferred_call dc; + ea_list *attrs; +}; + +void ea_free_deferred(struct deferred_call *dc); + +static inline ea_list *ea_free_later(ea_list *r) +{ + if (!r) + return NULL; + + struct ea_free_deferred efd = { + .dc.hook = ea_free_deferred, + .attrs = r, + }; + + defer_call(&efd.dc, sizeof efd); + return r; +} + +#define ea_free ea_free_later + +static inline ea_list *ea_lookup_tmp(ea_list *r, u32 squash_upto, enum ea_stored oid) +{ + return ea_free_later(ea_lookup(r, squash_upto, oid)); +} + +static inline ea_list *ea_ref_tmp(ea_list *r) +{ + ASSERT_DIE(r->stored); + return ea_free_later(ea_ref(r)); +} + +static inline ea_list *ea_strip_to(ea_list *r, u32 strip_to) +{ + ASSERT_DIE(strip_to); + while (r && !BIT32_TEST(&strip_to, r->stored)) + r = r->next; + + return r; +} + +void ea_dump(ea_list *); +void ea_dump_all(void); +void ea_show_list(struct cli *, ea_list *); + +#endif diff --cc lib/socket.h index 4c80b96a4,231c10d86..302c8e1e9 --- a/lib/socket.h +++ b/lib/socket.h @@@ -86,12 -84,8 +86,12 @@@ typedef struct birdsock sock *sock_new(pool *); /* Allocate new socket */ #define sk_new(X) sock_new(X) /* Wrapper to avoid name collision with OpenSSL */ -int sk_open(sock *); /* Open socket */ +int sk_open(sock *, struct birdloop *); /* Open socket */ +void sk_reloop(sock *, struct birdloop *); /* Move socket to another loop. Both loops must be locked. */ +static inline void sk_close(sock *s) { rfree(&s->r); } /* Explicitly close socket */ + int sk_rx_ready(sock *s); - _Bool sk_tx_pending(sock *s); ++bool sk_tx_pending(sock *s); int sk_send(sock *, uint len); /* Send data, <0=err, >0=ok, 0=sleep */ int sk_send_to(sock *, uint len, ip_addr to, uint port); /* sk_send to given destination */ void sk_reallocate(sock *); /* Free and allocate tbuf & rbuf */ diff --cc nest/mpls.c index a362ef80b,9cdcd572a..b81f82c1f --- a/nest/mpls.c +++ b/nest/mpls.c @@@ -874,31 -760,10 +874,31 @@@ mpls_fec_map_reconfigure(struct mpls_fe /* Remove old unused handles */ if (old_d && !old_d->label_count) - mpls_free_handle(m->domain, old_d); + mpls_free_handle(old_d); if (old_s && !old_s->label_count) - mpls_free_handle(m->domain, old_s); + mpls_free_handle(old_s); +} + +static void +mpls_fec_map_cleanup(void *_m) +{ + struct mpls_fec_map *m = _m; - _Bool finished = (m->channel->channel_state == CS_STOP); ++ bool finished = (m->channel->channel_state == CS_STOP); + HASH_WALK_DELSAFE(m->label_hash, next_l, fec) + if (lfuc_finished(&fec->uc)) + mpls_free_fec(m, fec); + else + finished = 0; + HASH_WALK_DELSAFE_END; + + DBGL("FEC Map %p Cleanup: %sfinished", m, finished ? 
"" : "not "); + + if (finished) + { + ev_postpone(m->cleanup_event); + channel_del_obstacle(m->channel); + } } void @@@ -1435,17 -1290,9 +1435,17 @@@ mpls_show_ranges(struct mpls_show_range mpls_show_ranges_dom(cmd, cmd->domain->domain); else { - struct mpls_domain *m; - WALK_LIST(m, mpls_domains) + struct mpls_domain_pub *m; - _Bool first = 1; ++ bool first = 1; + WALK_LIST(m, MPLS_GLOBAL->domains) + { + if (first) + first = 0; + else + cli_msg(-1026, ""); + mpls_show_ranges_dom(cmd, m); + } } cli_msg(0, ""); diff --cc nest/proto.c index 279ac0698,88f4813ef..81d1e9240 --- a/nest/proto.c +++ b/nest/proto.c @@@ -667,129 -478,67 +667,129 @@@ channel_start_export(struct channel *c } static void -channel_reload_loop(void *ptr) +channel_check_stopped(struct channel *c) { - struct channel *c = ptr; + switch (c->channel_state) + { + case CS_STOP: + if (c->obstacles || !EMPTY_LIST(c->roa_subscriptions) || c->in_req.hook) + return; - /* Start reload */ - if (!c->reload_active) - c->reload_pending = 0; + ASSERT_DIE(rt_export_get_state(&c->out_req) == TES_DOWN); + ASSERT_DIE(!rt_export_feed_active(&c->reimporter)); - if (!rt_reload_channel(c)) - { - ev_schedule_work(c->reload_event); - return; + channel_set_state(c, CS_DOWN); + proto_send_event(c->proto, c->proto->event); + + break; + case CS_PAUSE: + if (c->obstacles || !EMPTY_LIST(c->roa_subscriptions)) + return; + + ASSERT_DIE(rt_export_get_state(&c->out_req) == TES_DOWN); + ASSERT_DIE(!rt_export_feed_active(&c->reimporter)); + + channel_set_state(c, CS_START); + break; } - /* Restart reload */ - if (c->reload_pending) - channel_request_reload(c); + DBG("%s.%s: Channel requests/hooks stopped (in state %s)\n", c->proto->name, c->name, c_states[c->channel_state]); } -static void -channel_reset_import(struct channel *c) +void +channel_add_obstacle(struct channel *c) { - /* Need to abort feeding */ - ev_postpone(c->reload_event); - rt_reload_channel_abort(c); - - rt_prune_sync(c->in_table, 1); + c->obstacles++; } -static void -channel_reset_export(struct channel *c) +void +channel_del_obstacle(struct channel *c) { - /* Just free the routes */ - rt_prune_sync(c->out_table, 1); + if (!--c->obstacles) + channel_check_stopped(c); } -/* Called by protocol to activate in_table */ void -channel_setup_in_table(struct channel *c) +channel_import_stopped(struct rt_import_request *req) { - struct rtable_config *cf = mb_allocz(c->proto->pool, sizeof(struct rtable_config)); + SKIP_BACK_DECLARE(struct channel, c, in_req, req); - cf->name = "import"; - cf->addr_type = c->net_type; - cf->internal = 1; + mb_free(c->in_req.name); + c->in_req.name = NULL; - c->in_table = cf->table = rt_setup(c->proto->pool, cf); + bmap_free(&c->imported_map); - c->reload_event = ev_new_init(c->proto->pool, channel_reload_loop, c); + channel_check_stopped(c); } -/* Called by protocol to activate out_table */ -void -channel_setup_out_table(struct channel *c) +static u32 +channel_reimport_next_feed_index(struct rt_export_feeder *f, u32 try_this) { - struct rtable_config *cf = mb_allocz(c->proto->pool, sizeof(struct rtable_config)); - cf->name = "export"; - cf->addr_type = c->net_type; - cf->internal = 1; + SKIP_BACK_DECLARE(struct channel, c, reimporter, f); + while (!bmap_test(&c->imported_map, try_this)) + if (!(try_this & (try_this - 1))) /* return every power of two to check for maximum */ + return try_this; + else + try_this++; + + return try_this; +} + +static void +channel_do_reload(void *_c) +{ + struct channel *c = _c; + + RT_FEED_WALK(&c->reimporter, f) + { - _Bool seen = 0; ++ 
bool seen = 0; + for (uint i = 0; i < f->count_routes; i++) + { + rte *r = &f->block[i]; + + if (r->flags & REF_OBSOLETE) + break; + + if (r->sender == c->in_req.hook) + { + /* Strip the table-specific information */ + rte new = rte_init_from(r); + + /* Strip the later attribute layers */ + new.attrs = ea_strip_to(new.attrs, BIT32_ALL(EALS_PREIMPORT)); + + /* And reload the route */ + rte_update(c, r->net, &new, new.src); - c->out_table = rt_setup(c->proto->pool, cf); + seen = 1; + } + } + + if (!seen) + bmap_clear(&c->imported_map, f->ni->index); + + /* Local data needed no more */ + tmp_flush(); + + MAYBE_DEFER_TASK(proto_work_list(c->proto), &c->reimport_event, + "%s.%s reimport", c->proto->name, c->name); + } +} + +/* Called by protocol to activate in_table */ +static void +channel_setup_in_table(struct channel *c) +{ + c->reimporter = (struct rt_export_feeder) { + .name = mb_sprintf(c->proto->pool, "%s.%s.reimport", c->proto->name, c->name), + .trace_routes = c->debug, + .next_feed_index = channel_reimport_next_feed_index, + }; + c->reimport_event = (event) { + .hook = channel_do_reload, + .data = c, + }; + rt_feeder_subscribe(&c->table->export_all, &c->reimporter); } diff --cc nest/route.h index c640a8a21,659783a8b..3b56ccb6b --- a/nest/route.h +++ b/nest/route.h @@@ -46,300 -27,108 +46,300 @@@ struct f_trie_walk_state struct cli; /* - * Generic data structure for storing network prefixes. Also used - * for the master routing table. Currently implemented as a hash - * table. + * Master Routing Tables. Generally speaking, each of them contains a FIB + * with each entry pointing to a list of route entries representing routes + * to given network (with the selected one at the head). + * + * Each of the RTE's contains variable data (the preference and protocol-dependent + * metrics) and a pointer to a route attribute block common for many routes). * - * Available operations: - * - insertion of new entry - * - deletion of entry - * - searching for entry by network prefix - * - asynchronous retrieval of fib contents + * It's guaranteed that there is at most one RTE for every (prefix,proto) pair. 
*/ -struct fib_node { - struct fib_node *next; /* Next in hash chain */ - struct fib_iterator *readers; /* List of readers of this node */ - net_addr addr[0]; +struct rtable_config { + node n; + char *name; + union rtable *table; + struct proto_config *krt_attached; /* Kernel syncer attached to this table */ + uint addr_type; /* Type of address data stored in table (NET_*) */ + uint gc_threshold; /* Maximum number of operations before GC is run */ + uint gc_period; /* Approximate time between two consecutive GC runs */ + u32 debug; /* Debugging flags (D_*) */ + byte sorted; /* Routes of network are sorted according to rte_better() */ + byte trie_used; /* Rtable has attached trie */ + struct rt_cork_threshold { + u64 low, high; + } cork_threshold; /* Cork threshold values */ + struct settle_config export_settle; /* Export announcement settler */ + struct settle_config export_rr_settle;/* Export announcement settler config valid when any + route refresh is running */ + struct settle_config digest_settle; /* Settle times for digests */ + struct rtable_config *roa_aux_table; /* Auxiliary table config for ROA connections */ + struct rt_stream_config { + struct rtable_config *src; + void (*setup)(union rtable *); + void (*stop)(union rtable *); + } master; /* Data source (this table is aux) */ }; -struct fib_iterator { /* See lib/slists.h for an explanation */ - struct fib_iterator *prev, *next; /* Must be synced with struct fib_node! */ - byte efef; /* 0xff to distinguish between iterator and node */ - byte pad[3]; - struct fib_node *node; /* Or NULL if freshly merged */ - uint hash; +/* + * Route export journal + * + * The journal itself is held in struct rt_exporter. + * Workflow: + * (1) Initialize by rt_exporter_init() + * (2) Push data by rt_exporter_push() (the export item is copied) + * (3) Shutdown by rt_exporter_shutdown(), event is called after cleanup + * + * Subscribers: + * (1) Initialize by rt_export_subscribe() + * (2a) Get data by rt_export_get(); + * (2b) Release data after processing by rt_export_release() + * (3) Request refeed by rt_export_refeed() + * (4) Unsubscribe by rt_export_unsubscribe() + */ + +struct rt_export_request { + /* Formal name */ + char *name; + + /* Memory */ + pool *pool; + + /* State information */ + enum rt_export_state { +#define RT_EXPORT_STATES \ + DOWN, \ + FEEDING, \ + PARTIAL, \ + READY, \ + STOP, \ + +#define RT_EXPORT_STATES_ENUM_HELPER(p) TES_##p, + MACRO_FOREACH(RT_EXPORT_STATES_ENUM_HELPER, RT_EXPORT_STATES) + TES_MAX +#undef RT_EXPORT_STATES_ENUM_HELPER + } _Atomic export_state; + btime last_state_change; + + /* Table feeding contraption */ + struct rt_export_feeder { + /* Formal name */ + char *name; + + /* Enlisting */ + struct rt_exporter * _Atomic exporter; + DOMAIN(rtable) domain; /* Lock this instead of RCU */ + + /* Prefiltering, useful for more scenarios */ + struct rt_prefilter { + /* Network prefilter mode (TE_ADDR_*) */ + enum { + TE_ADDR_NONE = 0, /* No address matching */ + TE_ADDR_EQUAL, /* Exact query - show route */ + TE_ADDR_FOR, /* Longest prefix match - show route for */ + TE_ADDR_IN, /* Interval query - show route in */ + TE_ADDR_TRIE, /* Query defined by trie */ + TE_ADDR_HOOK, /* Query processed by supplied custom hook */ + } mode; + + union { + const struct f_trie *trie; + const net_addr *addr; + int (*hook)(const struct rt_prefilter *, const net_addr *); + }; + } prefilter; + +#define TLIST_PREFIX rt_export_feeder +#define TLIST_TYPE struct rt_export_feeder +#define TLIST_ITEM n +#define TLIST_WANT_WALK +#define 
TLIST_WANT_ADD_TAIL + + /* Feeding itself */ + u32 feed_index; /* Index of the feed in progress */ + u32 (*next_feed_index)(struct rt_export_feeder *, u32 try_this); + struct rt_feeding_request { + struct rt_feeding_request *next; /* Next in request chain */ + void (*done)(struct rt_feeding_request *);/* Called when this refeed finishes */ + struct rt_prefilter prefilter; /* Reload only matching nets */ + PACKED enum { + RFRS_INACTIVE = 0, /* Inactive request */ + RFRS_PENDING, /* Request enqueued, do not touch */ + RFRS_RUNNING, /* Request active, do not touch */ + } state; + } *feeding, *feed_pending; + TLIST_DEFAULT_NODE; + u8 trace_routes; + } feeder; + + /* Regular updates */ + struct bmap seq_map; /* Which lfjour items are already processed */ + struct bmap feed_map; /* Which nets were already fed (for initial feeding) */ + struct lfjour_recipient r; + struct rt_export_union *cur; + + /* Statistics */ + struct rt_export_stats { + u32 updates_received; /* Number of route updates received */ + u32 withdraws_received; /* Number of route withdraws received */ + } stats; + + /* Tracing */ + u8 trace_routes; + void (*dump)(struct rt_export_request *req); + void (*fed)(struct rt_export_request *req); }; -typedef void (*fib_init_fn)(struct fib *, void *); - -struct fib { - pool *fib_pool; /* Pool holding all our data */ - slab *fib_slab; /* Slab holding all fib nodes */ - struct fib_node **hash_table; /* Node hash table */ - uint hash_size; /* Number of hash table entries (a power of two) */ - uint hash_order; /* Binary logarithm of hash_size */ - uint hash_shift; /* 32 - hash_order */ - uint addr_type; /* Type of address data stored in fib (NET_*) */ - uint node_size; /* FIB node size, 0 for nonuniform */ - uint node_offset; /* Offset of fib_node struct inside of user data */ - uint entries; /* Number of entries */ - uint entries_min, entries_max; /* Entry count limits (else start rehashing) */ - fib_init_fn init; /* Constructor */ +#include "lib/tlists.h" + +struct rt_export_union { + enum rt_export_kind { + RT_EXPORT_STOP = 1, + RT_EXPORT_FEED, + RT_EXPORT_UPDATE, + } kind; + const struct rt_export_item { + LFJOUR_ITEM_INHERIT(li); /* Member of lockfree journal */ + char data[0]; /* Memcpy helper */ + const rte *new, *old; /* Route update */ + } *update; + const struct rt_export_feed { + uint count_routes, count_exports; + struct netindex *ni; + rte *block; + u64 *exports; + char data[0]; + } *feed; + struct rt_export_request *req; }; -static inline void * fib_node_to_user(struct fib *f, struct fib_node *e) -{ return e ? 
(void *) ((char *) e - f->node_offset) : NULL; } +struct rt_exporter { + struct lfjour journal; /* Journal for update keeping */ + TLIST_LIST(rt_export_feeder) feeders; /* List of active feeder structures */ - _Bool _Atomic feeders_lock; /* Spinlock for the above list */ ++ bool _Atomic feeders_lock; /* Spinlock for the above list */ + u8 trace_routes; /* Debugging flags (D_*) */ + u8 net_type; /* Which net this exporter provides */ + DOMAIN(rtable) domain; /* Lock this instead of RCU */ + u32 _Atomic max_feed_index; /* Stop feeding at this index */ + const char *name; /* Name for logging */ + netindex_hash *netindex; /* Table for net <-> id conversion */ + void (*stopped)(struct rt_exporter *); /* Callback when exporter can stop */ + void (*cleanup_done)(struct rt_exporter *, u64 end); /* Callback when cleanup has been done */ - struct rt_export_feed *(*feed_net)(struct rt_exporter *, struct rcu_unwinder *, u32, _Bool (*)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *, const struct rt_export_item *first); ++ struct rt_export_feed *(*feed_net)(struct rt_exporter *, struct rcu_unwinder *, u32, bool (*)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *, const struct rt_export_item *first); + void (*feed_cleanup)(struct rt_exporter *, struct rt_export_feeder *); +}; -static inline struct fib_node * fib_user_to_node(struct fib *f, void *e) -{ return e ? (void *) ((char *) e + f->node_offset) : NULL; } +extern struct rt_export_feed rt_feed_index_out_of_range; -void fib_init(struct fib *f, pool *p, uint addr_type, uint node_size, uint node_offset, uint hash_order, fib_init_fn init); -void *fib_find(struct fib *, const net_addr *); /* Find or return NULL if doesn't exist */ -void *fib_get_chain(struct fib *f, const net_addr *a); /* Find first node in linked list from hash table */ -void *fib_get(struct fib *, const net_addr *); /* Find or create new if nonexistent */ -void *fib_route(struct fib *, const net_addr *); /* Longest-match routing lookup */ -void fib_delete(struct fib *, void *); /* Remove fib entry */ -void fib_free(struct fib *); /* Destroy the fib */ -void fib_check(struct fib *); /* Consistency check for debugging */ +/* Exporter API */ +void rt_exporter_init(struct rt_exporter *, struct settle_config *); +struct rt_export_item *rt_exporter_push(struct rt_exporter *, const struct rt_export_item *); +struct rt_export_feed *rt_alloc_feed(uint routes, uint exports); +void rt_exporter_shutdown(struct rt_exporter *, void (*stopped)(struct rt_exporter *)); -void fit_init(struct fib_iterator *, struct fib *); /* Internal functions, don't call */ -struct fib_node *fit_get(struct fib *, struct fib_iterator *); -void fit_put(struct fib_iterator *, struct fib_node *); -void fit_put_next(struct fib *f, struct fib_iterator *i, struct fib_node *n, uint hpos); -void fit_put_end(struct fib_iterator *i); -void fit_copy(struct fib *f, struct fib_iterator *dst, struct fib_iterator *src); +/* Standalone feeds */ +void rt_feeder_subscribe(struct rt_exporter *, struct rt_export_feeder *); +void rt_feeder_unsubscribe(struct rt_export_feeder *); +void rt_export_refeed_feeder(struct rt_export_feeder *, struct rt_feeding_request *); +struct rt_export_feed *rt_export_next_feed(struct rt_export_feeder *); +#define RT_FEED_WALK(_feeder, _f) \ + for (const struct rt_export_feed *_f; _f = rt_export_next_feed(_feeder); ) \ - static inline _Bool rt_export_feed_active(struct rt_export_feeder *f) -#define FIB_WALK(fib, type, z) do { \ - struct fib_node *fn_, **ff_ = 
(fib)->hash_table; \ - uint count_ = (fib)->hash_size; \ - type *z; \ - while (count_--) \ - for (fn_ = *ff_++; z = fib_node_to_user(fib, fn_); fn_=fn_->next) ++static inline bool rt_export_feed_active(struct rt_export_feeder *f) +{ return !!atomic_load_explicit(&f->exporter, memory_order_acquire); } -#define FIB_WALK_END } while (0) +/* Full blown exports */ +void rtex_export_subscribe(struct rt_exporter *, struct rt_export_request *); +void rtex_export_unsubscribe(struct rt_export_request *); -#define FIB_ITERATE_INIT(it, fib) fit_init(it, fib) +const struct rt_export_union * rt_export_get(struct rt_export_request *); +void rt_export_release(const struct rt_export_union *); +void rt_export_retry_later(const struct rt_export_union *); +void rt_export_processed(struct rt_export_request *, u64); +void rt_export_refeed_request(struct rt_export_request *rer, struct rt_feeding_request *rfr); -#define FIB_ITERATE_START(fib, it, type, z) do { \ - struct fib_node *fn_ = fit_get(fib, it); \ - uint count_ = (fib)->hash_size; \ - uint hpos_ = (it)->hash; \ - type *z; \ - for(;;) { \ - if (!fn_) \ - { \ - if (++hpos_ >= count_) \ - break; \ - fn_ = (fib)->hash_table[hpos_]; \ - continue; \ - } \ - z = fib_node_to_user(fib, fn_); +static inline enum rt_export_state rt_export_get_state(struct rt_export_request *r) +{ return atomic_load_explicit(&r->export_state, memory_order_acquire); } +const char *rt_export_state_name(enum rt_export_state state); -#define FIB_ITERATE_END fn_ = fn_->next; } } while(0) +static inline void rt_export_walk_cleanup(const struct rt_export_union **up) +{ + if (*up) + rt_export_release(*up); +} -#define FIB_ITERATE_PUT(it) fit_put(it, fn_) +#define RT_EXPORT_WALK(_reader, _u) \ + for (CLEANUP(rt_export_walk_cleanup) const struct rt_export_union *_u;\ + _u = rt_export_get(_reader); \ + rt_export_release(_u)) \ + +/* Convenince common call to request refeed */ +#define rt_export_refeed(h, r) _Generic((h), \ + struct rt_export_feeder *: rt_export_refeed_feeder, \ + struct rt_export_request *: rt_export_refeed_request, \ + void *: bug)(h, r) + +/* Subscription to regular table exports needs locking */ +#define rt_export_subscribe(_t, _kind, f) do { \ + RT_LOCKED(_t, tp) { \ + rt_lock_table(tp); \ + rtex_export_subscribe(&tp->export_##_kind, f); \ + }} while (0) \ + +#define rt_export_unsubscribe(_kind, _fx) do { \ + struct rt_export_request *_f = _fx; \ + struct rt_exporter *e = atomic_load_explicit(&_f->feeder.exporter, memory_order_acquire); \ + RT_LOCKED(SKIP_BACK(rtable, export_##_kind, e), _tp) { \ + rtex_export_unsubscribe(_f); \ + rt_unlock_table(_tp); \ + }} while (0) \ + +static inline int rt_prefilter_net(const struct rt_prefilter *p, const net_addr *n) +{ + switch (p->mode) + { + case TE_ADDR_NONE: return 1; + case TE_ADDR_IN: return net_in_netX(n, p->addr); + case TE_ADDR_EQUAL: return net_equal(n, p->addr); + case TE_ADDR_FOR: return net_in_netX(p->addr, n); + case TE_ADDR_TRIE: return trie_match_net(p->trie, n); + case TE_ADDR_HOOK: return p->hook(p, n); + } + + bug("Crazy prefilter application attempt failed wildly."); +} + - static inline _Bool ++static inline bool +rt_net_is_feeding_feeder(struct rt_export_feeder *ref, const net_addr *n) +{ + if (!rt_prefilter_net(&ref->prefilter, n)) + return 0; + + if (!ref->feeding) + return 1; -#define FIB_ITERATE_PUT_NEXT(it, fib) fit_put_next(fib, it, fn_, hpos_) + for (struct rt_feeding_request *rfr = ref->feeding; rfr; rfr = rfr->next) + if (rt_prefilter_net(&rfr->prefilter, n)) + return 1; -#define 
FIB_ITERATE_PUT_END(it) fit_put_end(it) + return 0; +} - static inline _Bool -#define FIB_ITERATE_UNLINK(it, fib) fit_get(fib, it) ++static inline bool +rt_net_is_feeding_request(struct rt_export_request *req, const net_addr *n) +{ + struct netindex *ni = NET_TO_INDEX(n); + return + !bmap_test(&req->feed_map, ni->index) + && rt_net_is_feeding_feeder(&req->feeder, n); +} -#define FIB_ITERATE_COPY(dst, src, fib) fit_copy(fib, dst, src) +#define rt_net_is_feeding(h, n) _Generic((h), \ + struct rt_export_feeder *: rt_net_is_feeding_feeder, \ + struct rt_export_request *: rt_net_is_feeding_request, \ + void *: bug)(h, n) /* @@@ -400,266 -181,105 +400,266 @@@ struct rtable_private * delete as soon as use_count becomes 0 and remove * obstacle from this routing table. */ - struct event *rt_event; /* Routing table event */ + struct rt_export_request best_req; /* Internal request from best route announcement cleanup */ + struct rt_uncork_callback nhu_uncork; /* Helper event to schedule NHU on uncork */ + struct rt_uncork_callback hcu_uncork; /* Helper event to schedule HCU on uncork */ struct timer *prune_timer; /* Timer for periodic pruning / GC */ + struct event *prune_event; /* Event for prune execution */ btime last_rt_change; /* Last time when route changed */ - btime base_settle_time; /* Start time of rtable settling interval */ btime gc_time; /* Time of last GC */ uint gc_counter; /* Number of operations since last GC */ + uint rr_counter; /* Number of currently running route refreshes, + in fact sum of (stale_set - stale_pruned) over all importers + + one for each TIS_FLUSHING importer */ + uint wait_counter; /* Number of imports in TIS_WAITING state */ byte prune_state; /* Table prune state, 1 -> scheduled, 2-> running */ byte prune_trie; /* Prune prefix trie during next table prune */ - byte hcu_scheduled; /* Hostcache update is scheduled */ + byte imports_flushing; /* Some imports are being flushed right now */ byte nhu_state; /* Next Hop Update state */ - struct fib_iterator prune_fit; /* Rtable prune FIB iterator */ - struct fib_iterator nhu_fit; /* Next Hop Update FIB iterator */ + byte nhu_corked; /* Next Hop Update is corked with this state */ + byte export_used; /* Pending Export pruning is scheduled */ + byte cork_active; /* Cork has been activated */ + struct rt_cork_threshold cork_threshold; /* Threshold for table cork */ + u32 prune_index; /* Rtable prune FIB iterator */ + u32 nhu_index; /* Next Hop Update FIB iterator */ + event *nhu_event; /* Nexthop updater */ struct f_trie *trie_new; /* New prefix trie defined during pruning */ - struct f_trie *trie_old; /* Old prefix trie waiting to be freed */ + const struct f_trie *trie_old; /* Old prefix trie waiting to be freed */ u32 trie_lock_count; /* Prefix trie locked by walks */ u32 trie_old_lock_count; /* Old prefix trie locked by walks */ + struct tbf rl_pipe; /* Rate limiting token buffer for pipe collisions */ - list subscribers; /* Subscribers for notifications */ - struct timer *settle_timer; /* Settle time for notifications */ - list flowspec_links; /* List of flowspec links, src for NET_IPx and dst for NET_FLOWx */ struct f_trie *flowspec_trie; /* Trie for evaluation of flowspec notifications */ // struct mpls_domain *mpls_domain; /* Label allocator for MPLS */ -} rtable; + u32 rte_free_deferred; /* Counter of deferred rte_free calls */ -struct rt_subscription { - node n; - rtable *tab; - void (*hook)(struct rt_subscription *b); - void *data; + struct rt_digestor *export_digest; /* Route export journal for digest tries */ 
+ struct rt_stream *master; /* Data source (this table is aux) */ }; -struct rt_flowspec_link { - node n; - rtable *src; - rtable *dst; - u32 uc; +/* The final union private-public rtable structure */ +typedef union rtable { + struct { + RTABLE_PUBLIC; + }; + struct rtable_private priv; +} rtable; + +/* Define the lock cleanup function */ +LOBJ_UNLOCK_CLEANUP(rtable, rtable); + +#define RT_IS_LOCKED(tab) LOBJ_IS_LOCKED((tab), rtable) +#define RT_LOCKED(tab, tp) LOBJ_LOCKED((tab), tp, rtable, rtable) +#define RT_LOCK(tab, tp) LOBJ_LOCK((tab), tp, rtable, rtable) + +#define RT_LOCK_SIMPLE(tab) LOBJ_LOCK_SIMPLE((tab), rtable) +#define RT_UNLOCK_SIMPLE(tab) LOBJ_UNLOCK_SIMPLE((tab), rtable) + +#define RT_UNLOCKED_TEMPORARILY(tab, tp) LOBJ_UNLOCKED_TEMPORARILY((tab), tp, rtable, rtable) + +#define RT_PUB(tab) SKIP_BACK(rtable, priv, tab) + +#define RT_UNCORKING (1ULL << 44) + +extern struct rt_cork { + _Atomic u64 active; + DOMAIN(resource) dom; + event_list queue; +} rt_cork; + +static inline void rt_cork_acquire(void) +{ + atomic_fetch_add_explicit(&rt_cork.active, 1, memory_order_acq_rel); +} + +static inline void rt_cork_release(void) +{ + u64 upd = atomic_fetch_add_explicit(&rt_cork.active, RT_UNCORKING, memory_order_acq_rel) + RT_UNCORKING; + + /* Actualy released? */ + if ((upd >> 44) == (upd & (RT_UNCORKING - 1))) + { + LOCK_DOMAIN(resource, rt_cork.dom); + synchronize_rcu(); + ev_run_list(&rt_cork.queue); + UNLOCK_DOMAIN(resource, rt_cork.dom); + } + + atomic_fetch_sub_explicit(&rt_cork.active, RT_UNCORKING + 1, memory_order_acq_rel); +} + +void rt_cork_send_callback(void *_data); + - static inline _Bool rt_cork_check(struct rt_uncork_callback *rcc) ++static inline bool rt_cork_check(struct rt_uncork_callback *rcc) +{ + /* Wait until all uncorks have finished */ + while (1) + { + rcu_read_lock(); + + /* Not corked */ + u64 corked = atomic_load_explicit(&rt_cork.active, memory_order_acquire); + if (!corked) + { + rcu_read_unlock(); + return 0; + } + + /* Yes, corked */ + if (corked < RT_UNCORKING) + { + if (!rcc->ev.hook) + { + rcc->ev.hook = rt_cork_send_callback; + rcc->ev.data = rcc; + } + + ev_send(&rt_cork.queue, &rcc->ev); + rcu_read_unlock(); + return 1; + } + + /* In progress, retry */ + rcu_read_unlock(); + birdloop_yield(); + } +} + +struct rt_pending_export { + struct rt_export_item it; + struct rt_pending_export *_Atomic next; /* Next export for the same net */ + u64 seq_all; /* Interlink from BEST to ALL */ }; -#define NHU_CLEAN 0 -#define NHU_SCHEDULED 1 -#define NHU_RUNNING 2 -#define NHU_DIRTY 3 +struct rt_net_pending_export { + struct rt_pending_export * _Atomic first, * _Atomic last; +}; typedef struct network { - struct rte *routes; /* Available routes for this network */ - struct fib_node n; /* FIB flags reserved for kernel syncer */ + struct rte_storage * _Atomic routes; /* Available routes for this network */ + + /* Uncleaned pending exports */ + struct rt_net_pending_export all; + struct rt_net_pending_export best; } net; -struct hostcache { - slab *slab; /* Slab holding all hostentries */ - struct hostentry **hash_table; /* Hash table for hostentries */ - unsigned hash_order, hash_shift; - unsigned hash_max, hash_min; - unsigned hash_items; - linpool *lp; /* Linpool for trie */ - struct f_trie *trie; /* Trie of prefixes that might affect hostentries */ - list hostentries; /* List of all hostentries */ - byte update_hostcache; +struct rte_storage { + struct rte_storage * _Atomic next; /* Next in chain */ + union { + struct { + RTE_IN_TABLE_WRITABLE; + }; + const 
struct rte rte; /* Route data */ + }; }; -struct hostentry { - node ln; - ip_addr addr; /* IP address of host, part of key */ - ip_addr link; /* (link-local) IP address of host, used as gw - if host is directly attached */ - rtable *tab; /* Dependent table, part of key */ - rtable *owner; /* Nexthop owner table */ - struct hostentry *next; /* Next in hash chain */ - unsigned hash_key; /* Hash key */ - unsigned uc; /* Use count */ - struct rta *src; /* Source rta entry */ - byte dest; /* Chosen route destination type (RTD_...) */ - byte nexthop_linkable; /* Nexthop list is completely non-device */ - u32 igp_metric; /* Chosen route IGP metric */ +#define RTE_COPY(r) ((r) ? (r)->rte : (rte) {}) +#define RTE_COPY_VALID(r) (((r) && (rte_is_valid((r)))) ? *(r) : (rte) {}) +#define RTE_OR_NULL(r) ((r) ? &((r)->rte) : NULL) +#define RTE_VALID_OR_NULL(r) (((r) && (rte_is_valid((r)))) ? (r) : NULL) + +#define RTES_WRITE(r) (((r) != ((struct rte_storage *) 0)) ? ((struct rte *) &(r)->rte) : NULL) + +#define RTE_GET_NETINDEX(e) NET_TO_INDEX((e)->net) + +/* Table import */ + +struct rt_import_request { + struct rt_import_hook *hook; /* The table part of importer */ + char *name; + u8 trace_routes; + + struct birdloop *loop; /* Where to schedule cleanup event */ + + void (*dump_req)(struct rt_import_request *req); + void (*log_state_change)(struct rt_import_request *req, u8 state); + /* Preimport is called when the @new route is just-to-be inserted, replacing @old. + * Return a route (may be different or modified in-place) to continue or NULL to withdraw. */ + int (*preimport)(struct rt_import_request *req, struct rte *new, const struct rte *old); +}; + +struct rt_import_hook { + node n; + rtable *table; /* The connected table */ + struct rt_import_request *req; /* The requestor */ + + struct rt_import_stats { + /* Import - from protocol to core */ + u32 pref; /* Number of routes selected as best in the (adjacent) routing table */ + u32 updates_ignored; /* Number of route updates rejected as already in route table */ + u32 updates_accepted; /* Number of route updates accepted and imported */ + u32 withdraws_ignored; /* Number of route withdraws rejected as already not in route table */ + u32 withdraws_accepted; /* Number of route withdraws accepted and processed */ + } stats; + + u64 flush_seq; /* Table export seq when the channel announced flushing */ + btime last_state_change; /* Time of last state transition */ + + u8 import_state; /* IS_* */ + u8 stale_set; /* Set this stale_cycle to imported routes */ + u8 stale_valid; /* Routes with this stale_cycle and bigger are considered valid */ + u8 stale_pruned; /* Last prune finished when this value was set at stale_valid */ + u8 stale_pruning; /* Last prune started when this value was set at stale_valid */ + + void (*stopped)(struct rt_import_request *); /* Stored callback when import is stopped */ + event cleanup_event; /* Used to finally unhook the import from the table */ }; -typedef struct rte { - struct rte *next; - net *net; /* Network this RTE belongs to */ - struct rte_src *src; /* Route source that created the route */ - struct channel *sender; /* Channel used to send the route to the routing table */ - struct rta *attrs; /* Attributes of this route */ - u32 id; /* Table specific route id */ - byte flags; /* Flags (REF_...) 
*/ - byte pflags; /* Protocol-specific flags */ - btime lastmod; /* Last modified */ -} rte; -#define REF_COW 1 /* Copy this rte on write */ -#define REF_FILTERED 2 /* Route is rejected by import filter */ -#define REF_STALE 4 /* Route is stale in a refresh cycle */ -#define REF_DISCARD 8 /* Route is scheduled for discard */ -#define REF_MODIFY 16 /* Route is scheduled for modify */ +#define TIS_DOWN 0 +#define TIS_UP 1 +#define TIS_STOP 2 +#define TIS_FLUSHING 3 +#define TIS_WAITING 4 +#define TIS_CLEARED 5 +#define TIS_MAX 6 -/* Route is valid for propagation (may depend on other flags in the future), accepts NULL */ -static inline int rte_is_valid(rte *r) { return r && !(r->flags & REF_FILTERED); } -/* Route just has REF_FILTERED flag */ -static inline int rte_is_filtered(rte *r) { return !!(r->flags & REF_FILTERED); } +void rt_request_import(rtable *tab, struct rt_import_request *req); +void rt_stop_import(struct rt_import_request *, void (*stopped)(struct rt_import_request *)); +const char *rt_import_state_name(u8 state); +static inline u8 rt_import_get_state(struct rt_import_hook *ih) { return ih ? ih->import_state : TIS_DOWN; } + +void rte_import(struct rt_import_request *req, const net_addr *net, rte *new, struct rte_src *src); + +/* When rtable is just a view / aggregate, this is the basis for its source */ +struct rt_stream { + struct rt_import_request dst; + rtable *dst_tab; +}; + + +#if 0 +/* + * For table export processing + */ + +/* Get next rpe. If src is given, it must match. */ +struct rt_pending_export *rpe_next(struct rt_pending_export *rpe, struct rte_src *src); + +/* Walk all rpe's */ +#define RPE_WALK(first, it, src) \ + for (struct rt_pending_export *it = (first); it; it = rpe_next(it, (src))) + +/* Mark the pending export processed */ +void rpe_mark_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); + +#define rpe_mark_seen_all(hook, first, last, src) do { \ + RPE_WALK((first), _rpe, (src)) { \ + rpe_mark_seen((hook), _rpe); \ + if (_rpe == last) break; \ + }} while (0) + +/* Get pending export seen status */ +int rpe_get_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe); + +#endif + +/* + * Channel export hooks. To be refactored out. + */ + +int channel_preimport(struct rt_import_request *req, rte *new, const rte *old); /* Types of route announcement, also used as flags */ diff --cc nest/rt-attr.c index 6e96bc521,c8ef8e081..ff0d54fa6 --- a/nest/rt-attr.c +++ b/nest/rt-attr.c @@@ -913,8 -615,6 +913,8 @@@ ea_do_sort(ea_list *e while (ss); } - static _Bool eattr_same_value(const eattr *a, const eattr *b); ++static bool eattr_same_value(const eattr *a, const eattr *b); + /** * In place discard duplicates and undefs in sorted ea_list. We use stable sort * for this reason. 
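The hunks above build out the layered extended-attribute API: ea_set_attr() and its wrappers prepend one-attribute layers, and ea_normalize() (declared in lib/route.h above, defined in the next hunk) squashes them into a single sorted level. A rough sketch of how these calls are meant to compose, assuming an active tmp_linpool and that the generic attribute classes have already been registered via rta_init(); my_build_attrs() is a hypothetical helper, not part of this patch:

static ea_list *
my_build_attrs(void)
{
  ea_list *attrs = NULL;

  /* Every ea_set_attr_*() call prepends a one-attribute layer allocated from tmp_linpool */
  ea_set_attr_u32(&attrs, &ea_gen_preference, 0, 100);
  ea_set_attr_u32(&attrs, &ea_gen_source, 0, RTS_STATIC);

  /* Squash the layers into one sorted, duplicate-free level; with upto = 0 no
   * stored layer stops the merge, so everything is flattened (cf. ea_merge() in the next hunk) */
  attrs = ea_normalize(attrs, 0);

  /* Embedded attributes are read back via ea_get_int(), with a default when absent */
  ASSERT_DIE(ea_get_int(attrs, &ea_gen_preference, 0) == 100);

  return attrs;
}

The result is still a temporary list (stored == EALS_NONE); putting it into a table goes through ea_lookup() or ea_lookup_tmp() with the target EALS_* class, as declared in lib/route.h above.
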
@@@ -1083,63 -733,9 +1083,63 @@@ ea_merge(ea_list *e, ea_list *t, u32 up t->count += e->count; d += e->count; e = e->next; + + if (e && BIT32_TEST(&upto, e->stored)) + break; } + + t->next = e; +} + +ea_list * +ea_normalize(ea_list *e, u32 upto) +{ +#if 0 + debug("(normalize)"); + ea_dump(e); + debug(" ----> "); +#endif + ea_list *t = tmp_allocz(ea_scan(e, upto)); + ea_merge(e, t, upto); + ea_sort(t); +#if 0 + ea_dump(t); + debug("\n"); +#endif + + return t; +} + - static _Bool ++static bool +eattr_same_value(const eattr *a, const eattr *b) +{ + if ( + a->id != b->id || + a->flags != b->flags || + a->type != b->type || + a->undef != b->undef + ) + return 0; + + if (a->undef) + return 1; + + if (a->type & EAF_EMBEDDED) + return a->u.data == b->u.data; + else + return adata_same(a->u.ptr, b->u.ptr); } - static _Bool ++static bool +eattr_same(const eattr *a, const eattr *b) +{ + return + eattr_same_value(a, b) && + a->originated == b->originated && + a->fresh == b->fresh; +} + + /** * ea_same - compare two &ea_list's * @x: attribute list diff --cc nest/rt-export.c index cc5cd353b,000000000..e7cfeab84 mode 100644,000000..100644 --- a/nest/rt-export.c +++ b/nest/rt-export.c @@@ -1,576 -1,0 +1,576 @@@ +/* + * BIRD -- Route Export Mechanisms + * + * (c) 2024 Maria Matejka + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#include "nest/bird.h" +#include "nest/route.h" +#include "nest/protocol.h" + +struct rt_export_feed rt_feed_index_out_of_range; + +#define rtex_trace(_req, _cat, msg, args...) do { \ + if ((_req)->trace_routes & _cat) \ + log(L_TRACE "%s: " msg, (_req)->name, ##args); \ +} while (0) + +static inline enum rt_export_state +rt_export_change_state(struct rt_export_request *r, u32 expected_mask, enum rt_export_state state) +{ + r->last_state_change = current_time(); + enum rt_export_state old = atomic_exchange_explicit(&r->export_state, state, memory_order_acq_rel); + if (!((1 << old) & expected_mask)) + bug("Unexpected export state change from %s to %s, expected mask %02x", + rt_export_state_name(old), + rt_export_state_name(state), + expected_mask + ); + + rtex_trace(r, D_STATES, "Export state changed from %s to %s", + rt_export_state_name(old), rt_export_state_name(state)); + + return old; +} + +const struct rt_export_union * +rt_export_get(struct rt_export_request *r) +{ + ASSERT_DIE(!r->cur); + +#define EXPORT_FOUND(_kind) do { \ + struct rt_export_union *reu = tmp_alloc(sizeof *reu); \ + *reu = (struct rt_export_union) { \ + .kind = _kind, \ + .req = r, \ + .update = update, \ + .feed = feed, \ + }; \ + return (r->cur = reu); \ +} while (0) + +#define NOT_THIS_UPDATE \ + lfjour_release(&r->r, &update->li); \ + continue; + + while (1) + { + enum rt_export_state es = rt_export_get_state(r); + switch (es) + { + case TES_DOWN: + rtex_trace(r, (D_ROUTES|D_STATES), "Export is down"); + return NULL; + + case TES_STOP: + rtex_trace(r, (D_ROUTES|D_STATES), "Received stop event"); + struct rt_export_union *reu = tmp_alloc(sizeof *reu); + *reu = (struct rt_export_union) { + .kind = RT_EXPORT_STOP, + .req = r, + }; + return (r->cur = reu); + + case TES_PARTIAL: + case TES_FEEDING: + case TES_READY: + break; + + case TES_MAX: + bug("invalid export state"); + } + + /* Process sequence number reset event */ + if (lfjour_reset_seqno(&r->r)) + bmap_reset(&r->seq_map, 4); + + /* Get a new update */ + SKIP_BACK_DECLARE(struct rt_export_item, update, li, lfjour_get(&r->r)); + SKIP_BACK_DECLARE(struct rt_exporter, e, journal, lfjour_of_recipient(&r->r)); + struct 
rt_export_feed *feed = NULL; + + /* No update, try feed */ + if (!update) + { + if (es == TES_READY) + { + /* Fed up of feeding */ + rtex_trace(r, D_ROUTES, "Export drained"); + return NULL; + } + else if (feed = rt_export_next_feed(&r->feeder)) + { + /* Feeding more */ + bmap_set(&r->feed_map, feed->ni->index); + rtex_trace(r, D_ROUTES, "Feeding %N", feed->ni->addr); + + EXPORT_FOUND(RT_EXPORT_FEED); + } + else if (rt_export_get_state(r) == TES_DOWN) + { + /* Torn down inbetween */ + rtex_trace(r, D_STATES, "Export ended itself"); + return NULL; + } + else + { + /* No more food */ + rt_export_change_state(r, BIT32_ALL(TES_FEEDING, TES_PARTIAL), TES_READY); + rtex_trace(r, D_STATES, "Fed up"); + CALL(r->fed, r); + return NULL; + } + } + + /* There actually is an update */ + if (bmap_test(&r->seq_map, update->seq)) + { + /* But this update has been already processed, let's try another one */ + rtex_trace(r, D_ROUTES, "Skipping an already processed update %lu", update->seq); + NOT_THIS_UPDATE; + } + + /* Is this update allowed by prefilter? */ + const net_addr *n = (update->new ?: update->old)->net; + struct netindex *ni = NET_TO_INDEX(n); + + if (!rt_prefilter_net(&r->feeder.prefilter, n)) + { + rtex_trace(r, D_ROUTES, "Not exporting %N due to prefilter", n); + NOT_THIS_UPDATE; + } + + if ((es != TES_READY) && rt_net_is_feeding(r, n)) + { + /* But this net shall get a feed first! */ + rtex_trace(r, D_ROUTES, "Expediting %N feed due to pending update %lu", n, update->seq); + if (r->feeder.domain.rtable) + { + LOCK_DOMAIN(rtable, r->feeder.domain); + feed = e->feed_net(e, NULL, ni->index, NULL, NULL, update); + UNLOCK_DOMAIN(rtable, r->feeder.domain); + } + else + { + RCU_ANCHOR(u); + feed = e->feed_net(e, u, ni->index, NULL, NULL, update); + } + + bmap_set(&r->feed_map, ni->index); + ASSERT_DIE(feed && (feed != &rt_feed_index_out_of_range)); + + EXPORT_FOUND(RT_EXPORT_FEED); + } + + /* OK, now this actually is an update, thank you for your patience */ + rtex_trace(r, D_ROUTES, "Updating %N, seq %lu", n, update->seq); + + EXPORT_FOUND(RT_EXPORT_UPDATE); + } + +#undef NOT_THIS_UPDATE +#undef EXPORT_FOUND +} + +void +rt_export_release(const struct rt_export_union *u) +{ + /* May be already released */ + if (!u->req) + return; + + struct rt_export_request *r = u->req; + + /* Must be crosslinked */ + ASSERT_DIE(r->cur == u); + r->cur = NULL; + + switch (u->kind) + { + case RT_EXPORT_FEED: + for (uint i = 0; i < u->feed->count_exports; i++) + bmap_set(&r->seq_map, u->feed->exports[i]); + + if (!u->update) + break; + + /* fall through */ + + case RT_EXPORT_UPDATE: + rtex_trace(r, D_ROUTES, "Export %lu released", u->update->seq); + lfjour_release(&r->r, &u->update->li); + + break; + + case RT_EXPORT_STOP: + /* Checking that we have indeed stopped the exporter */ + ASSERT_DIE(rt_export_get_state(r) == TES_DOWN); + rtex_trace(r, D_ROUTES, "Export stopped"); + break; + + default: + bug("strange export kind"); + } +} + +void +rt_export_processed(struct rt_export_request *r, u64 seq) +{ + rtex_trace(r, D_ROUTES, "Marking export %lu as processed", seq); + + /* Check sequence number reset event */ + if (lfjour_reset_seqno(&r->r)) + bmap_reset(&r->seq_map, 4); + + ASSERT_DIE(!bmap_test(&r->seq_map, seq)); + bmap_set(&r->seq_map, seq); +} + +struct rt_export_feed * +rt_alloc_feed(uint routes, uint exports) +{ + struct rt_export_feed *feed; + uint size = sizeof *feed + + routes * sizeof *feed->block + _Alignof(typeof(*feed->block)) + + exports * sizeof *feed->exports + _Alignof(typeof(*feed->exports)); + + 
feed = tmp_alloc(size); + + feed->count_routes = routes; + feed->count_exports = exports; + BIRD_SET_ALIGNED_POINTER(feed->block, feed->data); + BIRD_SET_ALIGNED_POINTER(feed->exports, &feed->block[routes]); + + /* Consistency check */ + ASSERT_DIE(((void *) &feed->exports[exports]) <= ((void *) feed) + size); + + return feed; +} + +static struct rt_export_feed * +rt_export_get_next_feed(struct rt_export_feeder *f, struct rcu_unwinder *u) +{ + for (uint retry = 0; retry < (u ? 1024 : ~0U); retry++) + { + ASSERT_DIE(u || DOMAIN_IS_LOCKED(rtable, f->domain)); + + struct rt_exporter *e = atomic_load_explicit(&f->exporter, memory_order_acquire); + if (!e) + { + rtex_trace(f, (D_ROUTES|D_STATES), "Exporter kicked us away"); + return NULL; + } + + struct rt_export_feed *feed = e->feed_net(e, u, f->feed_index, + rt_net_is_feeding_feeder, f, NULL); + if (feed == &rt_feed_index_out_of_range) + { + rtex_trace(f, D_ROUTES, "Nothing more to feed", f->feed_index); + f->feed_index = ~0; + return NULL; + } + +#define NEXT_INDEX(f) f->feed_index = f->next_feed_index ? f->next_feed_index(f, f->feed_index + 1) : f->feed_index + 1 + +#define NOT_THIS_FEED(...) { \ + rtex_trace(f, D_ROUTES, __VA_ARGS__); \ + NEXT_INDEX(f); \ + continue; \ +} + + if (!feed) + NOT_THIS_FEED("Nothing found for index %u", f->feed_index); + + NEXT_INDEX(f); + return feed; + } + + RCU_RETRY_FAST(u); +} + +struct rt_export_feed * +rt_export_next_feed(struct rt_export_feeder *f) +{ + ASSERT_DIE(f); + + struct rt_export_feed *feed = NULL; + if (f->domain.rtable) + { + LOCK_DOMAIN(rtable, f->domain); + feed = rt_export_get_next_feed(f, NULL); + UNLOCK_DOMAIN(rtable, f->domain); + } + else + { + RCU_ANCHOR(u); + feed = rt_export_get_next_feed(f, u); + } + + if (feed) + return feed; + + /* Feeding done */ + struct rt_feeding_request *reverse = NULL; + while (f->feeding) + { + struct rt_feeding_request *rfr = f->feeding; + f->feeding = rfr->next; + rfr->next = reverse; + reverse = rfr; + } + + /* Call the done hook in the same order as requests came in */ + while (reverse) + { + struct rt_feeding_request *rfr = reverse; + reverse = rfr->next; + CALL(rfr->done, rfr); + } + + f->feed_index = 0; + + uint count = 0; + for (struct rt_feeding_request *rfr = f->feed_pending; rfr; rfr = rfr->next) + count++; + + rtex_trace(f, D_STATES, "Feeding done, %u refeed request%s pending", + count, (count == 1) ? 
"" : "s"); + + if (!f->feed_pending) + return NULL; + + f->feeding = f->feed_pending; + f->feed_pending = NULL; + return rt_export_next_feed(f); +} + +static void +rt_feeding_request_default_done(struct rt_feeding_request *rfr) +{ + mb_free(rfr); +} + +void +rt_export_refeed_feeder(struct rt_export_feeder *f, struct rt_feeding_request *rfr) +{ + if (!rfr) + return; + + rfr->next = f->feed_pending; + f->feed_pending = rfr; +} + +void rt_export_refeed_request(struct rt_export_request *rer, struct rt_feeding_request *rfr) +{ + if (!rfr) + { + rfr = mb_allocz(rer->pool, sizeof *rfr); + rfr->done = rt_feeding_request_default_done; + } + + bmap_reset(&rer->feed_map, 4); + rt_export_refeed_feeder(&rer->feeder, rfr); + rt_export_change_state(rer, BIT32_ALL(TES_FEEDING, TES_PARTIAL, TES_READY), TES_PARTIAL); + if (rer->r.event) + ev_send(rer->r.target, rer->r.event); +} + +void +rtex_export_subscribe(struct rt_exporter *e, struct rt_export_request *r) +{ + rt_export_change_state(r, BIT32_ALL(TES_DOWN), TES_FEEDING); + + ASSERT_DIE(r->pool); + + rt_feeder_subscribe(e, &r->feeder); + + lfjour_register(&e->journal, &r->r); + + r->stats = (struct rt_export_stats) {}; + r->last_state_change = current_time(); + bmap_init(&r->seq_map, r->pool, 4); + bmap_init(&r->feed_map, r->pool, 4); + + rt_export_refeed_request(r, NULL); +} + +void +rtex_export_unsubscribe(struct rt_export_request *r) +{ + rt_feeder_unsubscribe(&r->feeder); + + if (r->cur) + rt_export_release(r->cur); + + switch (rt_export_change_state(r, BIT32_ALL(TES_FEEDING, TES_PARTIAL, TES_READY, TES_STOP), TES_DOWN)) + { + case TES_FEEDING: + case TES_PARTIAL: + case TES_READY: + case TES_STOP: + lfjour_unregister(&r->r); + break; + default: + bug("not implemented"); + } + + bmap_free(&r->feed_map); + bmap_free(&r->seq_map); +} + +static void +rt_exporter_cleanup_done(struct lfjour *j, u64 begin_seq UNUSED, u64 end_seq) +{ + SKIP_BACK_DECLARE(struct rt_exporter, e, journal, j); + + /* TODO: log the begin_seq / end_seq values */ + + CALL(e->cleanup_done, e, end_seq); + if (e->stopped && (lfjour_count_recipients(j) == 0)) + { + settle_cancel(&j->announce_timer); + ev_postpone(&j->cleanup_event); + e->stopped(e); + } +} + +void +rt_exporter_init(struct rt_exporter *e, struct settle_config *scf) +{ + rtex_trace(e, D_STATES, "Exporter init"); + e->journal.cleanup_done = rt_exporter_cleanup_done; + lfjour_init(&e->journal, scf); + ASSERT_DIE(e->feed_net); + ASSERT_DIE(e->netindex); +} + +struct rt_export_item * +rt_exporter_push(struct rt_exporter *e, const struct rt_export_item *uit) +{ + /* Get the object */ + struct lfjour_item *lit = lfjour_push_prepare(&e->journal); + if (!lit) + return NULL; + + SKIP_BACK_DECLARE(struct rt_export_item, it, li, lit); + + /* Copy the data, keeping the header */ + memcpy(&it->data, &uit->data, e->journal.item_size - OFFSETOF(struct rt_export_item, data)); + + /* Commit the update */ + rtex_trace(e, D_ROUTES, "Announcing change %lu at %N: %p (%u) -> %p (%u)", + lit->seq, (uit->new ?: uit->old)->net, + uit->old, uit->old ? uit->old->id : 0, + uit->new, uit->new ? 
uit->new->id : 0); + + lfjour_push_commit(&e->journal); + + /* Return the update pointer */ + return it; +} + +#define RTEX_FEEDERS_LOCK(e) \ + while (atomic_exchange_explicit(&e->feeders_lock, 1, memory_order_acq_rel)) \ + birdloop_yield(); \ + CLEANUP(_rtex_feeders_unlock_) UNUSED struct rt_exporter *_rtex_feeders_locked_ = e; + +static inline void _rtex_feeders_unlock_(struct rt_exporter **e) +{ + ASSERT_DIE(atomic_exchange_explicit(&(*e)->feeders_lock, 0, memory_order_acq_rel)); +} + +void +rt_feeder_subscribe(struct rt_exporter *e, struct rt_export_feeder *f) +{ + f->feed_index = 0; + + atomic_store_explicit(&f->exporter, e, memory_order_relaxed); + f->domain = e->domain; + + RTEX_FEEDERS_LOCK(e); + rt_export_feeder_add_tail(&e->feeders, f); + + rtex_trace(f, D_STATES, "Subscribed to exporter %s", e->name); +} + +static void +rt_feeder_do_unsubscribe(struct rt_export_feeder *f) +{ + struct rt_exporter *e = atomic_exchange_explicit(&f->exporter, NULL, memory_order_acquire); + if (e) + { + RTEX_FEEDERS_LOCK(e); + rt_export_feeder_rem_node(&e->feeders, f); + + rtex_trace(f, D_STATES, "Unsubscribed from exporter %s", e->name); + } + else + rtex_trace(f, D_STATES, "Already unsubscribed"); +} + +void +rt_feeder_unsubscribe(struct rt_export_feeder *f) +{ + if (f->domain.rtable) + { + LOCK_DOMAIN(rtable, f->domain); + rt_feeder_do_unsubscribe(f); + UNLOCK_DOMAIN(rtable, f->domain); + } + else + { + RCU_ANCHOR(u); + rt_feeder_do_unsubscribe(f); + } +} + +void +rt_exporter_shutdown(struct rt_exporter *e, void (*stopped)(struct rt_exporter *)) +{ + rtex_trace(e, D_STATES, "Exporter shutdown"); + + /* Last lock check before dropping the domain reference */ + if (e->journal.domain) + ASSERT_DIE(DG_IS_LOCKED(e->journal.domain)); + + e->journal.domain = NULL; + + /* We have to tell every receiver to stop */ - _Bool done = 1; ++ bool done = 1; + WALK_TLIST(lfjour_recipient, r, &e->journal.recipients) + { + done = 0; + rt_export_change_state( + SKIP_BACK(struct rt_export_request, r, r), + BIT32_ALL(TES_FEEDING, TES_PARTIAL, TES_READY, TES_STOP), + TES_STOP); + } + + /* We can drop feeders synchronously */ + { + RTEX_FEEDERS_LOCK(e); + WALK_TLIST_DELSAFE(rt_export_feeder, f, &e->feeders) + { + ASSERT_DIE(atomic_exchange_explicit(&f->exporter, NULL, memory_order_acq_rel) == e); + rt_export_feeder_rem_node(&e->feeders, f); + } + } + + /* Wait for feeders to finish */ + synchronize_rcu(); + + /* The rest is done via the cleanup routine */ + lfjour_do_cleanup_now(&e->journal); + + if (done) + { + ev_postpone(&e->journal.cleanup_event); + settle_cancel(&e->journal.announce_timer); + CALL(stopped, e); + } + else +// e->stopped = stopped; + bug("not implemented yet"); +} diff --cc nest/rt-table.c index fbb0d985e,1b30e7dc7..4f173815f --- a/nest/rt-table.c +++ b/nest/rt-table.c @@@ -407,257 -275,6 +407,257 @@@ net_route(struct rtable_reading *tr, co #undef FVR_VPN } +/* + * ROA aggregation subsystem + */ + +struct rt_roa_aggregator { + struct rt_stream stream; + struct rte_owner sources; + struct rte_src *main_source; + struct rt_export_request src; + event event; +}; + +static void +rt_dump_roa_aggregator_dst_req(struct rt_import_request *req) +{ + debug(" ROA aggregator import request req=%p", req); +} + +static void +rt_dump_roa_aggregator_src_req(struct rt_export_request *req) +{ + debug(" ROA aggregator export request req=%p", req); +} + +static void +rt_roa_aggregator_state_change(struct rt_import_request *req, u8 state) +{ + if (req->trace_routes & D_STATES) + log("%s: import state changed to %s", + 
req->name, rt_import_state_name(state)); +} + +struct rt_roa_aggregated_adata { + adata ad; + u32 padding; + struct { u32 asn, max_pxlen; } u[0]; +}; + +#define ROA_AGGR_COUNT(rad) (((typeof (&(rad)->u[0])) (rad->ad.data + rad->ad.length)) - &(rad)->u[0]) + +static void +ea_roa_aggregate_format(const eattr *a, byte *buf, uint size) +{ + SKIP_BACK_DECLARE(struct rt_roa_aggregated_adata, rad, ad, a->u.ptr); + uint cnt = ROA_AGGR_COUNT(rad); + for (uint upos = 0; upos < cnt; upos++) + { + int x = bsnprintf(buf, size, "as %u max %u, ", rad->u[upos].asn, rad->u[upos].max_pxlen); + size -= x; + buf += x; + if (size < 30) + { + bsnprintf(buf, size, " ... "); + return; + } + } + + buf[-2] = 0; +} + +static struct ea_class ea_roa_aggregated = { + .name = "roa_aggregated", + .type = T_ROA_AGGREGATED, + .format = ea_roa_aggregate_format, +}; + + +static void +rt_aggregate_roa(void *_rag) +{ + struct rt_roa_aggregator *rag = _rag; + + RT_EXPORT_WALK(&rag->src, u) TMP_SAVED + { - _Bool withdraw = 0; ++ bool withdraw = 0; + const net_addr *nroa = NULL; + switch (u->kind) + { + case RT_EXPORT_STOP: + bug("Main table export stopped"); + break; + + case RT_EXPORT_FEED: + nroa = u->feed->ni->addr; + withdraw = (u->feed->count_routes == 0); + break; + + case RT_EXPORT_UPDATE: + nroa = u->update->new ? u->update->new->net : u->update->old->net; + withdraw = !u->update->new; + break; + } + + net_addr_union nip; + net_copy(&nip.n, nroa); + + uint asn, max_pxlen; + + switch (nip.n.type) + { + case NET_ROA6: nip.n.type = NET_IP6; + nip.n.length = net_addr_length[NET_IP6]; + asn = nip.roa6.asn; + max_pxlen = nip.roa6.max_pxlen; + break; + case NET_ROA4: nip.n.type = NET_IP4; + nip.n.length = net_addr_length[NET_IP4]; + asn = nip.roa4.asn; + max_pxlen = nip.roa4.max_pxlen; + break; + default: bug("exported garbage from ROA table"); + } + + rte prev = rt_net_best(rag->stream.dst_tab, &nip.n); + + struct rt_roa_aggregated_adata *rad_new; + uint count; + + if (prev.attrs) + { + eattr *ea = ea_find(prev.attrs, &ea_roa_aggregated); + SKIP_BACK_DECLARE(struct rt_roa_aggregated_adata, rad, ad, ea->u.ptr); + + count = ROA_AGGR_COUNT(rad); + rad_new = tmp_alloc(sizeof *rad_new + (count + 1) * sizeof rad_new->u[0]); + + /* Insertion into a sorted list */ + uint p = 0; + for (p = 0; p < count; p++) + if ((rad->u[p].asn < asn) || (rad->u[p].asn == asn) && (rad->u[p].max_pxlen < max_pxlen)) + rad_new->u[p] = rad->u[p]; + else + break; + + if ((rad->u[p].asn == asn) && (rad->u[p].max_pxlen)) + /* Found */ + if (withdraw) + memcpy(&rad_new->u[p], &rad->u[p+1], (--count - p) * sizeof rad->u[p]); + else + continue; + else + /* Not found */ + if (withdraw) + continue; + else + { + rad_new->u[p].asn = asn; + rad_new->u[p].max_pxlen = max_pxlen; + memcpy(&rad_new->u[p+1], &rad->u[p], (count++ - p) * sizeof rad->u[p]); + } + } + else if (!withdraw) + { + count = 1; + rad_new = tmp_alloc(sizeof *rad_new + sizeof rad_new->u[0]); + rad_new->u[0].asn = asn; + rad_new->u[0].max_pxlen = max_pxlen; + } + else + continue; + + rad_new->ad.length = (byte *) &rad_new->u[count] - rad_new->ad.data; + + rte r = { + .src = rag->main_source, + }; + + ea_set_attr(&r.attrs, EA_LITERAL_DIRECT_ADATA(&ea_roa_aggregated, 0, &rad_new->ad)); + + rte_import(&rag->stream.dst, &nip.n, &r, rag->main_source); + +#if 0 + /* Do not split ROA aggregator, we want this to be finished asap */ + MAYBE_DEFER_TASK(rag->src.r.target, rag->src.r.event, + "export to %s", rag->src.name); +#endif + } +} + +static void +rt_setup_roa_aggregator(rtable *t) +{ + rtable *src = 
t->config->master.src->table; + struct rt_roa_aggregator *rag; + { + RT_LOCK(t, tab); + char *ragname = mb_sprintf(tab->rp, "%s.roa-aggregator", src->name); + rag = mb_alloc(tab->rp, sizeof *rag); + *rag = (struct rt_roa_aggregator) { + .stream = { + .dst = { + .name = ragname, + .trace_routes = tab->debug, + .loop = t->loop, + .dump_req = rt_dump_roa_aggregator_dst_req, + .log_state_change = rt_roa_aggregator_state_change, + }, + .dst_tab = t, + }, + .src = { + .name = ragname, + .r = { + .target = birdloop_event_list(t->loop), + .event = &rag->event, + }, + .pool = birdloop_pool(t->loop), + .dump = rt_dump_roa_aggregator_src_req, + .trace_routes = tab->debug, + }, + .event = { + .hook = rt_aggregate_roa, + .data = rag, + }, + }; + + rt_init_sources(&rag->sources, ragname, birdloop_event_list(t->loop)); + rag->main_source = rt_get_source_o(&rag->sources, 0); + + tab->master = &rag->stream; + } + + rt_request_import(t, &rag->stream.dst); + rt_export_subscribe(src, best, &rag->src); +} + +static void +rt_roa_aggregator_sources_gone(void *t) +{ + rt_unlock_table((rtable *) t); +} + +static void +rt_stop_roa_aggregator(rtable *t) +{ + struct rt_roa_aggregator *rag; + RT_LOCKED(t, tab) + { + rag = SKIP_BACK(struct rt_roa_aggregator, stream, tab->master); + + rt_lock_table(tab); + rt_destroy_sources(&rag->sources, ev_new_init(tab->rp, + rt_roa_aggregator_sources_gone, tab)); + rt_unlock_source(rag->main_source); + } + + /* Stopping both import and export. + * All memory will be freed with table shutdown, + * no need to do anything from import done callback */ + rt_stop_import(&rag->stream.dst, NULL); + rt_export_unsubscribe(best, &rag->src); +} /** * roa_check - check validity of route origination in a ROA table @@@ -1079,50 -647,51 +1079,50 @@@ rt_notify_basic(struct channel *c, cons } static void -rt_notify_accepted(struct channel *c, net *net, rte *new_changed, rte *old_changed, int refeed) +rt_notify_accepted(struct channel *c, const struct rt_export_feed *feed) { - // struct proto *p = c->proto; - rte *new_best = NULL; - rte *old_best = NULL; - rte *new_free = NULL; - int new_first = 0; + rte *old_best, *new_best; - _Bool feeding = rt_net_is_feeding(&c->out_req, feed->ni->addr); - _Bool idempotent = 0; ++ bool feeding = rt_net_is_feeding(&c->out_req, feed->ni->addr); ++ bool idempotent = 0; - /* - * We assume that there are no changes in net route order except (added) - * new_changed and (removed) old_changed. Therefore, the function is not - * compatible with deterministic_med (where nontrivial reordering can happen - * as a result of a route change) and with recomputation of recursive routes - * due to next hop update (where many routes can be changed in one step). - * - * Note that we need this assumption just for optimizations, we could just - * run full new_best recomputation otherwise. 
- * - * There are three cases: - * feed or old_best is old_changed -> we need to recompute new_best - * old_best is before new_changed -> new_best is old_best, ignore - * old_best is after new_changed -> try new_changed, otherwise old_best - */ - - if (net->routes) - c->stats.exp_updates_received++; - else - c->stats.exp_withdraws_received++; - - /* Find old_best - either old_changed, or route for net->routes */ - if (old_changed && bmap_test(&c->export_map, old_changed->id)) - old_best = old_changed; - else + for (uint i = 0; i < feed->count_routes; i++) { - for (rte *r = net->routes; rte_is_valid(r); r = r->next) + rte *r = &feed->block[i]; + + /* Previously exported */ + if (!old_best && bmap_test(&c->export_accepted_map, r->id)) { - if (bmap_test(&c->export_map, r->id)) + old_best = r; + + /* Is still the best and need not be refed anyway */ + if (!new_best && !feeding) { - old_best = r; - break; + idempotent = 1; + new_best = r; } + } + + /* Unflag obsolete routes */ + if (r->flags & REF_OBSOLETE) + bmap_clear(&c->export_rejected_map, r->id); + + /* Mark invalid as rejected */ + else if (!rte_is_valid(r)) + bmap_set(&c->export_rejected_map, r->id); - /* Note if new_changed found before old_best */ - if (r == new_changed) - new_first = 1; + /* Already rejected */ + else if (!feeding && bmap_test(&c->export_rejected_map, r->id)) + ; + + /* No new best route yet and this is a valid candidate */ + else if (!new_best) + { + /* This branch should not be executed if this route is old best */ + ASSERT_DIE(r != old_best); + + /* Have no new best route yet, try this route not seen before */ + new_best = export_filter(c, r, 0); + DBG("rt_notify_accepted: checking route id %u: %s\n", r->id, new_best ? "ok" : "no"); } } @@@ -1166,20 -730,15 +1166,20 @@@ channel_notify_accepted(void *_channel } rte * -rt_export_merged(struct channel *c, net *net, rte **rt_free, linpool *pool, int silent) +rt_export_merged(struct channel *c, const struct rt_export_feed *feed, linpool *pool, int silent) { - _Bool feeding = !silent && rt_net_is_feeding(&c->out_req, feed->ni->addr); ++ bool feeding = !silent && rt_net_is_feeding(&c->out_req, feed->ni->addr); + // struct proto *p = c->proto; - struct nexthop *nhs = NULL; - rte *best0, *best, *rt0, *rt, *tmp; + struct nexthop_adata *nhs = NULL; + rte *best0 = &feed->block[0]; + rte *best = NULL; - best0 = net->routes; - *rt_free = NULL; + /* First route is obsolete */ + if (best0->flags & REF_OBSOLETE) + return NULL; + /* First route is invalid */ if (!rte_is_valid(best0)) return NULL; @@@ -1246,549 -782,382 +1246,549 @@@ } static void -rt_notify_merged(struct channel *c, net *net, rte *new_changed, rte *old_changed, - rte *new_best, rte *old_best, int refeed) +rt_notify_merged(struct channel *c, const struct rt_export_feed *f) { - // struct proto *p = c->proto; - rte *new_free = NULL; - - /* We assume that all rte arguments are either NULL or rte_is_valid() */ + const rte *old_best = NULL; + /* Find old best route */ + for (uint i = 0; i < f->count_routes; i++) + if (bmap_test(&c->export_accepted_map, f->block[i].id)) + { + old_best = &f->block[i]; + break; + } - /* This check should be done by the caller */ - if (!new_best && !old_best) - return; + /* Prepare new merged route */ + rte *new_merged = f->count_routes ? 
rt_export_merged(c, f, tmp_linpool, 0) : NULL; - /* Check whether the change is relevant to the merged route */ - if ((new_best == old_best) && - (new_changed != old_changed) && - !rte_mergable(new_best, new_changed) && - !rte_mergable(old_best, old_changed)) - return; + /* And notify the protocol */ + if (new_merged || old_best) + do_rt_notify(c, f->ni->addr, new_merged, old_best); +} - if (new_best) - c->stats.exp_updates_received++; - else - c->stats.exp_withdraws_received++; - /* Prepare new merged route */ - if (new_best) - new_best = rt_export_merged(c, net, &new_free, rte_update_pool, 0); +void +channel_notify_merged(void *_channel) +{ + struct channel *c = _channel; - /* Check old merged route */ - if (old_best && !bmap_test(&c->export_map, old_best->id)) - old_best = NULL; + RT_EXPORT_WALK(&c->out_req, u) + { + switch (u->kind) + { + case RT_EXPORT_STOP: + bug("Main table export stopped"); - if (!new_best && !old_best) - return; + case RT_EXPORT_FEED: + if (u->feed->count_routes) + rt_notify_merged(c, u->feed); + break; - do_rt_notify(c, net, new_best, old_best, refeed); + case RT_EXPORT_UPDATE: + { + struct rt_export_feed *f = rt_net_feed(c->table, u->update->new ? u->update->new->net : u->update->old->net, SKIP_BACK(struct rt_pending_export, it, u->update)); + rt_notify_merged(c, f); + for (uint i=0; icount_exports; i++) + rt_export_processed(&c->out_req, f->exports[i]); + break; + } + } - /* Discard temporary rte */ - if (new_free) - rte_free(new_free); + MAYBE_DEFER_TASK(c->out_req.r.target, c->out_req.r.event, + "export to %s.%s (merged)", c->proto->name, c->name); + } } - -/** - * rte_announce - announce a routing table change - * @tab: table the route has been added to - * @type: type of route announcement (RA_UNDEF or RA_ANY) - * @net: network in question - * @new: the new or changed route - * @old: the previous route replaced by the new one - * @new_best: the new best route for the same network - * @old_best: the previous best route for the same network - * - * This function gets a routing table update and announces it to all protocols - * that are connected to the same table by their channels. - * - * There are two ways of how routing table changes are announced. First, there - * is a change of just one route in @net (which may caused a change of the best - * route of the network). In this case @new and @old describes the changed route - * and @new_best and @old_best describes best routes. Other routes are not - * affected, but in sorted table the order of other routes might change. - * - * Second, There is a bulk change of multiple routes in @net, with shared best - * route selection. In such case separate route changes are described using - * @type of %RA_ANY, with @new and @old specifying the changed route, while - * @new_best and @old_best are NULL. After that, another notification is done - * where @new_best and @old_best are filled (may be the same), but @new and @old - * are NULL. - * - * The function announces the change to all associated channels. For each - * channel, an appropriate preprocessing is done according to channel &ra_mode. - * For example, %RA_OPTIMAL channels receive just changes of best routes. - * - * In general, we first call preexport() hook of a protocol, which performs - * basic checks on the route (each protocol has a right to veto or force accept - * of the route before any filter is asked). Then we consult an export filter - * of the channel and verify the old route in an export map of the channel. 
- * Finally, the rt_notify() hook of the protocol gets called. - * - * Note that there are also calls of rt_notify() hooks due to feed, but that is - * done outside of scope of rte_announce(). - */ -static void -rte_announce(rtable *tab, uint type, net *net, rte *new, rte *old, - rte *new_best, rte *old_best) +void +channel_notify_basic(void *_channel) { - if (!rte_is_valid(new)) - new = NULL; + struct channel *c = _channel; - if (!rte_is_valid(old)) - old = NULL; + RT_EXPORT_WALK(&c->out_req, u) + { + switch (u->kind) + { + case RT_EXPORT_STOP: + bug("Main table export stopped"); + + case RT_EXPORT_FEED: + { + /* Find where the old route block begins */ + uint oldpos = 0; + while ((oldpos < u->feed->count_routes) && !(u->feed->block[oldpos].flags & REF_OBSOLETE)) + oldpos++; + + /* Send updates one after another */ + for (uint i = 0; i < oldpos; i++) + { + rte *new = &u->feed->block[i]; + rte *old = NULL; + for (uint o = oldpos; o < u->feed->count_routes; o++) + if (new->src == u->feed->block[o].src) + { + old = &u->feed->block[o]; + break; + } - if (!rte_is_valid(new_best)) - new_best = NULL; + rt_notify_basic(c, new, old); - if (!rte_is_valid(old_best)) - old_best = NULL; + /* Mark old processed */ + if (old) + old->src = NULL; + } - if (!new && !old && !new_best && !old_best) - return; + /* Send withdraws */ + for (uint o = oldpos; o < u->feed->count_routes; o++) + if (u->feed->block[o].src) + rt_notify_basic(c, NULL, &u->feed->block[o]); + } + break; - if (new_best != old_best) - { - if (new_best) - new_best->sender->stats.pref_routes++; - if (old_best) - old_best->sender->stats.pref_routes--; + case RT_EXPORT_UPDATE: + { + const rte *new = u->update->new; + const rte *old = u->update->old; + struct rte_src *src = (c->ra_mode == RA_ANY) ? (new ? new->src : old->src) : NULL; + + /* Squashing subsequent updates */ + for (SKIP_BACK_DECLARE(const struct rt_pending_export, rpe, it, u->update); + rpe = atomic_load_explicit(&rpe->next, memory_order_acquire) ;) + /* Either new is the same as this update's "old". Then the squash + * is obvious. + * + * Or we're squashing an update-from-nothing with a withdrawal, + * and then either src is set because it must match (RA_ANY) + * or it doesn't matter at all (RA_OPTIMAL). 
+ */ + if ((rpe->it.old == new) && (new || src && (src == rpe->it.new->src))) + { + new = rpe->it.new; + rt_export_processed(&c->out_req, rpe->it.seq); + } - if (tab->hostcache) - rt_notify_hostcache(tab, net); + if (new && old && rte_same(new, old)) + { + channel_rte_trace_out(D_ROUTES, c, new, "already exported"); - if (!EMPTY_LIST(tab->flowspec_links)) - rt_flowspec_notify(tab, net); - } + if ((new->id != old->id) && bmap_test(&c->export_accepted_map, old->id)) + { + bmap_set(&c->export_accepted_map, new->id); + bmap_clear(&c->export_accepted_map, old->id); + } + } + else if (!new && !old) + channel_rte_trace_out(D_ROUTES, c, u->update->new, "idempotent withdraw (squash)"); + else + rt_notify_basic(c, new, old); + + break; + } + } - rt_schedule_notify(tab); + MAYBE_DEFER_TASK(c->out_req.r.target, c->out_req.r.event, + "export to %s.%s (regular)", c->proto->name, c->name); + } +} - struct channel *c; node *n; - WALK_LIST2(c, n, tab->channels, table_node) +static void +rt_flush_best(struct rtable_private *tab, u64 upto) +{ + u64 last_seq = 0; + RT_EXPORT_WALK(&tab->best_req, u) { - if (c->export_state == ES_DOWN) - continue; + ASSERT_DIE(u->kind == RT_EXPORT_UPDATE); + ASSERT_DIE(u->update->seq <= upto); + last_seq = u->update->seq; + if (last_seq == upto) + return; + } - if (type && (type != c->ra_mode)) - continue; + rt_trace(tab, D_STATES, "Export best full flushed regular up to %lu", last_seq); +} - switch (c->ra_mode) - { - case RA_OPTIMAL: - if (new_best != old_best) - rt_notify_basic(c, net, new_best, old_best, 0); - break; +static struct rt_pending_export * +rte_announce_to(struct rt_exporter *e, struct rt_net_pending_export *npe, const rte *new, const rte *old) +{ + if (new == old) + return NULL; - case RA_ANY: - if (new != old) - rt_notify_basic(c, net, new, old, 0); - break; + struct rt_pending_export rpe = { + .it = { + .new = new, + .old = old, + }, + }; - case RA_ACCEPTED: - /* - * The (new != old) condition is problematic here, as it would break - * the second usage pattern (announcement after bulk change, used in - * rt_next_hop_update_net(), which sends both new and old as NULL). - * - * But recursive next hops do not work with sorted tables anyways, - * such configuration is forbidden in BGP and not supported in - * rt_notify_accepted(). - * - * The condition is needed to eliminate spurious announcements where - * both old and new routes are not valid (so they are NULL). 
- */ - if (new != old) - rt_notify_accepted(c, net, new, old, 0); - break; + struct rt_export_item *rei = rt_exporter_push(e, &rpe.it); + if (!rei) + return NULL; - case RA_MERGED: - rt_notify_merged(c, net, new, old, new_best, old_best, 0); - break; - } - } + SKIP_BACK_DECLARE(struct rt_pending_export, pushed, it, rei); + + struct rt_pending_export *last = atomic_load_explicit(&npe->last, memory_order_relaxed); + if (last) + ASSERT_DIE(atomic_exchange_explicit(&last->next, pushed, memory_order_acq_rel) == NULL); + + atomic_store_explicit(&npe->last, pushed, memory_order_release); + if (!atomic_load_explicit(&npe->first, memory_order_relaxed)) + atomic_store_explicit(&npe->first, pushed, memory_order_release); + + return pushed; } -static inline int -rte_validate(rte *e) +static void +rte_announce(struct rtable_private *tab, const struct netindex *i UNUSED, net *net, const rte *new, const rte *old, + const rte *new_best, const rte *old_best) { - int c; - net *n = e->net; + /* Update network count */ + tab->net_count += (!!new_best - !!old_best); + + int new_best_valid = rte_is_valid(new_best); + int old_best_valid = rte_is_valid(old_best); + + if ((new == old) && (new_best == old_best)) + return; - if (!net_validate(n->n.addr)) + if (new_best_valid) + new_best->sender->stats.pref++; + if (old_best_valid) + old_best->sender->stats.pref--; + + /* Try to push */ + struct rt_pending_export *best_rpe = NULL; + struct rt_pending_export *all_rpe = rte_announce_to(&tab->export_all, &net->all, new, old); + if (all_rpe) { - log(L_WARN "Ignoring bogus prefix %N received via %s", - n->n.addr, e->sender->proto->name); - return 0; + /* Also best may have changed */ + best_rpe = rte_announce_to(&tab->export_best, &net->best, new_best, old_best); + if (best_rpe) + /* Announced best, need an anchor to all */ + best_rpe->seq_all = all_rpe->it.seq; + else if (!lfjour_pending_items(&tab->export_best.journal)) + /* Best is idle, flush its recipient immediately */ + rt_flush_best(tab, all_rpe->it.seq); + + rt_check_cork_high(tab); } - - /* FIXME: better handling different nettypes */ - c = !net_is_flow(n->n.addr) ? - net_classify(n->n.addr): (IADDR_HOST | SCOPE_UNIVERSE); - if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK)) + else { - log(L_WARN "Ignoring bogus route %N received via %s", - n->n.addr, e->sender->proto->name); - return 0; + /* Not announced anything, cleanup now */ + ASSERT_DIE(new_best == old_best); + hmap_clear(&tab->id_map, old->id); + rte_free(SKIP_BACK(struct rte_storage, rte, old), tab); } +} - if (net_type_match(n->n.addr, NB_DEST) == !e->attrs->dest) - { - /* Exception for flowspec that failed validation */ - if (net_is_flow(n->n.addr) && (e->attrs->dest == RTD_UNREACHABLE)) - return 1; +static net * +rt_cleanup_find_net(struct rtable_private *tab, struct rt_pending_export *rpe) +{ + /* Find the appropriate struct network */ + ASSERT_DIE(rpe->it.new || rpe->it.old); + const net_addr *n = rpe->it.new ? 
+ rpe->it.new->net : + rpe->it.old->net; + struct netindex *ni = NET_TO_INDEX(n); + ASSERT_DIE(ni->index < atomic_load_explicit(&tab->routes_block_size, memory_order_relaxed)); + net *routes = atomic_load_explicit(&tab->routes, memory_order_relaxed); + return &routes[ni->index]; +} - static _Bool - log(L_WARN "Ignoring route %N with invalid dest %d received via %s", - n->n.addr, e->attrs->dest, e->sender->proto->name); - return 0; - } ++static bool +rt_cleanup_update_pointers(struct rt_net_pending_export *npe, struct rt_pending_export *rpe) +{ + struct rt_pending_export *first = atomic_load_explicit(&npe->first, memory_order_relaxed); + struct rt_pending_export *last = atomic_load_explicit(&npe->last, memory_order_relaxed); + ASSERT_DIE(rpe == first); - if ((e->attrs->dest == RTD_UNICAST) && !nexthop_is_sorted(&(e->attrs->nh))) - { - log(L_WARN "Ignoring unsorted multipath route %N received via %s", - n->n.addr, e->sender->proto->name); + atomic_store_explicit( + &npe->first, + atomic_load_explicit(&rpe->next, memory_order_relaxed), + memory_order_release + ); + + if (rpe != last) return 0; - } + atomic_store_explicit(&npe->last, NULL, memory_order_release); return 1; } -/** - * rte_free - delete a &rte - * @e: &rte to be deleted - * - * rte_free() deletes the given &rte from the routing table it's linked to. - */ -void -rte_free(rte *e) +static void +rt_cleanup_export_best(struct lfjour *j, struct lfjour_item *i) { - rt_unlock_source(e->src); - if (rta_is_cached(e->attrs)) - rta_free(e->attrs); - sl_free(e); + SKIP_BACK_DECLARE(struct rt_pending_export, rpe, it.li, i); + SKIP_BACK_DECLARE(struct rtable_private, tab, export_best.journal, j); + rt_flush_best(tab, rpe->seq_all); + + /* Find the appropriate struct network */ + net *net = rt_cleanup_find_net(tab, rpe); + + /* Update the first and last pointers */ + rt_cleanup_update_pointers(&net->best, rpe); } -static inline void -rte_free_quick(rte *e) +static void +rt_cleanup_export_all(struct lfjour *j, struct lfjour_item *i) { - rt_unlock_source(e->src); - rta_free(e->attrs); - sl_free(e); + SKIP_BACK_DECLARE(struct rt_pending_export, rpe, it.li, i); + SKIP_BACK_DECLARE(struct rtable_private, tab, export_all.journal, j); + + /* Find the appropriate struct network */ + net *net = rt_cleanup_find_net(tab, rpe); + + /* Update the first and last pointers */ - _Bool is_last = rt_cleanup_update_pointers(&net->all, rpe); ++ bool is_last = rt_cleanup_update_pointers(&net->all, rpe); + + /* Free the old route */ + if (rpe->it.old) + { + ASSERT_DIE(rpe->it.old->flags & REF_OBSOLETE); + hmap_clear(&tab->id_map, rpe->it.old->id); + rte_free(SKIP_BACK(struct rte_storage, rte, rpe->it.old), tab); + } + + if (is_last) + tab->gc_counter++; } -int -rte_same(rte *x, rte *y) +static void +rt_dump_best_req(struct rt_export_request *req) { - /* rte.flags / rte.pflags are not checked, as they are internal to rtable */ - return - x->attrs == y->attrs && - x->src == y->src && - rte_is_filtered(x) == rte_is_filtered(y); + SKIP_BACK_DECLARE(struct rtable_private, tab, best_req, req); + debug(" Table %s best cleanup request (%p)\n", tab->name, req); } -static inline int rte_is_ok(rte *e) { return e && !rte_is_filtered(e); } - static void -rte_recalculate(struct channel *c, net *net, rte *new, struct rte_src *src) +rt_import_cleared(void *_ih) { - struct proto *p = c->proto; - struct rtable *table = c->table; - struct proto_stats *stats = &c->stats; - static struct tbf rl_pipe = TBF_DEFAULT_LOG_LIMITS; - rte *before_old = NULL; - rte *old_best = net->routes; - rte 
*old = NULL; - rte **k; - - k = &net->routes; /* Find and remove original route from the same protocol */ - while (old = *k) - { - if (old->src == src) - { - /* If there is the same route in the routing table but from - * a different sender, then there are two paths from the - * source protocol to this routing table through transparent - * pipes, which is not allowed. - * - * We log that and ignore the route. If it is withdraw, we - * ignore it completely (there might be 'spurious withdraws', - * see FIXME in do_rte_announce()) - */ - if (old->sender->proto != p) - { - if (new) - { - log_rl(&rl_pipe, L_ERR "Pipe collision detected when sending %N to table %s", - net->n.addr, table->name); - rte_free_quick(new); - } - return; - } - - if (new && rte_same(old, new)) - { - /* No changes, ignore the new route and refresh the old one */ + struct rt_import_hook *hook = _ih; - old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY); - - if (!rte_is_filtered(new)) - { - stats->imp_updates_ignored++; - rte_trace_in(D_ROUTES, c, new, "ignored"); - } + ASSERT_DIE(hook->import_state == TIS_CLEARED); - rte_free_quick(new); - return; - } - *k = old->next; - table->rt_count--; - break; - } - k = &old->next; - before_old = old; - } + /* Local copy of the otherwise freed callback data */ + void (*stopped)(struct rt_import_request *) = hook->stopped; + struct rt_import_request *req = hook->req; - /* Save the last accessed position */ - rte **pos = k; + /* Finally uncouple from the table */ + RT_LOCKED(hook->table, tab) + { + req->hook = NULL; - if (!old) - before_old = NULL; + rt_trace(tab, D_EVENTS, "Hook %s stopped", req->name); + rem_node(&hook->n); + mb_free(hook); + rt_unlock_table(tab); + } - if (!old && !new) - { - stats->imp_withdraws_ignored++; - return; - } + /* And call the callback */ + CALL(stopped, req); +} - int new_ok = rte_is_ok(new); - int old_ok = rte_is_ok(old); +static void +rt_cleanup_done_all(struct rt_exporter *e, u64 end_seq) +{ + SKIP_BACK_DECLARE(struct rtable_private, tab, export_all, e); + ASSERT_DIE(DG_IS_LOCKED(tab->lock.rtable)); - struct channel_limit *l = &c->rx_limit; - if (l->action && !old && new && !c->in_table) - { - u32 all_routes = stats->imp_routes + stats->filt_routes; + if (~end_seq) + rt_trace(tab, D_STATES, "Export all cleanup done up to seq %lu", end_seq); + else + rt_trace(tab, D_STATES, "Export all cleanup complete"); - if (all_routes >= l->limit) - channel_notify_limit(c, l, PLD_RX, all_routes); + rt_check_cork_low(tab); - if (l->state == PLS_BLOCKED) + struct rt_import_hook *ih; node *x, *n; + uint cleared_counter = 0; + if (tab->wait_counter) + WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n) + if (ih->import_state == TIS_WAITING) + { + if (end_seq >= ih->flush_seq) { - /* In receive limit the situation is simple, old is NULL so - we just free new and exit like nothing happened */ - - stats->imp_updates_ignored++; - rte_trace_in(D_FILTERS, c, new, "ignored [limit]"); - rte_free_quick(new); - return; + ih->import_state = TIS_CLEARED; + tab->wait_counter--; + cleared_counter++; + + ih->cleanup_event = (event) { + .hook = rt_import_cleared, + .data = ih, + }; + ev_send_loop(ih->req->loop, &ih->cleanup_event); } - } - - l = &c->in_limit; - if (l->action && !old_ok && new_ok) - { - if (stats->imp_routes >= l->limit) - channel_notify_limit(c, l, PLD_IN, stats->imp_routes); + } - if (l->state == PLS_BLOCKED) - { - /* In import limit the situation is more complicated. We - shouldn't just drop the route, we should handle it like - it was filtered. 
We also have to continue the route - processing if old or new is non-NULL, but we should exit - if both are NULL as this case is probably assumed to be - already handled. */ + if (!EMPTY_LIST(tab->imports) && + (tab->gc_counter >= tab->config->gc_threshold)) + rt_kick_prune_timer(tab); +} - stats->imp_updates_ignored++; - rte_trace_in(D_FILTERS, c, new, "ignored [limit]"); +static void +rt_cleanup_done_best(struct rt_exporter *e, u64 end_seq) +{ + SKIP_BACK_DECLARE(struct rtable_private, tab, export_best, e); - if (c->in_keep_filtered) - new->flags |= REF_FILTERED; - else - { rte_free_quick(new); new = NULL; } + if (~end_seq) + rt_trace(tab, D_STATES, "Export best cleanup done up to seq %lu", end_seq); + else + { + rt_trace(tab, D_STATES, "Export best cleanup complete, flushing regular"); + rt_flush_best(tab, ~0ULL); + } +} - /* Note that old && !new could be possible when - c->in_keep_filtered changed in the recent past. */ +#define RT_EXPORT_BULK 1024 - if (!old && !new) - return; +static inline int +rte_validate(struct channel *ch, rte *e) +{ + int c; + const net_addr *n = e->net; + +#define IGNORING(pre, post) do { \ + log(L_WARN "%s.%s: Ignoring " pre " %N " post, ch->proto->name, ch->name, n); \ + return 0; \ + } while (0) + + if (!net_validate(n)) + IGNORING("bogus prefix", ""); + + /* FIXME: better handling different nettypes */ + c = !net_is_flow(n) ? + net_classify(n): (IADDR_HOST | SCOPE_UNIVERSE); + if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK)) + IGNORING("bogus route", ""); + + if (net_type_match(n, NB_DEST)) + { + eattr *nhea = ea_find(e->attrs, &ea_gen_nexthop); + int dest = nhea_dest(nhea); + + if (dest == RTD_NONE) + IGNORING("route", "with no destination"); + + if ((dest == RTD_UNICAST) && + !nexthop_is_sorted((struct nexthop_adata *) nhea->u.ptr)) + IGNORING("unsorted multipath route", ""); + } + else if (ea_find(e->attrs, &ea_gen_nexthop)) + IGNORING("route", "having a superfluous nexthop attribute"); + + return 1; +} + +int +rte_same(const rte *x, const rte *y) +{ + /* rte.flags / rte.pflags are not checked, as they are internal to rtable */ + return + (x == y) || ( + (x->attrs == y->attrs) || + ((!x->attrs->stored || !y->attrs->stored) && ea_same(x->attrs, y->attrs)) + ) && + x->src == y->src && + rte_is_filtered(x) == rte_is_filtered(y); +} + +static inline int rte_is_ok(const rte *e) { return e && !rte_is_filtered(e); } + +static void +rte_recalculate(struct rtable_private *table, struct rt_import_hook *c, struct netindex *i, net *net, rte *new, struct rte_src *src) +{ + struct rt_import_request *req = c->req; + struct rt_import_stats *stats = &c->stats; + struct rte_storage *old_best_stored = NET_BEST_ROUTE(table, net); + const rte *old_best = old_best_stored ? &old_best_stored->rte : NULL; + + /* If the new route is identical to the old one, we find the attributes in + * cache and clone these with no performance drop. OTOH, if we were to lookup + * the attributes, such a route definitely hasn't been anywhere yet, + * therefore it's definitely worth the time. 
*/ + struct rte_storage *new_stored = NULL; + if (new) + { + new_stored = rte_store(new, i, table); + new = RTES_WRITE(new_stored); + } + + struct rte_storage * _Atomic *last_ptr = NULL; + struct rte_storage *old_stored = NULL; + const rte *old = NULL; - new_ok = 0; - goto skip_stats1; + /* Find the original route from the same protocol */ + NET_WALK_ROUTES(table, net, ep, e) + { + last_ptr = &e->next; + if (e->rte.src == src) + if (old_stored) + bug("multiple routes in table with the same src"); + else + old_stored = e; + } + + if (old_stored) + { + old = &old_stored->rte; + + /* If there is the same route in the routing table but from + * a different sender, then there are two paths from the + * source protocol to this routing table through transparent + * pipes, which is not allowed. + * We log that and ignore the route. */ + if (old->sender != c) + { + if (!old->generation && !new->generation) + bug("Two protocols claim to author a route with the same rte_src in table %s: %N %s/%u:%u", + c->table->name, i->addr, old->src->owner->name, old->src->private_id, old->src->global_id); + + log_rl(&table->rl_pipe, L_ERR "Route source collision in table %s: %N %s/%u:%u", + c->table->name, i->addr, old->src->owner->name, old->src->private_id, old->src->global_id); } + + if (new && rte_same(old, &new_stored->rte)) + { + /* No changes, ignore the new route and refresh the old one */ + old_stored->stale_cycle = new->stale_cycle; + + if (!rte_is_filtered(new)) + { + stats->updates_ignored++; + rt_rte_trace_in(D_ROUTES, req, new, "ignored"); + } + + /* We need to free the already stored route here before returning */ + rte_free(new_stored, table); + return; + } + } + + if (!old && !new) + { + stats->withdraws_ignored++; + return; } + /* If rejected by import limit, we need to pretend there is no route */ + if (req->preimport && (req->preimport(req, new, old) == 0)) + { + rte_free(new_stored, table); + new_stored = NULL; + new = NULL; + } + + if (!new && !old) + { + stats->withdraws_ignored++; + return; + } + + int new_ok = rte_is_ok(new); + int old_ok = rte_is_ok(old); + if (new_ok) - stats->imp_updates_accepted++; + stats->updates_accepted++; else if (old_ok) - stats->imp_withdraws_accepted++; + stats->withdraws_accepted++; else - stats->imp_withdraws_ignored++; + stats->withdraws_ignored++; if (old_ok || new_ok) table->last_rt_change = current_time(); @@@ -2026,478 -1373,179 +2026,478 @@@ rte_update(struct channel *c, const net ASSERT(c->channel_state == CS_UP); - rte_update_lock(); + /* Storing prefilter routes as an explicit layer */ + if (new && (c->in_keep & RIK_PREFILTER)) + new->attrs = ea_lookup_tmp(new->attrs, 0, EALS_PREIMPORT); + +#if 0 + debug("%s.%s -(prefilter)-> %s: %N ", c->proto->name, c->name, c->table->name, n); + if (new) ea_dump(new->attrs); + else debug("withdraw"); + debug("\n"); +#endif + + const struct filter *filter = c->in_filter; + struct channel_import_stats *stats = &c->import_stats; + struct mpls_fec *fec = NULL; + if (new) { - /* Create a temporary table node */ - nn = alloca(sizeof(net) + n->length); - memset(nn, 0, sizeof(net) + n->length); - net_copy(nn->n.addr, n); + new->net = n; + new->sender = c->in_req.hook; - new->net = nn; - new->sender = c; + int fr; - stats->imp_updates_received++; - if (!rte_validate(new)) + stats->updates_received++; + if ((filter == FILTER_REJECT) || + ((fr = f_run(filter, new, 0)) > F_ACCEPT)) { - rte_trace_in(D_FILTERS, c, new, "invalid"); - stats->imp_updates_invalid++; - goto drop; + stats->updates_filtered++; + 
channel_rte_trace_in(D_FILTERS, c, new, "filtered out"); + + if (c->in_keep & RIK_REJECTED) + new->flags |= REF_FILTERED; + else + new = NULL; } - if (filter == FILTER_REJECT) - { - stats->imp_updates_filtered++; - rte_trace_in(D_FILTERS, c, new, "filtered out"); + if (new && c->proto->mpls_channel) + if (mpls_handle_rte(c->proto->mpls_channel, n, new, &fec) < 0) + { + channel_rte_trace_in(D_FILTERS, c, new, "invalid"); + stats->updates_invalid++; + new = NULL; + } + + if (new) + { + new->attrs = ea_lookup_tmp(new->attrs, + (c->in_keep & RIK_PREFILTER) ? BIT32_ALL(EALS_PREIMPORT) : 0, EALS_FILTERED); - if (! c->in_keep_filtered) - goto drop; + if (net_is_flow(n)) + rt_flowspec_resolve_rte(new, c); + else + rt_next_hop_resolve_rte(new); + } - /* new is a private copy, i could modify it */ - new->flags |= REF_FILTERED; - } - else if (filter) + if (new && !rte_validate(c, new)) { - int fr = f_run(filter, &new, rte_update_pool, 0); - if (fr > F_ACCEPT) - { - stats->imp_updates_filtered++; - rte_trace_in(D_FILTERS, c, new, "filtered out"); + channel_rte_trace_in(D_FILTERS, c, new, "invalid"); + stats->updates_invalid++; + new = NULL; + } + } + else + stats->withdraws_received++; - if (! c->in_keep_filtered) - goto drop; + rte_import(&c->in_req, n, new, src); - new->flags |= REF_FILTERED; - } - } + if (fec) + { + mpls_unlock_fec(fec); + DBGL( "Unlock FEC %p (rte_update %N)", fec, n); + } +} - if (p->mpls_map) - { - if (mpls_handle_rte(p->mpls_map, n, new, rte_update_pool, &fec) < 0) - { - rte_trace_in(D_FILTERS, c, new, "invalid"); - stats->imp_updates_invalid++; - goto drop; - } - } +void +rte_import(struct rt_import_request *req, const net_addr *n, rte *new, struct rte_src *src) +{ + struct rt_import_hook *hook = req->hook; + if (!hook) + { + log(L_WARN "%s: Called rte_import without import hook", req->name); + return; + } - if (!rta_is_cached(new->attrs)) /* Need to copy attributes */ - new->attrs = rta_lookup(new->attrs); - new->flags |= REF_COW; + RT_LOCKED(hook->table, tab) + { + u32 bs = atomic_load_explicit(&tab->routes_block_size, memory_order_acquire); - /* Use the actual struct network, not the dummy one */ - nn = net_get(c->table, n); - new->net = nn; - } - else + struct netindex *i; + net *routes = atomic_load_explicit(&tab->routes, memory_order_acquire); + net *nn; + if (new) { - stats->imp_withdraws_received++; + /* An update */ + /* Set auxiliary values */ + new->stale_cycle = hook->stale_set; + new->sender = hook; - if (!(nn = net_find(c->table, n)) || !src) - { - stats->imp_withdraws_ignored++; - rte_update_unlock(); - return; - } - } + /* Allocate the key structure */ + i = net_get_index(tab->netindex, n); + new->net = i->addr; - recalc: - /* And recalculate the best route */ - rte_recalculate(c, nn, new, src); + /* Block size update */ + u32 nbs = bs; + while (i->index >= nbs) + nbs *= 2; - if (p->mpls_map) - mpls_handle_rte_cleanup(p->mpls_map, &fec); + if (nbs > bs) + { + net *nb = mb_alloc(tab->rp, nbs * sizeof *nb); + memcpy(&nb[0], routes, bs * sizeof *nb); + memset(&nb[bs], 0, (nbs - bs) * sizeof *nb); + ASSERT_DIE(atomic_compare_exchange_strong_explicit( + &tab->routes, &routes, nb, + memory_order_acq_rel, memory_order_relaxed)); + ASSERT_DIE(atomic_compare_exchange_strong_explicit( + &tab->routes_block_size, &bs, nbs, + memory_order_acq_rel, memory_order_relaxed)); + ASSERT_DIE(atomic_compare_exchange_strong_explicit( + &tab->export_all.max_feed_index, &bs, nbs, + memory_order_acq_rel, memory_order_relaxed)); + ASSERT_DIE(atomic_compare_exchange_strong_explicit( + 
&tab->export_best.max_feed_index, &bs, nbs, + memory_order_acq_rel, memory_order_relaxed)); + + synchronize_rcu(); + mb_free(routes); + + routes = nb; + } - rte_update_unlock(); - return; + /* Update table tries */ + struct f_trie *trie = atomic_load_explicit(&tab->trie, memory_order_relaxed); + if (trie) + trie_add_prefix(trie, i->addr, i->addr->pxlen, i->addr->pxlen); + + if (tab->trie_new) + trie_add_prefix(tab->trie_new, i->addr, i->addr->pxlen, i->addr->pxlen); + } + else if ((i = net_find_index(tab->netindex, n)) && (i->index < bs)) + /* Found an block where we can withdraw from */ + ; + else + { + /* No route for this net is present at all. Ignore right now. */ + req->hook->stats.withdraws_ignored++; + if (req->trace_routes & D_ROUTES) + log(L_TRACE "%s > ignored %N withdraw", req->name, n); + return; + } - drop: - rte_free(new); - new = NULL; - if (nn = net_find(c->table, n)) - goto recalc; + /* Resolve the net structure */ + nn = &routes[i->index]; - rte_update_unlock(); + /* Recalculate the best route. */ + rte_recalculate(tab, hook, i, nn, new, src); + } } -/* Independent call to rte_announce(), used from next hop - recalculation, outside of rte_update(). new must be non-NULL */ -static inline void -rte_announce_i(rtable *tab, uint type, net *net, rte *new, rte *old, - rte *new_best, rte *old_best) +/* + * Feeding + */ + +static net * +rt_net_feed_get_net(struct rtable_reading *tr, uint index) { - rte_update_lock(); - rte_announce(tab, type, net, new, old, new_best, old_best); - rte_update_unlock(); + /* Get the route block from the table */ + net *routes = atomic_load_explicit(&tr->t->routes, memory_order_acquire); + u32 bs = atomic_load_explicit(&tr->t->routes_block_size, memory_order_acquire); + + /* Nothing to actually feed */ + if (index >= bs) + return NULL; + + /* We have a net to feed! */ + return &routes[index]; } -static inline void -rte_discard(rte *old) /* Non-filtered route deletion, used during garbage collection */ +static const struct rt_pending_export * +rt_net_feed_validate_first( + struct rtable_reading *tr, + const struct rt_pending_export *first_in_net, + const struct rt_pending_export *last_in_net, + const struct rt_pending_export *first) { - rte_update_lock(); - rte_recalculate(old->sender, old->net, NULL, old->src); - rte_update_unlock(); + /* Inconsistent input */ + if (!first_in_net != !last_in_net) + RT_READ_RETRY(tr); + + if (!first) + return first_in_net; + + /* Export item validity check: we must find it between first_in_net and last_in_net */ + const struct rt_pending_export *rpe = first_in_net; + while (rpe) + if (rpe == first) + return first; + else if (rpe == last_in_net) + /* Got to the end without finding the beginning */ + break; + else + rpe = atomic_load_explicit(&rpe->next, memory_order_acquire); + + /* Not found, inconsistent export, retry */ + RT_READ_RETRY(tr); } -/* Modify existing route by protocol hook, used for long-lived graceful restart */ -static inline void -rte_modify(rte *old) +static struct rt_export_feed * - rt_net_feed_index(struct rtable_reading *tr, net *n, _Bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_pending_export *first) ++rt_net_feed_index(struct rtable_reading *tr, net *n, bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_pending_export *first) { - rte_update_lock(); + /* Get the feed itself. It may change under our hands tho. 
*/ + struct rt_pending_export *first_in_net, *last_in_net; + first_in_net = atomic_load_explicit(&n->all.first, memory_order_acquire); + last_in_net = atomic_load_explicit(&n->all.last, memory_order_acquire); + + first = rt_net_feed_validate_first(tr, first_in_net, last_in_net, first); + + /* Count the elements */ + uint rcnt = rte_feed_count(tr, n); + uint ecnt = 0; + uint ocnt = 0; + for (const struct rt_pending_export *rpe = first; rpe; + rpe = atomic_load_explicit(&rpe->next, memory_order_acquire)) + { + ecnt++; + if (rpe->it.old) + ocnt++; + } + + if (ecnt) { + const net_addr *a = (first->it.new ?: first->it.old)->net; + if (prefilter && !prefilter(f, a)) + return NULL; + } + + struct rt_export_feed *feed = NULL; - rte *new = old->sender->proto->rte_modify(old, rte_update_pool); - if (new != old) + if (rcnt || ocnt || ecnt) { - if (new) + if (!ecnt && prefilter && !prefilter(f, NET_READ_BEST_ROUTE(tr, n)->rte.net)) + return NULL; + + feed = rt_alloc_feed(rcnt+ocnt, ecnt); + + if (rcnt) + rte_feed_obtain_copy(tr, n, feed->block, rcnt); + + if (ecnt) { - if (!rta_is_cached(new->attrs)) - new->attrs = rta_lookup(new->attrs); - new->flags = (old->flags & ~REF_MODIFY) | REF_COW; + uint e = 0; + uint rpos = rcnt; + for (const struct rt_pending_export *rpe = first; rpe; + rpe = atomic_load_explicit(&rpe->next, memory_order_acquire)) + if (e >= ecnt) + RT_READ_RETRY(tr); + else + { + feed->exports[e++] = rpe->it.seq; + + /* Copy also obsolete routes */ + if (rpe->it.old) + { + ASSERT_DIE(rpos < rcnt + ocnt); + feed->block[rpos++] = *rpe->it.old; + ea_free_later(ea_ref(rpe->it.old->attrs)); + } + } + + ASSERT_DIE(e == ecnt); } - rte_recalculate(old->sender, old->net, new, old->src); + feed->ni = NET_TO_INDEX(feed->block[0].net); } - rte_update_unlock(); + /* Check that it indeed didn't change and the last export is still the same. */ + if ( + (first_in_net != atomic_load_explicit(&n->all.first, memory_order_acquire)) + || (last_in_net != atomic_load_explicit(&n->all.last, memory_order_acquire))) + RT_READ_RETRY(tr); + + return feed; } -/* Check rtable for best route to given net whether it would be exported do p */ -int -rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter) +static struct rt_export_feed * - rt_net_feed_internal(struct rtable_reading *tr, u32 index, _Bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_pending_export *first) ++rt_net_feed_internal(struct rtable_reading *tr, u32 index, bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_pending_export *first) { - struct proto *p = c->proto; - net *n = net_find(t, a); - rte *rt = n ? n->routes : NULL; + net *n = rt_net_feed_get_net(tr, index); + if (!n) + return &rt_feed_index_out_of_range; - if (!rte_is_valid(rt)) - return 0; + return rt_net_feed_index(tr, n, prefilter, f, first); +} - rte_update_lock(); +struct rt_export_feed * +rt_net_feed(rtable *t, const net_addr *a, const struct rt_pending_export *first) +{ + RT_READ(t, tr); + const struct netindex *ni = net_find_index(tr->t->netindex, a); + return ni ? rt_net_feed_internal(tr, ni->index, NULL, NULL, first) : NULL; +} - /* Rest is stripped down export_filter() */ - int v = p->preexport ? 
p->preexport(c, rt) : 0; - if (v == RIC_PROCESS) - v = (f_run(filter, &rt, rte_update_pool, FF_SILENT) <= F_ACCEPT); +static struct rt_export_feed * - rt_feed_net_all(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, _Bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first) ++rt_feed_net_all(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first) +{ + RT_READ_ANCHORED(SKIP_BACK(rtable, export_all, e), tr, u); + return rt_net_feed_internal(tr, index, prefilter, f, SKIP_BACK(const struct rt_pending_export, it, _first)); +} - /* Discard temporary rte */ - if (rt != n->routes) - rte_free(rt); +rte +rt_net_best(rtable *t, const net_addr *a) +{ + rte rt = {}; - rte_update_unlock(); + RT_READ(t, tr); - return v > 0; + struct netindex *i = net_find_index(t->netindex, a); + net *n = i ? net_find(tr, i) : NULL; + if (!n) + return rt; + + struct rte_storage *e = NET_READ_BEST_ROUTE(tr, n); + if (!e || !rte_is_valid(&e->rte)) + return rt; + + ASSERT_DIE(e->rte.net == i->addr); + ea_free_later(ea_ref(e->rte.attrs)); + return RTE_COPY(e); } +static struct rt_export_feed * - rt_feed_net_best(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, _Bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first) ++rt_feed_net_best(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first) +{ + SKIP_BACK_DECLARE(rtable, t, export_best, e); + SKIP_BACK_DECLARE(const struct rt_pending_export, first, it, _first); + + RT_READ_ANCHORED(t, tr, u); + + net *n = rt_net_feed_get_net(tr, index); + if (!n) + return &rt_feed_index_out_of_range; + /* No more to feed, we are fed up! */ + + const struct rt_pending_export *first_in_net, *last_in_net; + first_in_net = atomic_load_explicit(&n->best.first, memory_order_acquire); + last_in_net = atomic_load_explicit(&n->best.last, memory_order_acquire); + first = rt_net_feed_validate_first(tr, first_in_net, last_in_net, first); + + uint ecnt = 0; + for (const struct rt_pending_export *rpe = first; rpe; + rpe = atomic_load_explicit(&rpe->next, memory_order_acquire)) + ecnt++; + + if (ecnt) { + const net_addr *a = (first->it.new ?: first->it.old)->net; + if (prefilter && !prefilter(f, a)) + return NULL; + } -/** - * rt_refresh_begin - start a refresh cycle - * @t: related routing table - * @c related channel - * - * This function starts a refresh cycle for given routing table and announce - * hook. The refresh cycle is a sequence where the protocol sends all its valid - * routes to the routing table (by rte_update()). After that, all protocol - * routes (more precisely routes with @c as @sender) not sent during the - * refresh cycle but still in the table from the past are pruned. 
This is + struct rte_storage *best = NET_READ_BEST_ROUTE(tr, n); + + if (!ecnt && (!best || prefilter && !prefilter(f, best->rte.net))) + return NULL; + + struct rt_export_feed *feed = rt_alloc_feed(!!best, ecnt); + if (best) + { + feed->block[0] = best->rte; + feed->ni = NET_TO_INDEX(best->rte.net); + } + else + feed->ni = NET_TO_INDEX((first->it.new ?: first->it.old)->net); + + if (ecnt) + { + uint e = 0; + for (const struct rt_pending_export *rpe = first; rpe; + rpe = atomic_load_explicit(&rpe->next, memory_order_acquire)) + if (e >= ecnt) + RT_READ_RETRY(tr); + else + feed->exports[e++] = rpe->it.seq; + + ASSERT_DIE(e == ecnt); + } + + /* Check that it indeed didn't change and the last export is still the same. */ + if ( + (first_in_net != atomic_load_explicit(&n->best.first, memory_order_acquire)) + || (last_in_net != atomic_load_explicit(&n->best.last, memory_order_acquire))) + RT_READ_RETRY(tr); + + /* And we're finally done */ + return feed; +} + + +/* Check rtable for best route to given net whether it would be exported do p */ +int +rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter) +{ + rte rt = rt_net_best(t, a); + + int v = c->proto->preexport ? c->proto->preexport(c, &rt) : 0; + if (v == RIC_PROCESS) + v = (f_run(filter, &rt, FF_SILENT) <= F_ACCEPT); + + return v > 0; +} + +static inline void +rt_set_import_state(struct rt_import_hook *hook, u8 state) +{ + hook->last_state_change = current_time(); + hook->import_state = state; + + CALL(hook->req->log_state_change, hook->req, state); +} + +void +rt_request_import(rtable *t, struct rt_import_request *req) +{ + RT_LOCKED(t, tab) + { + rt_lock_table(tab); + + struct rt_import_hook *hook = req->hook = mb_allocz(tab->rp, sizeof(struct rt_import_hook)); + + DBG("Lock table %s for import %p req=%p uc=%u\n", tab->name, hook, req, tab->use_count); + + hook->req = req; + hook->table = t; + + rt_set_import_state(hook, TIS_UP); + add_tail(&tab->imports, &hook->n); + } +} + +void +rt_stop_import(struct rt_import_request *req, void (*stopped)(struct rt_import_request *)) +{ + ASSERT_DIE(req->hook); + struct rt_import_hook *hook = req->hook; + + RT_LOCKED(hook->table, tab) + { + rt_set_import_state(hook, TIS_STOP); + hook->stopped = stopped; + + rt_refresh_trace(tab, hook, "stop import"); + + /* Cancel table rr_counter */ + if (hook->stale_set != hook->stale_pruned) + tab->rr_counter -= ((int) hook->stale_set - (int) hook->stale_pruned); + + tab->rr_counter++; + + hook->stale_set = hook->stale_pruned = hook->stale_pruning = hook->stale_valid = 0; + + rt_schedule_prune(tab); + } +} + + +/** + * rt_refresh_begin - start a refresh cycle + * @t: related routing table + * @c related channel + * + * This function starts a refresh cycle for given routing table and announce + * hook. The refresh cycle is a sequence where the protocol sends all its valid + * routes to the routing table (by rte_update()). After that, all protocol + * routes (more precisely routes with @c as @sender) not sent during the + * refresh cycle but still in the table from the past are pruned. This is * implemented by marking all related routes as stale by REF_STALE flag in * rt_refresh_begin(), then marking all related stale routes with REF_DISCARD * flag in rt_refresh_end() and then removing such routes in the prune loop. 
@@@ -2907,31 -1857,20 +2907,31 @@@ rt_flowspec_link(rtable *src_pub, rtabl void rt_flowspec_unlink(rtable *src, rtable *dst) { - struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst); + birdloop_enter(dst->loop); - _Bool unlock_dst = 0; - ASSERT(ln && (ln->uc > 0)); ++ bool unlock_dst = 0; - ln->uc--; - - if (!ln->uc) + struct rt_flowspec_link *ln; + RT_LOCKED(src, t) { - rem_node(&ln->n); - mb_free(ln); + ln = rt_flowspec_find_link(t, dst); - rt_unlock_table(src); - rt_unlock_table(dst); + ASSERT(ln && (ln->uc > 0)); + + if (!--ln->uc) + { + rt_flowspec_link_rem_node(&t->flowspec_links, ln); + rtex_export_unsubscribe(&ln->req); + ev_postpone(&ln->event); + mb_free(ln); + unlock_dst = 1; + } } + + if (unlock_dst) + rt_unlock_table(dst); + + birdloop_leave(dst->loop); } static void @@@ -3252,86 -2004,10 +3252,86 @@@ voi rt_init(void) { rta_init(); - rt_table_pool = rp_new(&root_pool, "Routing tables"); - rte_update_pool = lp_new_default(rt_table_pool); - rte_slab = sl_new(rt_table_pool, sizeof(rte)); + rt_table_pool = rp_new(&root_pool, the_bird_domain.the_bird, "Routing tables"); init_list(&routing_tables); + init_list(&deleted_routing_tables); + ev_init_list(&rt_cork.queue, &main_birdloop, "Route cork release"); + rt_cork.dom = DOMAIN_NEW_RCU_SYNC(resource); + idm_init(&rtable_idm, rt_table_pool, 256); + + ea_register_init(&ea_roa_aggregated); +} + - static _Bool ++static bool +rt_prune_net(struct rtable_private *tab, struct network *n) +{ + NET_WALK_ROUTES(tab, n, ep, e) + { + ASSERT_DIE(!(e->flags & REF_OBSOLETE)); + struct rt_import_hook *s = e->rte.sender; + - _Bool stale = (s->import_state == TIS_FLUSHING); ++ bool stale = (s->import_state == TIS_FLUSHING); + + if (!stale) + { + + /* + * The range of 0..256 is split by s->stale_* like this: + * + * pruned pruning valid set + * | | | | + * 0 v v v v 256 + * |...........................+++++++++++........| + * + * We want to drop everything outside the marked range, thus + * (e->rte.stale_cycle < s->stale_valid) || + * (e->rte.stale_cycle > s->stale_set)) + * looks right. + * + * But the pointers may wrap around, and in the following situation, all the routes get pruned: + * + * set pruned pruning valid + * | | | | + * 0 v v v v 256 + * |++++++..................................++++++| + * + * In that case, we want + * (e->rte.stale_cycle > s->stale_valid) || + * (e->rte.stale_cycle < s->stale_set)) + * + * Full logic table: + * + * permutation | result | (S < V) + (S < SC) + (SC < V) + * -----------------+----------+--------------------------------- + * SC < V <= S | prune | 0 + 0 + 1 = 1 + * S < SC < V | prune | 1 + 1 + 1 = 3 + * V <= S < SC | prune | 0 + 1 + 0 = 1 + * SC <= S < V | keep | 1 + 0 + 1 = 2 + * V <= SC <= S | keep | 0 + 0 + 0 = 0 + * S < V <= SC | keep | 1 + 1 + 0 = 2 + * + * Now the following code hopefully makes sense. + */ + + int sv = (s->stale_set < s->stale_valid); + int ssc = (s->stale_set < e->rte.stale_cycle); + int scv = (e->rte.stale_cycle < s->stale_valid); + stale = (sv + ssc + scv) & 1; + } + + /* By the C standard, either the importer is flushing and stale_perm is 1, + * or by the table above, stale_perm is between 0 and 3, where even values + * say "keep" and odd values say "prune". 
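+   * For instance (values picked purely for illustration): in the wrapped
+   * case with stale_valid = 250 and stale_set = 3, a route carrying
+   * stale_cycle = 255 gives sv = 1, ssc = 1, scv = 0; the sum 2 is even,
+   * so the route is kept (permutation "S < V <= SC"). A route carrying
+   * stale_cycle = 100 gives sv = 1, ssc = 1, scv = 1; the sum 3 is odd,
+   * so the route is pruned (permutation "S < SC < V").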
*/ + + if (stale) + { + /* Announce withdrawal */ + struct netindex *i = RTE_GET_NETINDEX(&e->rte); + rte_recalculate(tab, e->rte.sender, i, n, NULL, e->rte.src); + return 1; + } + } + return 0; } @@@ -3925,234 -2539,78 +3925,234 @@@ rt_flowspec_update_rte(struct rtable_pr #endif } +static inline void +rt_flowspec_resolve_rte(rte *r, struct channel *c) +{ +#ifdef CONFIG_BGP + enum flowspec_valid valid, old = rt_get_flowspec_valid(r); + struct bgp_channel *bc = (struct bgp_channel *) c; -static inline int -rt_next_hop_update_net(rtable *tab, net *n) + if ( (rt_get_source_attr(r) == RTS_BGP) + && (c->class == &channel_bgp) + && (bc->base_table)) + { + SKIP_BACK_DECLARE(struct bgp_proto, p, p, bc->c.proto); + RT_LOCKED(c->in_req.hook->table, tab) + valid = rt_flowspec_check( + bc->base_table, tab, + r->net, r->attrs, p->is_interior); + } + else + valid = FLOWSPEC_UNKNOWN; + + if (valid == old) + return; + + if (valid == FLOWSPEC_UNKNOWN) + ea_unset_attr(&r->attrs, 0, &ea_gen_flowspec_valid); + else + ea_set_attr_u32(&r->attrs, &ea_gen_flowspec_valid, 0, valid); +#endif +} + +static inline void +rt_next_hop_update_net(struct rtable_private *tab, struct netindex *ni, net *n) { - rte **k, *e, *new, *old_best, **new_best; - int count = 0; - int free_old_best = 0; + uint count = 0; + int is_flow = net_val_match(tab->addr_type, NB_FLOW); - old_best = n->routes; + struct rte_storage *old_best = NET_BEST_ROUTE(tab, n); if (!old_best) - return 0; + return; + + NET_WALK_ROUTES(tab, n, ep, e) + count++; + + if (!count) + return; + + struct rte_multiupdate { + struct rte_storage *old, *new_stored; + rte new; + } *updates = tmp_allocz(sizeof(struct rte_multiupdate) * (count+1)); + + uint pos = 0; + NET_WALK_ROUTES(tab, n, ep, e) + updates[pos++].old = e; + + uint mod = 0; + if (is_flow) + for (uint i = 0; i < pos; i++) + mod += rt_flowspec_update_rte(tab, &updates[i].old->rte, &updates[i].new); + + else + for (uint i = 0; i < pos; i++) + mod += rt_next_hop_update_rte(&updates[i].old->rte, &updates[i].new); + + if (!mod) + return; + + /* We add a spinlock sentinel to the beginning */ + struct rte_storage local_sentinel = { + .flags = REF_OBSOLETE, + .next = old_best, + }; + atomic_store_explicit(&n->routes, &local_sentinel, memory_order_release); - for (k = &n->routes; e = *k; k = &e->next) + /* Now we mark all the old routes obsolete */ + for (uint i = 0; i < pos; i++) + if (updates[i].new.attrs) + updates[i].old->flags |= REF_OBSOLETE; + + /* Wait for readers */ + synchronize_rcu(); + + /* And now we go backwards to keep the list properly linked */ + struct rte_storage *next = NULL; + for (int i = pos - 1; i >= 0; i--) { - if (!net_is_flow(n->n.addr)) - new = rt_next_hop_update_rte(tab, e); + struct rte_storage *this; + if (updates[i].new.attrs) + { + rte *new = &updates[i].new; + new->lastmod = current_time(); + new->id = hmap_first_zero(&tab->id_map); + hmap_set(&tab->id_map, new->id); + this = updates[i].new_stored = rte_store(new, ni, tab); + } else - new = rt_flowspec_update_rte(tab, e); + this = updates[i].old; - if (new) - { - *k = new; + atomic_store_explicit(&this->next, next, memory_order_release); + next = this; + } - rte_trace_in(D_ROUTES, new->sender, new, "updated"); - rte_announce_i(tab, RA_ANY, n, new, e, NULL, NULL); + /* Add behind the sentinel */ + atomic_store_explicit(&local_sentinel.next, next, memory_order_release); - /* Call a pre-comparison hook */ + /* Call the pre-comparison hooks */ + for (uint i = 0; i < pos; i++) + if (updates[i].new_stored) + { /* Not really an efficient 
way to compute this */ - if (e->src->proto->rte_recalculate) - e->src->proto->rte_recalculate(tab, n, new, e, NULL); + if (updates[i].old->rte.src->owner->rte_recalculate) + updates[i].old->rte.src->owner->rte_recalculate(tab, n, updates[i].new_stored, updates[i].old, old_best); + } - if (e != old_best) - rte_free_quick(e); - else /* Freeing of the old best rte is postponed */ - free_old_best = 1; + /* Find the new best route */ + uint best_pos = 0; + struct rte_storage *new_best = updates[0].new_stored ?: updates[0].old; - e = new; - count++; - } + for (uint i = 1; i < pos; i++) + { + struct rte_storage *s = updates[i].new_stored ?: updates[i].old; + if (rte_better(&s->rte, &new_best->rte)) + { + best_pos = i; + new_best = s; + } } - if (!count) - return 0; + /* Relink the new best route to the first position */ + struct rte_storage * _Atomic *best_prev; + if (best_pos) + best_prev = &(updates[best_pos-1].new_stored ?: updates[best_pos-1].old)->next; + else + best_prev = &local_sentinel.next; - /* Find the new best route */ - new_best = NULL; - for (k = &n->routes; e = *k; k = &e->next) + /* Unlink from the original place */ + atomic_store_explicit(best_prev, + atomic_load_explicit(&new_best->next, memory_order_relaxed), + memory_order_release); + + /* Link out */ + atomic_store_explicit(&new_best->next, + atomic_load_explicit(&local_sentinel.next, memory_order_relaxed), + memory_order_release); + + /* Now we have to announce the routes the right way, to not cause any + * strange problems with consistency. */ + + ASSERT_DIE(updates[0].old == old_best); + + /* Find new best route original position */ + uint nbpos = ~0; + for (uint i=0; iroutes) + struct rt_pending_export *best_rpe = + (new_best != old_best) ? + rte_announce_to(&tab->export_best, &n->best, &new_best->rte, &old_best->rte) + : NULL; + + uint total = 0; + u64 last_seq = 0; + + /* Announce the changes */ + for (uint i=0; iexport_all, &n->all, + &updates[i].new_stored->rte, &updates[i].old->rte); + + ASSERT_DIE(this_rpe); - _Bool nb = (new_best->rte.src == updates[i].new.src), ob = (i == 0); ++ bool nb = (new_best->rte.src == updates[i].new.src), ob = (i == 0); + char info[96]; + char best_indicator[2][2] = { { ' ', '+' }, { '-', '=' } }; + bsnprintf(info, sizeof info, "autoupdated [%cbest]", best_indicator[ob][nb]); + + rt_rte_trace_in(D_ROUTES, updates[i].new.sender->req, &updates[i].new, info); + + /* Double announcement of this specific route */ + if (ob && best_rpe) { - *new_best = new->next; - new->next = n->routes; - n->routes = new; + ASSERT_DIE(best_rpe->it.old == &updates[i].old->rte); + ASSERT_DIE(!best_rpe->seq_all); + best_rpe->seq_all = this_rpe->it.seq; } + else + last_seq = this_rpe->it.seq; - /* Announce the new best route */ - if (new != old_best) - rte_trace_in(D_ROUTES, new->sender, new, "updated [best]"); + total++; + } - /* Propagate changes */ - rte_announce_i(tab, RA_UNDEF, n, NULL, NULL, n->routes, old_best); + if (best_rpe && !best_rpe->seq_all) + { + ASSERT_DIE(!updates[0].new_stored); + best_rpe->seq_all = last_seq; + } - if (free_old_best) - rte_free_quick(old_best); + /* Now we can finally release the changes back into the table */ + atomic_store_explicit(&n->routes, new_best, memory_order_release); - return count; + return; +} + +static void +rt_nhu_uncork(callback *cb) +{ + RT_LOCKED(SKIP_BACK(rtable, priv.nhu_uncork.cb, cb), tab) + { + ASSERT_DIE(tab->nhu_corked); + ASSERT_DIE(tab->nhu_state == 0); + + /* Reset the state */ + tab->nhu_state = tab->nhu_corked; + tab->nhu_corked = 0; + 
rt_trace(tab, D_STATES, "Next hop updater uncorked"); + + ev_send_loop(tab->loop, tab->nhu_event); + rt_unlock_table(tab); + } } static void @@@ -4454,110 -3070,123 +4454,110 @@@ rt_check_cork_high(struct rtable_privat } } -void -rt_prune_sync(rtable *t, int all) + +static int +rt_reconfigure(struct rtable_private *tab, struct rtable_config *new, struct rtable_config *old) { - struct fib_iterator fit; + if ((new->addr_type != old->addr_type) || + (new->sorted != old->sorted) || + (new->trie_used != old->trie_used)) + return 0; - FIB_ITERATE_INIT(&fit, &t->fib); + ASSERT_DIE(new->master.setup == old->master.setup); + ASSERT_DIE(new->master.stop == old->master.stop); -again: - FIB_ITERATE_START(&t->fib, &fit, net, n) - { - rte *e, **ee = &n->routes; + DBG("\t%s: same\n", new->name); + new->table = RT_PUB(tab); + tab->name = new->name; + tab->config = new; + tab->debug = new->debug; + tab->export_all.trace_routes = tab->export_best.trace_routes = new->debug; - while (e = *ee) - { - if (all || (e->flags & (REF_STALE | REF_DISCARD))) - { - *ee = e->next; - rte_free_quick(e); - t->rt_count--; - } - else - ee = &e->next; - } + if (tab->hostcache) + tab->hostcache->req.trace_routes = new->debug; - if (all || !n->routes) - { - FIB_ITERATE_PUT(&fit); - fib_delete(&t->fib, n); - goto again; - } - } - FIB_ITERATE_END; -} + WALK_TLIST(rt_flowspec_link, ln, &tab->flowspec_links) + ln->req.trace_routes = new->debug; + tab->cork_threshold = new->cork_threshold; -/* - * Export table - */ + if (new->cork_threshold.high != old->cork_threshold.high) + rt_check_cork_high(tab); -int -rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, int refeed) -{ - struct rtable *tab = c->out_table; - struct rte_src *src; - rte *old, **pos; - net *net; + if (new->cork_threshold.low != old->cork_threshold.low) + rt_check_cork_low(tab); - if (new) - { - net = net_get(tab, n); - src = new->src; + if (tab->export_digest && ( + (new->digest_settle.min != tab->export_digest->settle.cf.min) + || (new->digest_settle.max != tab->export_digest->settle.cf.max))) + tab->export_digest->settle.cf = new->digest_settle; - if (!rta_is_cached(new->attrs)) - new->attrs = rta_lookup(new->attrs); - } - else - { - net = net_find(tab, n); - src = old0->src; + return 1; +} - if (!net) - goto drop_withdraw; - } +static struct rtable_config * +rt_find_table_config(struct config *cf, char *name) +{ + struct symbol *sym = cf_find_symbol(cf, name); + return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL; +} + +/** + * rt_commit - commit new routing table configuration + * @new: new configuration + * @old: original configuration or %NULL if it's boot time config + * + * Scan differences between @old and @new configuration and modify + * the routing tables according to these changes. If @new defines a + * previously unknown table, create it, if it omits a table existing + * in @old, schedule it for deletion (it gets deleted when all protocols + * disconnect from it by calling rt_unlock_table()), if it exists + * in both configurations, leave it unchanged. 
+ */ +void +rt_commit(struct config *new, struct config *old) +{ + struct rtable_config *o, *r; - /* Find the old rte */ - for (pos = &net->routes; old = *pos; pos = &old->next) - if ((c->ra_mode != RA_ANY) || (old->src == src)) + DBG("rt_commit:\n"); + + if (old) { - if (new && rte_same(old, new)) + WALK_LIST(o, old->tables) { - _Bool ok; - /* REF_STALE / REF_DISCARD not used in export table */ - /* - if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY)) ++ bool ok; + RT_LOCKED(o->table, tab) { - old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY); - return 1; + r = OBSREF_GET(tab->deleted) ? NULL : rt_find_table_config(new, o->name); + ok = r && !new->shutdown && rt_reconfigure(tab, r, o); } - */ - goto drop_update; - } + if (ok) + continue; - /* Remove the old rte */ - *pos = old->next; - rte_free_quick(old); - tab->rt_count--; + birdloop_enter(o->table->loop); + RT_LOCKED(o->table, tab) + { + DBG("\t%s: deleted\n", o->name); + OBSREF_SET(tab->deleted, old); + rt_check_cork_low(tab); + rt_lock_table(tab); + rt_unlock_table(tab); + } - break; + CALL(o->table->config->master.stop, o->table); + birdloop_leave(o->table->loop); + } } - if (!new) - { - if (!old) - goto drop_withdraw; - - if (!net->routes) - fib_delete(&tab->fib, net); - - return 1; - } - - /* Insert the new rte */ - rte *e = rte_do_cow(new); - e->flags |= REF_COW; - e->net = net; - e->sender = c; - e->lastmod = current_time(); - e->next = *pos; - *pos = e; - tab->rt_count++; - return 1; - -drop_update: - return refeed; - -drop_withdraw: - return 0; + WALK_LIST(r, new->tables) + if (!r->table) + { + r->table = rt_setup(rt_table_pool, r); + DBG("\t%s: created\n", r->name); + add_tail(&routing_tables, &r->table->n); + } + DBG("\tdone\n"); } diff --cc proto/bgp/attrs.c index 7ed898d23,85646647f..d8d9d3cf3 --- a/proto/bgp/attrs.c +++ b/proto/bgp/attrs.c @@@ -1803,242 -1716,15 +1803,242 @@@ bgp_free_prefix(struct bgp_ptx_private } void -bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px) +bgp_done_prefix(struct bgp_ptx_private *c, struct bgp_prefix *px, struct bgp_bucket *buck) { + /* BMP hack */ + if (buck->bmp) + return; + + /* Cleanup: We're called from bucket senders. 
*/ + ASSERT_DIE(px->cur == buck); rem_node(&px->buck_node); - HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px); - if (c->prefix_slab) - sl_free(px); - else - mb_free(px); + /* We may want to store the updates */ + if (c->c->tx_keep) + { + /* Nothing to be sent right now */ + px->cur = NULL; + + /* Unref the previous sent version */ + if (px->last) + if (!--px->last->px_uc) + bgp_done_bucket(c, px->last); + + /* Ref the current sent version */ + if (!IS_WITHDRAW_BUCKET(buck)) + { + px->last = buck; + px->last->px_uc++; + return; + } + + /* Prefixes belonging to the withdraw bucket are freed always */ + } + + bgp_free_prefix(c, px); +} + +void +bgp_tx_resend(struct bgp_proto *p, struct bgp_channel *bc) +{ + BGP_PTX_LOCK(bc->tx, c); + + ASSERT_DIE(bc->tx_keep); + uint seen = 0; + + HASH_WALK(c->prefix_hash, next, px) + { + if (!px->cur) + { + ASSERT_DIE(px->last); + struct bgp_bucket *last = px->last; + + /* Remove the last reference, we wanna resend the route */ + px->last->px_uc--; + px->last = NULL; + + /* And send it once again */ + seen += bgp_update_prefix(c, px, last); + } + } + HASH_WALK_END; + + if (bc->c.debug & D_EVENTS) + log(L_TRACE "%s.%s: TX resending %u routes", + bc->c.proto->name, bc->c.name, seen); + + if (seen) + bgp_schedule_packet(p->conn, bc, PKT_UPDATE); +} + +/* + * Prefix hash table exporter + */ + +static void +bgp_out_item_done(struct lfjour *j, struct lfjour_item *i) +{} + +static struct rt_export_feed * - bgp_out_feed_net(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, _Bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first) ++bgp_out_feed_net(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first) +{ + ASSERT_DIE(u == NULL); + SKIP_BACK_DECLARE(struct bgp_ptx_private, c, exporter, e); + ASSERT_DIE(DOMAIN_IS_LOCKED(rtable, c->lock)); + + struct netindex *ni = net_resolve_index(c->exporter.netindex, index); + if (ni == &net_index_out_of_range) + return &rt_feed_index_out_of_range; + + if (ni == NULL) + return NULL; + + if (prefilter && !prefilter(f, ni->addr)) + return NULL; + + struct rt_export_feed *feed = NULL; + + uint count = 0; + + struct bgp_prefix *chain = HASH_FIND_CHAIN(c->prefix_hash, PXH, ni, NULL); + + for (struct bgp_prefix *px = chain; px; px = px->next) + if (px->ni == ni) + count += !!px->last + !!px->cur; + + if (count) + { + feed = rt_alloc_feed(count, 0); + feed->ni = ni; + + uint pos = 0; + + for (struct bgp_prefix *px = chain; px; px = px->next) + if (px->ni == ni) + { + if (px->cur) + feed->block[pos++] = (rte) { + .attrs = (px->cur == c->withdraw_bucket) ? NULL : ea_free_later(ea_lookup_slow(px->cur->eattrs, 0, EALS_CUSTOM)), + .net = ni->addr, + .src = px->src, + .lastmod = px->lastmod, + .flags = REF_PENDING, + }; + + if (px->last) + feed->block[pos++] = (rte) { + .attrs = (px->last == c->withdraw_bucket) ? 
NULL : ea_free_later(ea_lookup_slow(px->last->eattrs, 0, EALS_CUSTOM)), + .net = ni->addr, + .src = px->src, + .lastmod = px->lastmod, + }; + } + + ASSERT_DIE(pos == count); + } + + return feed; +} + +/* TX structures Init and Free */ + +void +bgp_init_pending_tx(struct bgp_channel *c) +{ + ASSERT_DIE(c->c.out_table == NULL); + ASSERT_DIE(c->tx == NULL); + + DOMAIN(rtable) dom = DOMAIN_NEW_RCU_SYNC(rtable); + LOCK_DOMAIN(rtable, dom); + pool *p = rp_newf(c->pool, dom.rtable, "%s.%s TX", c->c.proto->name, c->c.name); + + struct bgp_ptx_private *bpp = mb_allocz(p, sizeof *bpp); + + bpp->lock = dom; + bpp->pool = p; + bpp->c = c; + + bgp_init_bucket_table(bpp); + bgp_init_prefix_table(bpp); + + bpp->exporter = (struct rt_exporter) { + .journal = { + .loop = c->c.proto->loop, + .item_size = sizeof(struct rt_export_item), + .item_done = bgp_out_item_done, + }, + .name = mb_sprintf(c->c.proto->pool, "%s.%s.export", c->c.proto->name, c->c.name), + .net_type = c->c.net_type, + .max_feed_index = 0, + .netindex = c->c.table->netindex, + .trace_routes = c->c.debug, + .feed_net = bgp_out_feed_net, + .domain = dom, + }; + + rt_exporter_init(&bpp->exporter, &c->cf->ptx_exporter_settle); + c->c.out_table = &bpp->exporter; + + c->tx = BGP_PTX_PUB(bpp); + + UNLOCK_DOMAIN(rtable, dom); +} + +void +bgp_free_pending_tx(struct bgp_channel *bc) +{ + if (!bc->tx) + return; + + DOMAIN(rtable) dom = bc->tx->lock; + LOCK_DOMAIN(rtable, dom); + struct bgp_ptx_private *c = &bc->tx->priv; + + bc->c.out_table = NULL; + rt_exporter_shutdown(&c->exporter, NULL); /* TODO: actually implement exports */ + + /* Move all prefixes to the withdraw bucket to unref the "last" prefixes */ + struct bgp_bucket *b = bgp_get_withdraw_bucket(c); + HASH_WALK(c->prefix_hash, next, px) + bgp_update_prefix(c, px, b); + HASH_WALK_END; + + /* Flush withdrawals */ + struct bgp_prefix *px; + WALK_LIST_FIRST(px, b->prefixes) + bgp_done_prefix(c, px, b); + + /* Flush pending TX */ + WALK_LIST_FIRST(b, c->bucket_queue) + { + WALK_LIST_FIRST(px, b->prefixes) + bgp_done_prefix(c, px, b); + bgp_done_bucket(c, b); + } + + /* Consistency and resource leak checks */ + HASH_WALK(c->prefix_hash, next, n) + bug("Stray prefix after cleanup"); + HASH_WALK_END; + + HASH_FREE(c->prefix_hash); + sl_delete(c->prefix_slab); + c->prefix_slab = NULL; + + HASH_WALK(c->bucket_hash, next, n) + bug("Stray bucket after cleanup"); + HASH_WALK_END; + + HASH_FREE(c->bucket_hash); + sl_delete(c->bucket_slab); + c->bucket_slab = NULL; + + rp_free(c->pool); + + UNLOCK_DOMAIN(rtable, dom); + DOMAIN_FREE(rtable, dom); + + bc->tx = NULL; } diff --cc sysdep/unix/alloc.c index 6d9bcde04,08fc99801..ef383f365 --- a/sysdep/unix/alloc.c +++ b/sysdep/unix/alloc.c @@@ -30,134 -29,67 +30,134 @@@ long page_size = 0; #ifdef HAVE_MMAP -#define KEEP_PAGES_MAIN_MAX 256 -#define KEEP_PAGES_MAIN_MIN 8 -#define CLEANUP_PAGES_BULK 256 +# define KEEP_PAGES_MAX 16384 +# define KEEP_PAGES_MIN 32 +# define KEEP_PAGES_MAX_LOCAL 128 +# define ALLOC_PAGES_AT_ONCE 32 + + STATIC_ASSERT(KEEP_PAGES_MIN * 4 < KEEP_PAGES_MAX); + STATIC_ASSERT(ALLOC_PAGES_AT_ONCE < KEEP_PAGES_MAX_LOCAL); + - static _Bool use_fake = 0; - static _Bool initialized = 0; ++ static bool use_fake = 0; ++ static bool initialized = 0; + +# define PROTECT_PAGE(pg) +# define UNPROTECT_PAGE(pg) + +# if DEBUGGING +# ifdef ENABLE_EXPENSIVE_CHECKS +# undef PROTECT_PAGE +# undef UNPROTECT_PAGE +# define PROTECT_PAGE(pg) mprotect((pg), page_size, PROT_READ) +# define UNPROTECT_PAGE(pg) mprotect((pg), page_size, PROT_READ | PROT_WRITE) +# 
endif + +# define AJSIZE 16384 + + static struct alloc_journal { + void *fp; + void *next; + u16 pos; + u16 type; + uint thread_id; + } alloc_journal[AJSIZE]; + + _Thread_local int alloc_journal_local_pos = -1; + _Atomic int alloc_journal_pos = 0; + +# define AJT_ALLOC_LOCAL_HOT 1 +# define AJT_ALLOC_GLOBAL_HOT 2 +# define AJT_ALLOC_COLD_STD 3 +# define AJT_ALLOC_COLD_KEEPER 4 +# define AJT_ALLOC_MMAP 5 + +# define AJT_FREE_LOCAL_HOT 0x11 +# define AJT_FREE_GLOBAL_HOT 0x12 + +# define AJT_CLEANUP_NOTHING 0xc0 +# define AJT_CLEANUP_COLD_STD 0xc3 +# define AJT_CLEANUP_COLD_KEEPER 0xc4 +# define AJT_CLEANUP_BEGIN 0xcb +# define AJT_CLEANUP_END 0xce + +# define AJT_FLUSH_LOCAL_BEGIN 0xfb +# define AJT_FLUSH_LOCAL_END 0xfe +# define AJT_SCHEDULE_CLEANUP 0xff + + static void + ajlog(void *fp, void *next, u16 pos, u16 type) + { + alloc_journal[(alloc_journal_local_pos = atomic_fetch_add_explicit(&alloc_journal_pos, 1, memory_order_relaxed)) % AJSIZE] = (struct alloc_journal) { + .fp = fp, + .next = next, + .pos = pos, + .type = type, + .thread_id = THIS_THREAD_ID, + }; + } -STATIC_ASSERT(KEEP_PAGES_MAIN_MIN * 4 < KEEP_PAGES_MAIN_MAX); + struct free_page { + node unused[42]; + struct free_page * _Atomic next; + }; +# else /* ! DEBUGGING */ -static bool use_fake = 0; +# define ajlog(...) -#if DEBUGGING -struct free_page { - node unused[42]; - node n; -}; -#else -struct free_page { - node n; -}; -#endif + struct free_page { + struct free_page * _Atomic next; + }; -#define EP_POS_MAX ((page_size - OFFSETOF(struct empty_pages, pages)) / sizeof (void *)) +# endif -struct empty_pages { - node n; - uint pos; - void *pages[0]; -}; +# define WRITE_NEXT(pg, val) do { UNPROTECT_PAGE((pg)); (pg)->next = (val); PROTECT_PAGE((pg)); } while (0) -struct free_pages { - list pages; /* List of (struct free_page) keeping free pages without releasing them (hot) */ - list empty; /* List of (struct empty_pages) keeping invalidated pages mapped for us (cold) */ - u16 min, max; /* Minimal and maximal number of free pages kept */ - uint cnt; /* Number of free pages in list */ - event cleanup; -}; +# define EP_POS_MAX ((page_size - OFFSETOF(struct empty_pages, pages)) / sizeof (void *)) -static void global_free_pages_cleanup_event(void *); -static void *alloc_cold_page(void); + struct empty_pages { + struct empty_pages *next; + uint pos; + void *pages[0]; + }; -static struct free_pages global_free_pages = { - .min = KEEP_PAGES_MAIN_MIN, - .max = KEEP_PAGES_MAIN_MAX, - .cleanup = { .hook = global_free_pages_cleanup_event }, -}; + static DOMAIN(resource) empty_pages_domain; + static struct empty_pages *empty_pages = NULL; -uint *pages_kept = &global_free_pages.cnt; + static struct free_page * _Atomic page_stack = NULL; + static _Thread_local struct free_page * local_page_stack = NULL; + static struct free_page page_stack_blocked; -static void * -alloc_sys_page(void) -{ - void *ptr = mmap(NULL, page_size, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + /* Try to replace the page stack head with a cork, until it succeeds. 
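+   * (While the cork sits in page_stack, any competing PAGE_STACK_GET spins
+   * in birdloop_yield(); the winner therefore holds the previous head
+   * exclusively until it reinstates a head via PAGE_STACK_PUT below.)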
*/ +# define PAGE_STACK_GET ({ \ + struct free_page *fp; \ + while ((fp = atomic_exchange_explicit(&page_stack, &page_stack_blocked, memory_order_acq_rel)) == &page_stack_blocked) birdloop_yield(); \ + fp; }) + /* Reinstate the stack with another value */ +# define PAGE_STACK_PUT(val) ASSERT_DIE(atomic_exchange_explicit(&page_stack, (val), memory_order_acq_rel) == &page_stack_blocked) - if (ptr == MAP_FAILED) - die("mmap(%ld) failed: %m", (s64) page_size); + static void page_cleanup(void *); + static event page_cleanup_event = { .hook = page_cleanup, }; +# define SCHEDULE_CLEANUP do if (initialized && !shutting_down) ev_send(&global_event_list, &page_cleanup_event); while (0) - return ptr; -} + _Atomic int pages_kept = 0; + _Atomic int pages_kept_locally = 0; + static _Thread_local int pages_kept_here = 0; -extern int shutting_down; /* Shutdown requested. */ + static void * + alloc_sys_page(void) + { + void *ptr = mmap(NULL, page_size * ALLOC_PAGES_AT_ONCE, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + if (ptr == MAP_FAILED) + die("mmap(%ld) failed: %m", (s64) page_size); + + return ptr; + } + + extern int shutting_down; /* Shutdown requested. */ #else // ! HAVE_MMAP -#define use_fake 1 +# define use_fake 1 #endif void * diff --cc sysdep/unix/domain.c index 7dace7e12,000000000..a3104b898 mode 100644,000000..100644 --- a/sysdep/unix/domain.c +++ b/sysdep/unix/domain.c @@@ -1,159 -1,0 +1,159 @@@ +/* + * BIRD Locking + * + * (c) 2020 Maria Matejka + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#undef LOCAL_DEBUG + +#undef DEBUG_LOCKING + +#include "lib/birdlib.h" +#include "lib/locking.h" +#include "lib/resource.h" +#include "lib/timer.h" + +#include "conf/conf.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Locking subsystem + */ + +#ifdef DEBUGGING +_Thread_local rw_spinlock *rw_spinlocks_taken[MAX_RWS_AT_ONCE]; +_Thread_local btime rw_spinlocks_time[MAX_RWS_AT_ONCE]; +_Thread_local u32 rw_spinlocks_taken_cnt; +_Thread_local u32 rw_spinlocks_taken_write; +#endif + +_Thread_local struct lock_order locking_stack = {}; +_Thread_local struct domain_generic **last_locked = NULL; + +#define ASSERT_NO_LOCK ASSERT_DIE(last_locked == NULL) + +struct domain_generic { + pthread_mutex_t mutex; + uint order; - _Bool forbidden_when_reading_rcu; ++ bool forbidden_when_reading_rcu; + struct domain_generic **prev; + struct lock_order *locked_by; + const char *name; + pool *pool; +}; + +#define DOMAIN_INIT(_order, _allow_rcu) { \ + .mutex = PTHREAD_MUTEX_INITIALIZER, \ + .order = _order, \ + .forbidden_when_reading_rcu = !_allow_rcu, \ +} + +static struct domain_generic the_bird_domain_gen = DOMAIN_INIT(OFFSETOF(struct lock_order, the_bird), 1); + +DOMAIN(the_bird) the_bird_domain = { .the_bird = &the_bird_domain_gen }; + +struct domain_generic * - domain_new(uint order, _Bool allow_rcu) ++domain_new(uint order, bool allow_rcu) +{ + ASSERT_DIE(order < sizeof(struct lock_order)); + struct domain_generic *dg = xmalloc(sizeof(struct domain_generic)); + *dg = (struct domain_generic) DOMAIN_INIT(order, allow_rcu); + return dg; +} + +void +domain_free(struct domain_generic *dg) +{ + pthread_mutex_destroy(&dg->mutex); + xfree(dg); +} + +const char * +domain_name(struct domain_generic *dg) +{ + return dg->name; +} + +uint dg_order(struct domain_generic *dg) +{ + return dg->order; +} + +void +domain_setup(struct domain_generic *dg, const char *name, pool 
*p) +{ + ASSERT_DIE(dg->pool == NULL); + dg->pool = p; + dg->name = name; +} + +void do_lock(struct domain_generic *dg, struct domain_generic **lsp) +{ + struct lock_order stack_copy; + memcpy(&stack_copy, &locking_stack, sizeof(stack_copy)); + struct domain_generic **lll = last_locked; + + if (dg->forbidden_when_reading_rcu) + if (rcu_read_active()) + bug("Locking of this lock forbidden while RCU reader is active"); + else + rcu_blocked++; + + if ((char *) lsp - (char *) &locking_stack != dg->order) + bug("Trying to lock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack); + + if (lsp <= last_locked) + bug("Trying to lock in a bad order: %p %p", &stack_copy, lll); + if (*lsp) + bug("Inconsistent locking stack state on lock"); + + btime lock_begin = current_time(); + pthread_mutex_lock(&dg->mutex); + btime duration = current_time() - lock_begin; + btime wdw = atomic_load_explicit(&global_runtime, memory_order_relaxed)->watchdog_warning; + if (wdw && (duration > wdw)) + log(L_WARN "Locking of %s took %d ms", dg->name, (int) (duration TO_MS)); + + if (dg->prev || dg->locked_by) + bug("Previous unlock not finished correctly"); + dg->prev = last_locked; + *lsp = dg; + last_locked = lsp; + dg->locked_by = &locking_stack; +} + +void do_unlock(struct domain_generic *dg, struct domain_generic **lsp) +{ + if (dg->forbidden_when_reading_rcu) + ASSERT_DIE(rcu_blocked--); + + if ((char *) lsp - (char *) &locking_stack != dg->order) + bug("Trying to unlock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack); + + if (dg->locked_by != &locking_stack) + bug("Inconsistent domain state on unlock"); + if ((last_locked != lsp) || (*lsp != dg)) + bug("Inconsistent locking stack state on unlock"); + dg->locked_by = NULL; + last_locked = dg->prev; + *lsp = NULL; + dg->prev = NULL; + pthread_mutex_unlock(&dg->mutex); + + /* From here on, the dg pointer is invalid! */ +} diff --cc sysdep/unix/io-loop.c index be18752a7,000000000..32530826c mode 100644,000000..100644 --- a/sysdep/unix/io-loop.c +++ b/sysdep/unix/io-loop.c @@@ -1,1758 -1,0 +1,1758 @@@ +/* + * BIRD -- I/O and event loop + * + * Can be freely distributed and used under the terms of the GNU GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "nest/bird.h" + +#include "lib/buffer.h" +#include "lib/defer.h" +#include "lib/lists.h" +#include "lib/locking.h" +#include "lib/resource.h" +#include "lib/event.h" +#include "lib/timer.h" +#include "lib/socket.h" + +#include "lib/io-loop.h" +#include "sysdep/unix/io-loop.h" +#include "conf/conf.h" +#include "nest/cli.h" + +#define THREAD_STACK_SIZE 65536 /* To be lowered in near future */ + +static struct birdloop *birdloop_new_no_pickup(pool *pp, uint order, const char *name, ...); + +/* + * Nanosecond time for accounting purposes + * + * A fixed point on startup is set as zero, all other values are relative to that. + * Caution: this overflows after like 500 years or so. If you plan to run + * BIRD for such a long time, please implement some means of overflow prevention. + */ + +#if ! 
HAVE_CLOCK_MONOTONIC_COARSE +#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC +#endif + +static struct timespec ns_begin; + +static void ns_init(void) +{ + if (clock_gettime(CLOCK_MONOTONIC_COARSE, &ns_begin)) + bug("clock_gettime: %m"); +} + +#define NSEC_IN_SEC ((u64) (1000 * 1000 * 1000)) + +u64 ns_now(void) +{ + struct timespec ts; + if (clock_gettime(CLOCK_MONOTONIC_COARSE, &ts)) + bug("clock_gettime: %m"); + + return (u64) (ts.tv_sec - ns_begin.tv_sec) * NSEC_IN_SEC + ts.tv_nsec - ns_begin.tv_nsec; +} + +#define NSEC_TO_SEC(x) ((x) / NSEC_IN_SEC) +#define CURRENT_SEC NSEC_TO_SEC(ns_now()) + +static _Thread_local struct spent_time *account_target_spent_time; +static _Thread_local u64 *account_target_total; +static _Thread_local u64 account_last; + +static u64 account_finish(void) +{ + /* Get current time */ + u64 now = ns_now(); + u64 dif = now - account_last; + + /* Update second by second */ + if (account_target_spent_time) + { + /* Drop old time information if difference is too large */ + if (NSEC_TO_SEC(account_last) + TIME_BY_SEC_SIZE - 1 < NSEC_TO_SEC(now)) + account_last = (NSEC_TO_SEC(now) - TIME_BY_SEC_SIZE + 1) * NSEC_IN_SEC; + + /* Zero new records */ + if (NSEC_TO_SEC(account_target_spent_time->last_written_ns) + TIME_BY_SEC_SIZE < NSEC_TO_SEC(account_last)) + memset(account_target_spent_time->by_sec_ns, 0, sizeof(account_target_spent_time->by_sec_ns)); + else + for (u64 fclr = NSEC_TO_SEC(account_target_spent_time->last_written_ns) + 1; + fclr <= NSEC_TO_SEC(now); + fclr++) + account_target_spent_time->by_sec_ns[fclr % TIME_BY_SEC_SIZE] = 0; + + /* Add times second by second */ + while (NSEC_TO_SEC(account_last) != NSEC_TO_SEC(now)) + { + u64 part = (NSEC_TO_SEC(account_last) + 1) * NSEC_IN_SEC - account_last; + account_target_spent_time->by_sec_ns[NSEC_TO_SEC(account_last) % TIME_BY_SEC_SIZE] += part; + account_last += part; + } + + /* Update the last second */ + account_target_spent_time->by_sec_ns[NSEC_TO_SEC(account_last) % TIME_BY_SEC_SIZE] += now - account_last; + + /* Store the current time */ + account_target_spent_time->last_written_ns = now; + } + + /* Update the total */ + if (account_target_total) + *account_target_total += dif; + + /* Store current time */ + account_last = now; + + return dif; +} + +static u64 account_to_spent_time(struct spent_time *st) +{ + u64 elapsed = account_finish(); + + account_target_spent_time = st; + account_target_total = &st->total_ns; + + return elapsed; +} + +static u64 account_to_total(u64 *total) +{ + u64 elapsed = account_finish(); + + account_target_spent_time = NULL; + account_target_total = total; + + return elapsed; +} + +#define account_to(_arg) _Generic((_arg), \ + struct spent_time *: account_to_spent_time, \ + u64 *: account_to_total)(_arg) + +/* + * Current thread context + */ + +_Thread_local struct birdloop *birdloop_current; +static _Thread_local struct birdloop *birdloop_wakeup_masked; +static _Thread_local uint birdloop_wakeup_masked_count; + +#define LOOP_NAME(loop) domain_name((loop)->time.domain) +#define LATENCY_DEBUG(flags) (atomic_load_explicit(&global_runtime, memory_order_relaxed)->latency_debug & (flags)) + +#define LOOP_TRACE(loop, flags, fmt, args...) do { if (LATENCY_DEBUG(flags)) log(L_TRACE "%s (%p): " fmt, LOOP_NAME(loop), (loop), ##args); } while (0) +#define THREAD_TRACE(flags, ...) do { if (LATENCY_DEBUG(flags)) log(L_TRACE "Thread: " __VA_ARGS__); } while (0) + +#define LOOP_WARN(loop, fmt, args...) 
log(L_WARN "%s (%p): " fmt, LOOP_NAME(loop), (loop), ##args) + + +event_list * +birdloop_event_list(struct birdloop *loop) +{ + return &loop->event_list; +} + +struct timeloop * +birdloop_time_loop(struct birdloop *loop) +{ + return &loop->time; +} + +pool * +birdloop_pool(struct birdloop *loop) +{ + return loop->pool; +} + - _Bool ++bool +birdloop_inside(struct birdloop *loop) +{ + for (struct birdloop *c = birdloop_current; c; c = c->prev_loop) + if (loop == c) + return 1; + + return 0; +} + - _Bool ++bool +birdloop_in_this_thread(struct birdloop *loop) +{ + return pthread_equal(pthread_self(), loop->thread->thread_id); +} + +/* + * Wakeup code for birdloop + */ + +void +pipe_new(struct pipe *p) +{ + int rv = pipe(p->fd); + if (rv < 0) + die("pipe: %m"); + + if (fcntl(p->fd[0], F_SETFL, O_NONBLOCK) < 0) + die("fcntl(O_NONBLOCK): %m"); + + if (fcntl(p->fd[1], F_SETFL, O_NONBLOCK) < 0) + die("fcntl(O_NONBLOCK): %m"); +} + +void +pipe_drain(struct pipe *p) +{ + while (1) { + char buf[64]; + int rv = read(p->fd[0], buf, sizeof(buf)); + if ((rv < 0) && (errno == EAGAIN)) + return; + + if (rv == 0) + bug("wakeup read eof"); + if ((rv < 0) && (errno != EINTR)) + bug("wakeup read: %m"); + } +} + +int +pipe_read_one(struct pipe *p) +{ + while (1) { + char v; + int rv = read(p->fd[0], &v, sizeof(v)); + if (rv == 1) + return 1; + if ((rv < 0) && (errno == EAGAIN)) + return 0; + if (rv > 1) + bug("wakeup read more bytes than expected: %d", rv); + if (rv == 0) + bug("wakeup read eof"); + if (errno != EINTR) + bug("wakeup read: %m"); + } +} + +void +pipe_kick(struct pipe *p) +{ + char v = 1; + int rv; + + while (1) { + rv = write(p->fd[1], &v, sizeof(v)); + if ((rv >= 0) || (errno == EAGAIN)) + return; + if (errno != EINTR) + bug("wakeup write: %m"); + } +} + +void +pipe_pollin(struct pipe *p, struct pfd *pfd) +{ + BUFFER_PUSH(pfd->pfd) = (struct pollfd) { + .fd = p->fd[0], + .events = POLLIN, + }; + BUFFER_PUSH(pfd->loop) = NULL; +} + +void +pipe_free(struct pipe *p) +{ + close(p->fd[0]); + close(p->fd[1]); +} + +static inline void +wakeup_init(struct bird_thread *loop) +{ + pipe_new(&loop->wakeup); +} + +static inline void +wakeup_drain(struct bird_thread *loop) +{ + pipe_drain(&loop->wakeup); +} + +static inline void +wakeup_do_kick(struct bird_thread *loop) +{ + pipe_kick(&loop->wakeup); +} + +static inline void +wakeup_free(struct bird_thread *loop) +{ + pipe_free(&loop->wakeup); +} + - static inline _Bool ++static inline bool +birdloop_try_ping(struct birdloop *loop, u32 ltt) +{ + /* Somebody else is already pinging, be idempotent */ + if (ltt & LTT_PING) + { + LOOP_TRACE(loop, DL_PING, "already being pinged"); + return 0; + } + + /* Thread moving is an implicit ping */ + if (ltt & LTT_MOVE) + { + LOOP_TRACE(loop, DL_PING, "ping while moving"); + return 1; + } + + /* No more flags allowed */ + ASSERT_DIE(!ltt); + + /* No ping when not picked up */ + if (!loop->thread) + { + LOOP_TRACE(loop, DL_PING, "not picked up yet, can't ping"); + return 1; + } + + /* No ping when masked */ + if (loop == birdloop_wakeup_masked) + { + LOOP_TRACE(loop, DL_PING, "wakeup masked, can't ping"); + birdloop_wakeup_masked_count++; + return 1; + } + + /* Send meta event to ping */ + if ((loop != loop->thread->meta) && (loop != &main_birdloop)) + { + LOOP_TRACE(loop, DL_PING, "Ping by meta event to %p", loop->thread->meta); + ev_send_loop(loop->thread->meta, &loop->event); + return 1; + } + + /* Do the real ping of Meta or Main */ + LOOP_TRACE(loop, DL_WAKEUP, "sending pipe ping"); + wakeup_do_kick(loop->thread); + 
return 0; +} + +static inline void +birdloop_do_ping(struct birdloop *loop) +{ + /* Register our ping effort */ + u32 ltt = atomic_fetch_or_explicit(&loop->thread_transition, LTT_PING, memory_order_acq_rel); + + /* Try to ping in multiple ways */ + if (birdloop_try_ping(loop, ltt)) + atomic_fetch_and_explicit(&loop->thread_transition, ~LTT_PING, memory_order_acq_rel); +} + +void +birdloop_ping(struct birdloop *loop) +{ + if (!birdloop_inside(loop)) + { + LOOP_TRACE(loop, DL_PING, "ping from outside"); + birdloop_do_ping(loop); + } + else + { + LOOP_TRACE(loop, DL_PING, "ping from inside, pending=%d", loop->ping_pending); + if (!loop->ping_pending) + loop->ping_pending++; + } +} + + +/* + * Sockets + */ + +static void +sockets_init(struct birdloop *loop) +{ + init_list(&loop->sock_list); + loop->sock_num = 0; +} + +void +socket_changed(sock *s) +{ + struct birdloop *loop = s->loop; + ASSERT_DIE(birdloop_inside(loop)); + + LOOP_TRACE(loop, DL_SOCKETS, "socket %p changed", s); + loop->sock_changed = 1; + birdloop_ping(loop); +} + +void +birdloop_add_socket(struct birdloop *loop, sock *s) +{ + ASSERT_DIE(birdloop_inside(loop)); + ASSERT_DIE(!s->loop); + + LOOP_TRACE(loop, DL_SOCKETS, "adding socket %p (total=%d)", s, loop->sock_num); + add_tail(&loop->sock_list, &s->n); + loop->sock_num++; + + s->loop = loop; + s->index = -1; + + socket_changed(s); +} + +extern sock *stored_sock; /* mainloop hack */ + +void +birdloop_remove_socket(struct birdloop *loop, sock *s) +{ + ASSERT_DIE(!enlisted(&s->n) == !s->loop); + + if (!s->loop) + return; + + ASSERT_DIE(birdloop_inside(loop)); + ASSERT_DIE(s->loop == loop); + + /* Decouple the socket from the loop at all. */ + LOOP_TRACE(loop, DL_SOCKETS, "removing socket %p (total=%d)", s, loop->sock_num); + + if (loop->sock_active == s) + loop->sock_active = sk_next(s); + + if ((loop == &main_birdloop) && (s == stored_sock)) + stored_sock = sk_next(s); + + rem_node(&s->n); + loop->sock_num--; + + socket_changed(s); + + s->loop = NULL; + s->index = -1; +} + +void +sk_reloop(sock *s, struct birdloop *loop) +{ + ASSERT_DIE(birdloop_inside(loop)); + ASSERT_DIE(birdloop_inside(s->loop)); + + if (loop == s->loop) + return; + + birdloop_remove_socket(s->loop, s); + birdloop_add_socket(loop, s); +} + +void +sk_pause_rx(struct birdloop *loop, sock *s) +{ + ASSERT_DIE(birdloop_inside(loop)); + s->rx_hook = NULL; + socket_changed(s); +} + +void +sk_resume_rx(struct birdloop *loop, sock *s, int (*hook)(sock *, uint)) +{ + ASSERT_DIE(birdloop_inside(loop)); + ASSERT_DIE(hook); + s->rx_hook = hook; + socket_changed(s); +} + +static inline uint sk_want_events(sock *s) +{ return (s->rx_hook ? POLLIN : 0) | (sk_tx_pending(s) ? 
POLLOUT : 0); } + +void +sockets_prepare(struct birdloop *loop, struct pfd *pfd) +{ + node *n; + WALK_LIST(n, loop->sock_list) + { + SKIP_BACK_DECLARE(sock, s, n, n); + uint w = sk_want_events(s); + + if (!w) + { + s->index = -1; + continue; + } + + s->index = pfd->pfd.used; + LOOP_TRACE(loop, DL_SOCKETS, "socket %p poll index is %d", s, s->index); + + BUFFER_PUSH(pfd->pfd) = (struct pollfd) { + .fd = s->fd, + .events = sk_want_events(s), + }; + BUFFER_PUSH(pfd->loop) = loop; + } +} + +int sk_read(sock *s, int revents); +int sk_write(sock *s); +void sk_err(sock *s, int revents); + +static void - sockets_fire(struct birdloop *loop, _Bool read, _Bool write) ++sockets_fire(struct birdloop *loop, bool read, bool write) +{ + if (EMPTY_LIST(loop->sock_list)) + return; + + times_update(); + + struct pollfd *pfd = loop->thread->pfd->pfd.data; + loop->sock_active = SKIP_BACK(sock, n, HEAD(loop->sock_list)); + + while (loop->sock_active) + { + sock *s = loop->sock_active; + + int rev; + if ((s->index >= 0) && (rev = pfd[s->index].revents) && !(rev & POLLNVAL)) + { + int e = 1; + + if (write && (rev & POLLOUT)) + { + /* Write until task limit is up */ + while ((s == loop->sock_active) && (e = sk_write(s)) && task_still_in_limit()) + ; + + if (s != loop->sock_active) + continue; + + if (!sk_tx_pending(s)) + loop->thread->sock_changed = 1; + } + + /* Read until task limit is up */ + if (read && (rev & POLLIN)) + while ((s == loop->sock_active) && s->rx_hook && sk_read(s, rev) && (s->fast_rx || task_still_in_limit())) + ; + + if (s != loop->sock_active) + continue; + + if (!(rev & (POLLOUT | POLLIN)) && (rev & POLLERR)) + sk_err(s, rev); + + if (s != loop->sock_active) + continue; + } + + loop->sock_active = sk_next(s); + } +} + +/* + * Threads + */ + +static void bird_thread_start_event(void *_data); +static void bird_thread_busy_set(struct bird_thread *thr, int val); + +struct birdloop_pickup_group { + DOMAIN(attrs) domain; + list loops; + list threads; + uint thread_count; + uint thread_busy_count; + uint loop_count; + uint loop_unassigned_count; + btime max_latency; + event start_threads; +} pickup_groups[2] = { + { + /* all zeroes */ + }, + { + /* FIXME: make this dynamic, now it copies the loop_max_latency value from proto/bfd/config.Y */ + .max_latency = 10 MS, + .start_threads.hook = bird_thread_start_event, + .start_threads.data = &pickup_groups[1], + }, +}; + +static _Thread_local struct bird_thread *this_thread; + +static void +birdloop_set_thread(struct birdloop *loop, struct bird_thread *thr, struct birdloop_pickup_group *group) +{ + struct bird_thread *old = loop->thread; + ASSERT_DIE(!thr != !old); + + /* Signal our moving effort */ + u32 ltt = atomic_fetch_or_explicit(&loop->thread_transition, LTT_MOVE, memory_order_acq_rel); + ASSERT_DIE((ltt & LTT_MOVE) == 0); + + /* Wait until all previously started pings end */ + while (ltt & LTT_PING) + { + birdloop_yield(); + ltt = atomic_load_explicit(&loop->thread_transition, memory_order_acquire); + ASSERT_DIE(ltt & LTT_MOVE); + } + /* Now we are free of running pings */ + + if (!thr) + { + /* Unschedule from Meta */ + ev_postpone(&loop->event); + tm_stop(&loop->timer); + + /* Request local socket reload */ + this_thread->sock_changed = 1; + } + + /* Update the thread value */ + loop->thread = thr; + + /* Allow pings */ + atomic_fetch_and_explicit(&loop->thread_transition, ~LTT_MOVE, memory_order_acq_rel); + + /* Put into appropriate lists */ + if (thr) + { + thr->loop_count++; + add_tail(&thr->loops, &loop->n); + + if 
(!EMPTY_LIST(loop->sock_list)) + thr->sock_changed = 1; + ev_send_loop(loop->thread->meta, &loop->event); + } + else + { + /* Put into pickup list */ + LOCK_DOMAIN(attrs, group->domain); + add_tail(&group->loops, &loop->n); + group->loop_unassigned_count++; + UNLOCK_DOMAIN(attrs, group->domain); + } + + loop->last_transition_ns = ns_now(); +} + +static void +bird_thread_pickup_next(struct birdloop_pickup_group *group) +{ + /* This thread goes to the end of the pickup list */ + rem_node(&this_thread->n); + add_tail(&group->threads, &this_thread->n); + + /* If there are more loops to be picked up, wakeup the next thread in order */ + if (!EMPTY_LIST(group->loops)) + wakeup_do_kick(SKIP_BACK(struct bird_thread, n, HEAD(group->threads))); +} + - static _Bool ++static bool +birdloop_hot_potato(struct birdloop *loop) +{ + if (!loop) + return 0; + + return ns_now() - loop->last_transition_ns < 1 S TO_NS; +} + +static void +birdloop_take(struct birdloop_pickup_group *group) +{ + struct birdloop *loop = NULL; + + if (birdloop_hot_potato(this_thread->meta)) + return; + + LOCK_DOMAIN(attrs, group->domain); + + if (this_thread->busy_active && + (group->thread_busy_count < group->thread_count) && + (this_thread->loop_count > 1) && + !EMPTY_LIST(group->loops) && + birdloop_hot_potato(HEAD(group->loops))) + { + THREAD_TRACE(DL_SCHEDULING, "Loop drop requested (tbc=%d, tc=%d, lc=%d)", + group->thread_busy_count, group->thread_count, this_thread->loop_count); + UNLOCK_DOMAIN(attrs, group->domain); + + uint dropped = 0; + node *n; + WALK_LIST2(loop, n, this_thread->loops, n) + { + birdloop_enter(loop); + if (ev_active(&loop->event) && !loop->stopped && !birdloop_hot_potato(loop)) + { + /* Pass to another thread */ + rem_node(&loop->n); + this_thread->loop_count--; + LOOP_TRACE(loop, DL_SCHEDULING, "Dropping from thread, remaining %u loops here", this_thread->loop_count); + + /* This also unschedules the loop from Meta */ + birdloop_set_thread(loop, NULL, group); + + dropped++; + if (dropped * dropped > this_thread->loop_count) + { + birdloop_leave(loop); + + LOCK_DOMAIN(attrs, group->domain); + bird_thread_pickup_next(group); + UNLOCK_DOMAIN(attrs, group->domain); + + break; + } + } + birdloop_leave(loop); + } + + if (dropped) + { + this_thread->meta->last_transition_ns = ns_now(); + return; + } + + this_thread->busy_counter = 0; + bird_thread_busy_set(this_thread, 0); + LOCK_DOMAIN(attrs, group->domain); + } + + if (!EMPTY_LIST(group->loops)) + { + THREAD_TRACE(DL_SCHEDULING, "Loop take requested"); + + /* Take a proportional amount of loops from the pickup list and unlock */ + uint thread_count = group->thread_count + 1; + if (group->thread_busy_count < group->thread_count) + thread_count -= group->thread_busy_count; + + uint assign = 1 + group->loop_unassigned_count / thread_count; + for (uint i=0; !EMPTY_LIST(group->loops) && iloops)); + rem_node(&loop->n); + group->loop_unassigned_count--; + UNLOCK_DOMAIN(attrs, group->domain); + + birdloop_enter(loop); + birdloop_set_thread(loop, this_thread, group); + LOOP_TRACE(loop, DL_SCHEDULING, "Picked up by thread"); + + node *n; + WALK_LIST(n, loop->sock_list) + SKIP_BACK(sock, n, n)->index = -1; + + birdloop_leave(loop); + + LOCK_DOMAIN(attrs, group->domain); + } + + bird_thread_pickup_next(group); + } + + UNLOCK_DOMAIN(attrs, group->domain); + this_thread->meta->last_transition_ns = ns_now(); +} + +static int +poll_timeout(struct birdloop *loop) +{ + timer *t = timers_first(&loop->time); + if (!t) + { + THREAD_TRACE(DL_SCHEDULING, "No timers, no events in 
meta"); + return -1; + } + + btime remains = tm_remains(t); + int timeout = remains TO_MS + ((remains TO_MS) MS < remains); + + THREAD_TRACE(DL_SCHEDULING, "Next meta timer in %d ms for %s", timeout, + LOOP_NAME(SKIP_BACK(struct birdloop, timer, t))); + + return timeout; +} + +static void +bird_thread_busy_set(struct bird_thread *thr, int val) +{ + LOCK_DOMAIN(attrs, thr->group->domain); + if (thr->busy_active = val) + thr->group->thread_busy_count++; + else + thr->group->thread_busy_count--; + ASSERT_DIE(thr->group->thread_busy_count <= thr->group->thread_count); + UNLOCK_DOMAIN(attrs, thr->group->domain); +} + +static void * +bird_thread_main(void *arg) +{ + struct bird_thread *thr = this_thread = arg; + + rcu_thread_start(); + + account_to(&thr->overhead); + + birdloop_enter(thr->meta); + this_birdloop = thr->meta; + + THREAD_TRACE(DL_SCHEDULING, "Started"); + + tmp_init(thr->pool); + init_list(&thr->loops); + + defer_init(lp_new(thr->pool)); + + thr->sock_changed = 1; + + struct pfd pfd; + BUFFER_INIT(pfd.pfd, thr->pool, 16); + BUFFER_INIT(pfd.loop, thr->pool, 16); + thr->pfd = &pfd; + + while (1) + { + u64 thr_loop_start = ns_now(); + int timeout; + + /* Schedule all loops with timed out timers */ + timers_fire(&thr->meta->time, 0); + + /* Pickup new loops */ + birdloop_take(thr->group); + + /* Compute maximal time per loop */ + u64 thr_before_run = ns_now(); + if (thr->loop_count > 0) + { + thr->max_loop_time_ns = (thr->max_latency_ns / 2 - (thr_before_run - thr_loop_start)) / (u64) thr->loop_count; + if (thr->max_loop_time_ns NS > 300 MS) + thr->max_loop_time_ns = 300 MS TO_NS; + } + + /* Run all scheduled loops */ + int more_events = ev_run_list(&thr->meta->event_list); + if (more_events) + { + THREAD_TRACE(DL_SCHEDULING, "More metaevents to run from %s", + LOOP_NAME(SKIP_BACK(struct birdloop, event, + atomic_load_explicit(&thr->meta->event_list.receiver, memory_order_relaxed))) + ); + timeout = 0; + } + else + timeout = poll_timeout(thr->meta); + + /* Run priority events before sleeping */ + ev_run_list(&thr->priority_events); + + /* Do we have to refresh sockets? 
*/ + if (thr->sock_changed) + { + THREAD_TRACE(DL_SOCKETS, "Recalculating socket poll"); + thr->sock_changed = 0; + + BUFFER_FLUSH(pfd.pfd); + BUFFER_FLUSH(pfd.loop); + + pipe_pollin(&thr->wakeup, &pfd); + + node *nn; + struct birdloop *loop; + WALK_LIST2(loop, nn, thr->loops, n) + { + birdloop_enter(loop); + sockets_prepare(loop, &pfd); + birdloop_leave(loop); + } + + ASSERT_DIE(pfd.loop.used == pfd.pfd.used); + THREAD_TRACE(DL_SOCKETS, "Total %d sockets", pfd.pfd.used); + } + + /* Check thread busy indicator */ + int idle_force = (timeout < 0) || (timeout > 300); + int busy_now = (timeout < 5) && !idle_force; + + /* Nothing to do right now but there may be some loops for pickup */ + if (idle_force) + { + LOCK_DOMAIN(attrs, thr->group->domain); + if (!EMPTY_LIST(thr->group->loops)) + timeout = 0; + UNLOCK_DOMAIN(attrs, thr->group->domain); + } + + if (busy_now && !thr->busy_active && (++thr->busy_counter == 4)) + bird_thread_busy_set(thr, 1); + + if (!busy_now && thr->busy_active && (idle_force || (--thr->busy_counter == 0))) + { + thr->busy_counter = 0; + bird_thread_busy_set(thr, 0); + } + + account_to(&this_thread->idle); + birdloop_leave(thr->meta); +poll_retry:; + int rv = poll(pfd.pfd.data, pfd.pfd.used, timeout); + if (rv < 0) + { + if (errno == EINTR || errno == EAGAIN) + goto poll_retry; + bug("poll in %p: %m", thr); + } + + account_to(&this_thread->overhead); + birdloop_enter(thr->meta); + + /* Drain wakeup fd */ + if (pfd.pfd.data[0].revents & POLLIN) + { + THREAD_TRACE(DL_WAKEUP, "Ping received"); + ASSERT_DIE(rv > 0); + rv--; + wakeup_drain(thr); + } + + /* Unset ping information for Meta */ + atomic_fetch_and_explicit(&thr->meta->thread_transition, ~LTT_PING, memory_order_acq_rel); + + /* Schedule loops with active sockets */ + if (rv) + for (uint i = 1; i < pfd.pfd.used; i++) + if (pfd.pfd.data[i].revents) + { + LOOP_TRACE(pfd.loop.data[i], DL_SOCKETS, "socket id %d got revents=0x%x", i, pfd.pfd.data[i].revents); + ev_send_loop(thr->meta, &pfd.loop.data[i]->event); + } + } + + bug("An infinite loop has ended."); +} + +static void +bird_thread_cleanup(void *_thr) +{ + struct bird_thread *thr = _thr; + struct birdloop *meta = thr->meta; + ASSERT_DIE(birdloop_inside(&main_birdloop)); + + /* Wait until the thread actually finishes */ + ASSERT_DIE(meta); + birdloop_enter(meta); + birdloop_leave(meta); + + /* No more wakeup */ + wakeup_free(thr); + + /* Thread attributes no longer needed */ + pthread_attr_destroy(&thr->thread_attr); + + /* Free the meta loop */ + thr->meta->thread = NULL; + thr->meta = NULL; + birdloop_free(meta); +} + +static struct bird_thread * +bird_thread_start(struct birdloop_pickup_group *group) +{ + ASSERT_DIE(birdloop_inside(&main_birdloop)); + + struct birdloop *meta = birdloop_new_no_pickup(&root_pool, DOMAIN_ORDER(meta), "Thread Meta"); + pool *p = birdloop_pool(meta); + + birdloop_enter(meta); + LOCK_DOMAIN(attrs, group->domain); + + struct bird_thread *thr = mb_allocz(p, sizeof(*thr)); + thr->pool = p; + thr->cleanup_event = (event) { .hook = bird_thread_cleanup, .data = thr, }; + thr->group = group; + thr->max_latency_ns = (group->max_latency ?: 5 S) TO_NS; + thr->meta = meta; + thr->meta->thread = thr; + + wakeup_init(thr); + ev_init_list(&thr->priority_events, NULL, "Thread direct event list"); + + add_tail(&group->threads, &thr->n); + + int e = 0; + + if (e = pthread_attr_init(&thr->thread_attr)) + die("pthread_attr_init() failed: %M", e); + + /* We don't have to worry about thread stack size so much. 
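   As merged, worker threads simply inherit the platform's default stack
   size; THREAD_STACK_SIZE would only matter if this block were ever
   re-enabled. Should the effective default need to be logged for
   diagnostics, a small sketch using only the POSIX attribute API could
   look like this (illustrative only, not part of the patch; on most
   platforms a freshly initialized attribute reports the system default):

     size_t stk = 0;
     if (!pthread_attr_getstacksize(&thr->thread_attr, &stk))
       log(L_DEBUG "Worker thread stack defaults to %lu bytes", (unsigned long) stk);

   The commented-out call that follows shows how a fixed size would be set: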
+ if (e = pthread_attr_setstacksize(&thr->thread_attr, THREAD_STACK_SIZE)) + die("pthread_attr_setstacksize(%u) failed: %M", THREAD_STACK_SIZE, e); + */ + + if (e = pthread_attr_setdetachstate(&thr->thread_attr, PTHREAD_CREATE_DETACHED)) + die("pthread_attr_setdetachstate(PTHREAD_CREATE_DETACHED) failed: %M", e); + + if (e = pthread_create(&thr->thread_id, &thr->thread_attr, bird_thread_main, thr)) + die("pthread_create() failed: %M", e); + + group->thread_count++; + + UNLOCK_DOMAIN(attrs, group->domain); + birdloop_leave(meta); + return thr; +} + +static void +bird_thread_start_event(void *_data) +{ + struct birdloop_pickup_group *group = _data; + bird_thread_start(group); +} + +static struct birdloop *thread_dropper; +static event *thread_dropper_event; +static uint thread_dropper_goal; + +static void +bird_thread_dropper_free(void *data) +{ + struct birdloop *tdl_stop = data; + birdloop_free(tdl_stop); +} + +static void +bird_thread_shutdown(void * _ UNUSED) +{ + struct birdloop_pickup_group *group = this_thread->group; + LOCK_DOMAIN(attrs, group->domain); + int dif = group->thread_count - thread_dropper_goal; + struct birdloop *tdl_stop = NULL; + + if (dif > 0) + ev_send_loop(thread_dropper, thread_dropper_event); + else + { + tdl_stop = thread_dropper; + thread_dropper = NULL; + } + + UNLOCK_DOMAIN(attrs, group->domain); + + THREAD_TRACE(DL_SCHEDULING, "Thread pickup size differs from dropper goal by %d%s", dif, tdl_stop ? ", stopping" : ""); + + if (tdl_stop) + { + birdloop_stop_self(tdl_stop, bird_thread_dropper_free, tdl_stop); + return; + } + + struct bird_thread *thr = this_thread; + + LOCK_DOMAIN(attrs, group->domain); + /* Leave the thread-picker list to get no more loops */ + rem_node(&thr->n); + group->thread_count--; + + /* Fix the busy count */ + if (thr->busy_active) + group->thread_busy_count--; + + UNLOCK_DOMAIN(attrs, group->domain); + + /* Leave the thread-dropper loop as we aren't going to return. */ + birdloop_leave(thread_dropper); + + /* Last try to run the priority event list; ruin it then to be extra sure */ + ev_run_list(&this_thread->priority_events); + memset(&this_thread->priority_events, 0xa5, sizeof(this_thread->priority_events)); + + /* Drop loops including the thread dropper itself */ + while (!EMPTY_LIST(thr->loops)) + { + struct birdloop *loop = HEAD(thr->loops); + + /* Remove loop from this thread's list */ + this_thread->loop_count--; + rem_node(&loop->n); + + /* Unset loop's thread */ + birdloop_set_thread(loop, NULL, group); + } + + /* Let others know about new loops */ + LOCK_DOMAIN(attrs, group->domain); + if (!EMPTY_LIST(group->loops)) + wakeup_do_kick(SKIP_BACK(struct bird_thread, n, HEAD(group->threads))); + UNLOCK_DOMAIN(attrs, group->domain); + + /* Request thread cleanup from main loop */ + ev_send_loop(&main_birdloop, &thr->cleanup_event); + + /* Local pages not needed anymore */ + flush_local_pages(); + + /* Unregister from RCU */ + rcu_thread_stop(); + + /* Now we can be cleaned up */ + birdloop_leave(thr->meta); + + /* Exit! 
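   By this point the shutdown sequence is complete: the thread has left the
   pickup list, handed every loop (including the thread dropper itself) back
   to the group via birdloop_set_thread(loop, NULL, group), kicked another
   thread to adopt them if any are waiting, queued its cleanup_event for the
   main loop, flushed its local pages, stopped its RCU reader state and left
   its meta loop. All that remains below is the final trace message and
   pthread_exit(); the wakeup machinery, the thread attributes and the meta
   loop itself are freed later by bird_thread_cleanup() running in the main
   loop.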
*/ + THREAD_TRACE(DL_SCHEDULING, "Stopped"); + pthread_exit(NULL); +} + +void +bird_thread_commit(struct config *new, struct config *old UNUSED) +{ + ASSERT_DIE(birdloop_inside(&main_birdloop)); + + if (new->shutdown) + return; + + if (!new->thread_count) + new->thread_count = 1; + + while (1) + { + struct birdloop_pickup_group *group = &pickup_groups[0]; + LOCK_DOMAIN(attrs, group->domain); + + int dif = group->thread_count - (thread_dropper_goal = new->thread_count); - _Bool thread_dropper_running = !!thread_dropper; ++ bool thread_dropper_running = !!thread_dropper; + + UNLOCK_DOMAIN(attrs, group->domain); + + if (dif < 0) + { + bird_thread_start(group); + continue; + } + + if ((dif > 0) && !thread_dropper_running) + { + struct birdloop *tdl = birdloop_new(&root_pool, DOMAIN_ORDER(control), group->max_latency, "Thread dropper"); + birdloop_enter(tdl); + event *tde = ev_new_init(tdl->pool, bird_thread_shutdown, NULL); + + LOCK_DOMAIN(attrs, group->domain); + thread_dropper = tdl; + thread_dropper_event = tde; + UNLOCK_DOMAIN(attrs, group->domain); + + ev_send_loop(thread_dropper, thread_dropper_event); + birdloop_leave(tdl); + } + + return; + } +} + +/* Cleanup after last thread */ +static void +bird_thread_sync_finish(void *_sync) +{ + ASSERT_THE_BIRD_LOCKED; + struct bird_thread_syncer *sync = _sync; + + /* Keep necessary pointers locally */ + pool *p = sync->pool; + DOMAIN(control) lock = sync->lock; + LOCK_DOMAIN(control, lock); + + /* This invalidates the `sync` pointer */ + CALL(sync->finish, sync); + + /* Free pool and domain */ + rp_free(p); + UNLOCK_DOMAIN(control, lock); + DOMAIN_FREE(control, lock); +} + +/* Process regular one thread hook */ +static void +bird_thread_sync_one(void *_sync) +{ + struct bird_thread_syncer *sync = _sync; + + LOCK_DOMAIN(control, sync->lock); + CALL(sync->hook, sync); + sync->done++; + if (sync->done == sync->total) + ev_send_loop(&main_birdloop, ev_new_init(sync->pool, bird_thread_sync_finish, sync)); + UNLOCK_DOMAIN(control, sync->lock); +} + +void +bird_thread_sync_all(struct bird_thread_syncer *sync, + void (*hook)(struct bird_thread_syncer *), + void (*done)(struct bird_thread_syncer *), const char *name) +{ + sync->lock = DOMAIN_NEW(control); + LOCK_DOMAIN(control, sync->lock); + + sync->pool = rp_new(&root_pool, sync->lock.control, name); + sync->hook = hook; + sync->finish = done; + + for (int i=0; i<2; i++) + { + struct birdloop_pickup_group *group = &pickup_groups[i]; + + LOCK_DOMAIN(attrs, group->domain); + + struct bird_thread *thr; + WALK_LIST(thr, group->threads) + { + sync->total++; + ev_send(&thr->priority_events, ev_new_init(sync->pool, bird_thread_sync_one, sync)); + wakeup_do_kick(thr); + } + + UNLOCK_DOMAIN(attrs, group->domain); + } + + UNLOCK_DOMAIN(control, sync->lock); +} + + +struct bird_thread_show_data { + struct bird_thread_syncer sync; + cli *cli; + linpool *lp; + u8 show_loops; + uint line_pos; + uint line_max; + const char **lines; +}; + +#define tsd_append(...) do { \ + if (!tsd->lines) \ + tsd->lines = mb_allocz(tsd->sync.pool, sizeof(const char *) * tsd->line_max); \ + if (tsd->line_pos >= tsd->line_max) \ + tsd->lines = mb_realloc(tsd->lines, sizeof (const char *) * (tsd->line_max *= 2)); \ + tsd->lines[tsd->line_pos++] = lp_sprintf(tsd->lp, __VA_ARGS__); \ +} while (0) + +static void +bird_thread_show_cli_cont(struct cli *c UNUSED) +{ + /* Explicitly do nothing to prevent CLI from trying to parse another command. 
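   Installing this no-op as cli->cont parks the CLI session while the
   per-thread show events are still collecting data, and the paired
   bird_thread_show_cli_cleanup() below returns 1 so a dropped connection is
   not freed underneath the collectors either. Both hooks are cleared again
   in cmd_show_threads_done(), which prints the buffered lines and calls
   cli_write_trigger() once every thread has reported.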
*/ +} + +static int +bird_thread_show_cli_cleanup(struct cli *c UNUSED) +{ + return 1; /* Defer the cleanup until the writeout is finished. */ +} + +static void +bird_thread_show_spent_time(struct bird_thread_show_data *tsd, const char *name, struct spent_time *st) +{ + char b[TIME_BY_SEC_SIZE * sizeof("1234567890, ")], *bptr = b, *bend = b + sizeof(b); + uint cs = CURRENT_SEC; + uint fs = NSEC_TO_SEC(st->last_written_ns); + + for (uint i = 0; i <= cs && i < TIME_BY_SEC_SIZE; i++) + bptr += bsnprintf(bptr, bend - bptr, "% 10lu ", + (cs - i > fs) ? 0 : st->by_sec_ns[(cs - i) % TIME_BY_SEC_SIZE]); + bptr[-1] = 0; /* Drop the trailing space */ + + tsd_append(" %s total time: % 9t s; last %d secs [ns]: %s", name, st->total_ns NS, MIN(CURRENT_SEC+1, TIME_BY_SEC_SIZE), b); +} + +static void +bird_thread_show_loop(struct bird_thread_show_data *tsd, struct birdloop *loop) +{ + tsd_append(" Loop %s", domain_name(loop->time.domain)); + bird_thread_show_spent_time(tsd, "Working ", &loop->working); + bird_thread_show_spent_time(tsd, "Locking ", &loop->locking); +} + +static void +bird_thread_show(struct bird_thread_syncer *sync) +{ + SKIP_BACK_DECLARE(struct bird_thread_show_data, tsd, sync, sync); + + if (!tsd->lp) + tsd->lp = lp_new(tsd->sync.pool); + + if (tsd->show_loops) + tsd_append("Thread %p%s (busy counter %d)", this_thread, this_thread->busy_active ? " [busy]" : "", this_thread->busy_counter); + + u64 total_time_ns = 0; + struct birdloop *loop; + WALK_LIST(loop, this_thread->loops) + { + if (tsd->show_loops) + bird_thread_show_loop(tsd, loop); + + total_time_ns += loop->working.total_ns + loop->locking.total_ns; + } + + if (tsd->show_loops) + { + tsd_append(" Total working time: %t", total_time_ns NS); + bird_thread_show_spent_time(tsd, "Overhead", &this_thread->overhead); + bird_thread_show_spent_time(tsd, "Idle ", &this_thread->idle); + } + else + tsd_append("Thread %p working %t s overhead %t s", + this_thread, total_time_ns NS, this_thread->overhead.total_ns NS); +} + +static void +cmd_show_threads_done(struct bird_thread_syncer *sync) +{ + SKIP_BACK_DECLARE(struct bird_thread_show_data, tsd, sync, sync); + ASSERT_DIE(birdloop_inside(&main_birdloop)); + + tsd->cli->cont = NULL; + tsd->cli->cleanup = NULL; + + for (int i=0; i<2; i++) + { + struct birdloop_pickup_group *group = &pickup_groups[i]; + + LOCK_DOMAIN(attrs, group->domain); + uint count = 0; + u64 total_time_ns = 0; + if (!EMPTY_LIST(group->loops)) + { + if (tsd->show_loops) + tsd_append("Unassigned loops in group %d:", i); + + struct birdloop *loop; + WALK_LIST(loop, group->loops) + { + if (tsd->show_loops) + bird_thread_show_loop(tsd, loop); + + total_time_ns += loop->working.total_ns + loop->locking.total_ns; + count++; + } + + if (tsd->show_loops) + tsd_append(" Total working time: %t", total_time_ns NS); + else + tsd_append("Unassigned %d loops in group %d, total time %t", count, i, total_time_ns NS); + } + else + tsd_append("All loops in group %d are assigned.", i); + + UNLOCK_DOMAIN(attrs, group->domain); + } + + for (uint i = 0; i < tsd->line_pos - 1; i++) + cli_printf(tsd->cli, -1027, "%s", tsd->lines[i]); + + cli_printf(tsd->cli, 1027, "%s", tsd->lines[tsd->line_pos-1]); + cli_write_trigger(tsd->cli); + mb_free(tsd); +} + +void +cmd_show_threads(int show_loops) +{ + struct bird_thread_show_data *tsd = mb_allocz(&root_pool, sizeof(struct bird_thread_show_data)); + tsd->cli = this_cli; + tsd->show_loops = show_loops; + tsd->line_pos = 0; + tsd->line_max = 64; + + this_cli->cont = bird_thread_show_cli_cont; + 
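/* The cont/cleanup hooks installed around this point keep the session
 * parked while bird_thread_sync_all() (called just below) fans the show
 * request out to every worker thread: each thread runs bird_thread_show()
 * from its priority event list and appends lines under sync->lock, and the
 * last thread to finish schedules the completion hook (here
 * cmd_show_threads_done()) back on the main loop. A minimal sketch of that
 * counting handshake, with illustrative names only (struct syncer,
 * collect(), schedule_on_main() are not BIRD API):
 *
 *   static void sync_one(struct syncer *s)   // runs once on each thread
 *   {
 *     lock(&s->lock);
 *     collect(s);                            // per-thread work
 *     if (++s->done == s->total)             // last one to finish
 *       schedule_on_main(s->finish_event);   // single completion callback
 *     unlock(&s->lock);
 *   }
 */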
this_cli->cleanup = bird_thread_show_cli_cleanup;
+
+  bird_thread_sync_all(&tsd->sync, bird_thread_show, cmd_show_threads_done, "Show Threads");
+}
+
- _Bool task_still_in_limit(void)
++bool task_still_in_limit(void)
+{
+  static u64 main_counter = 0;
+  if (this_birdloop == &main_birdloop)
+    return (++main_counter % 2048); /* This is a hack because of no accounting in mainloop */
+  else
+    return ns_now() < account_last + this_thread->max_loop_time_ns;
+}
+
- _Bool task_before_halftime(void)
++bool task_before_halftime(void)
+{
+  return ns_now() < account_last + this_thread->max_loop_time_ns / 2;
+}
+
+
+/*
+ * Birdloop
+ */
+
+static struct bird_thread main_thread;
+struct birdloop main_birdloop = { .thread = &main_thread, };
+_Thread_local struct birdloop *this_birdloop;
+
+static void birdloop_enter_locked(struct birdloop *loop);
+
+void
+birdloop_init(void)
+{
+  ns_init();
+
+  for (int i=0; i<2; i++)
+  {
+    struct birdloop_pickup_group *group = &pickup_groups[i];
+
+    group->domain = DOMAIN_NEW(attrs);
+    DOMAIN_SETUP(attrs, group->domain, "Loop Pickup", NULL);
+    init_list(&group->loops);
+    init_list(&group->threads);
+  }
+
+  wakeup_init(main_birdloop.thread);
+
+  main_birdloop.time.domain = the_bird_domain.the_bird;
+  main_birdloop.time.loop = &main_birdloop;
+
+  times_update();
+  timers_init(&main_birdloop.time, &root_pool);
+
+  birdloop_enter_locked(&main_birdloop);
+  this_birdloop = &main_birdloop;
+  this_thread = &main_thread;
+
+  defer_init(lp_new(&root_pool));
+}
+
+static void
+birdloop_stop_internal(struct birdloop *loop)
+{
+  LOOP_TRACE(loop, DL_SCHEDULING, "Stopping");
+
+  /* Block incoming pings */
+  u32 ltt = atomic_load_explicit(&loop->thread_transition, memory_order_acquire);
+  while (!atomic_compare_exchange_strong_explicit(
+        &loop->thread_transition, &ltt, LTT_PING,
+        memory_order_acq_rel, memory_order_acquire))
+    ;
+
+  /* Flush remaining events */
+  ASSERT_DIE(!ev_run_list(&loop->event_list));
+
+  /* Drop timers */
+  timer *t;
+  while (t = timers_first(&loop->time))
+    tm_stop(t);
+
+  /* Drop sockets */
+  sock *s;
+  WALK_LIST_FIRST2(s, n, loop->sock_list)
+    birdloop_remove_socket(loop, s);
+
+  /* Unschedule from Meta */
+  ev_postpone(&loop->event);
+  tm_stop(&loop->timer);
+
+  /* Remove from thread loop list */
+  ASSERT_DIE(loop->thread == this_thread);
+  rem_node(&loop->n);
+  loop->thread = NULL;
+
+  /* Uncount from thread group */
+  LOCK_DOMAIN(attrs, this_thread->group->domain);
+  this_thread->group->loop_count--;
+  UNLOCK_DOMAIN(attrs, this_thread->group->domain);
+
+  /* Leave the loop context without causing any other fuss */
+  ASSERT_DIE(!ev_active(&loop->event));
+  loop->ping_pending = 0;
+  account_to(&this_thread->overhead);
+  this_birdloop = this_thread->meta;
+  birdloop_leave(loop);
+
+  /* Request local socket reload */
+  this_thread->sock_changed = 1;
+
+  /* Call the stopped hook from the main loop */
+  loop->event.hook = loop->stopped;
+  loop->event.data = loop->stop_data;
+  ev_send_loop(&main_birdloop, &loop->event);
+}
+
+static void
+birdloop_run(void *_loop)
+{
+  /* Run priority events before the loop is executed */
+  ev_run_list(&this_thread->priority_events);
+
+  struct birdloop *loop = _loop;
+  account_to(&loop->locking);
+  birdloop_enter(loop);
+  this_birdloop = loop;
+
+  /* Wait until pingers end to wait for all events to actually arrive */
+  for (u32 ltt;
+      ltt = atomic_load_explicit(&loop->thread_transition, memory_order_acquire);
+      )
+  {
+    ASSERT_DIE(ltt == LTT_PING);
+    birdloop_yield();
+  }
+
+  /* Now we can actually do some work */
+  u64 dif = 
account_to(&loop->working); + + struct global_runtime *gr = atomic_load_explicit(&global_runtime, memory_order_relaxed); + if (dif > this_thread->max_loop_time_ns + gr->latency_limit TO_NS) + LOOP_WARN(loop, "locked %lu us after its scheduled end time", dif NS TO_US); + + uint repeat, loop_runs = 0; + do { + LOOP_TRACE(loop, DL_SCHEDULING, "Regular run (%d)", loop_runs); + loop_runs++; + + if (loop->stopped) + /* Birdloop left inside the helper function */ + return birdloop_stop_internal(loop); + + /* Process socket TX */ + sockets_fire(loop, 0, 1); + + /* Run timers */ + timers_fire(&loop->time, 0); + + /* Run events */ + repeat = ev_run_list(&loop->event_list); + + /* Process socket RX */ + sockets_fire(loop, 1, 0); + + /* Check end time */ + } while (repeat && task_still_in_limit()); + + /* Request meta timer */ + timer *t = timers_first(&loop->time); + if (t) + tm_start_in(&loop->timer, tm_remains(t), this_thread->meta); + else + tm_stop(&loop->timer); + + /* Request re-run if needed */ + if (repeat) + ev_send_loop(this_thread->meta, &loop->event); + + /* Collect socket change requests */ + this_thread->sock_changed |= loop->sock_changed; + loop->sock_changed = 0; + + account_to(&this_thread->overhead); + this_birdloop = this_thread->meta; + birdloop_leave(loop); +} + +static void +birdloop_run_timer(timer *tm) +{ + struct birdloop *loop = tm->data; + LOOP_TRACE(loop, DL_TIMERS, "Meta timer ready, requesting run"); + ev_send_loop(loop->thread->meta, &loop->event); +} + +static struct birdloop * +birdloop_vnew_internal(pool *pp, uint order, struct birdloop_pickup_group *group, const char *name, va_list args) +{ + struct domain_generic *dg = domain_new(order, 1); + DG_LOCK(dg); + + pool *p = rp_vnewf(pp, dg, name, args); + struct birdloop *loop = mb_allocz(p, sizeof(struct birdloop)); + loop->pool = p; + + loop->time.domain = dg; + loop->time.loop = loop; + + atomic_store_explicit(&loop->thread_transition, 0, memory_order_relaxed); + + birdloop_enter_locked(loop); + + ev_init_list(&loop->event_list, loop, p->name); + timers_init(&loop->time, p); + sockets_init(loop); + + loop->event = (event) { .hook = birdloop_run, .data = loop, }; + loop->timer = (timer) { .hook = birdloop_run_timer, .data = loop, }; + + LOOP_TRACE(loop, DL_SCHEDULING, "New loop: %s", p->name); + + if (group) + { + LOCK_DOMAIN(attrs, group->domain); + group->loop_count++; + group->loop_unassigned_count++; + add_tail(&group->loops, &loop->n); + if (EMPTY_LIST(group->threads)) + ev_send(&global_event_list, &group->start_threads); + else + wakeup_do_kick(SKIP_BACK(struct bird_thread, n, HEAD(group->threads))); + UNLOCK_DOMAIN(attrs, group->domain); + } + else + loop->n.next = loop->n.prev = &loop->n; + + birdloop_leave(loop); + + return loop; +} + +static struct birdloop * +birdloop_new_no_pickup(pool *pp, uint order, const char *name, ...) +{ + va_list args; + va_start(args, name); + struct birdloop *loop = birdloop_vnew_internal(pp, order, NULL, name, args); + va_end(args); + return loop; +} + +struct birdloop * +birdloop_new(pool *pp, uint order, btime max_latency, const char *name, ...) +{ + va_list args; + va_start(args, name); + struct birdloop *loop = birdloop_vnew_internal(pp, order, max_latency ? 
&pickup_groups[1] : &pickup_groups[0], name, args); + va_end(args); + return loop; +} + +static void +birdloop_do_stop(struct birdloop *loop, void (*stopped)(void *data), void *data) +{ + LOOP_TRACE(loop, DL_SCHEDULING, "Stop requested"); + + loop->stopped = stopped; + loop->stop_data = data; + + birdloop_do_ping(loop); +} + +void +birdloop_stop(struct birdloop *loop, void (*stopped)(void *data), void *data) +{ + DG_LOCK(loop->time.domain); + birdloop_do_stop(loop, stopped, data); + DG_UNLOCK(loop->time.domain); +} + +void +birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *data) +{ + ASSERT_DIE(loop == birdloop_current); + ASSERT_DIE(DG_IS_LOCKED(loop->time.domain)); + + birdloop_do_stop(loop, stopped, data); +} + +void +birdloop_free(struct birdloop *loop) +{ + ASSERT_DIE(loop->thread == NULL); + + struct domain_generic *dg = loop->time.domain; + DG_LOCK(dg); + rp_free(loop->pool); + DG_UNLOCK(dg); + domain_free(dg); +} + +static void +birdloop_enter_locked(struct birdloop *loop) +{ + ASSERT_DIE(DG_IS_LOCKED(loop->time.domain)); + ASSERT_DIE(!birdloop_inside(loop)); + + /* Store the old context */ + loop->prev_loop = birdloop_current; + + /* Put the new context */ + birdloop_current = loop; +} + +void +birdloop_enter(struct birdloop *loop) +{ + DG_LOCK(loop->time.domain); + return birdloop_enter_locked(loop); +} + +static void +birdloop_leave_locked(struct birdloop *loop) +{ + /* Check the current context */ + ASSERT_DIE(birdloop_current == loop); + + /* Send pending pings */ + if (loop->ping_pending) + { + LOOP_TRACE(loop, DL_PING, "sending pings on leave"); + loop->ping_pending = 0; + birdloop_do_ping(loop); + } + + /* Restore the old context */ + birdloop_current = loop->prev_loop; +} + +void +birdloop_leave(struct birdloop *loop) +{ + birdloop_leave_locked(loop); + DG_UNLOCK(loop->time.domain); +} + +void +birdloop_mask_wakeups(struct birdloop *loop) +{ + ASSERT_DIE(birdloop_wakeup_masked == NULL); + birdloop_wakeup_masked = loop; +} + +void +birdloop_unmask_wakeups(struct birdloop *loop) +{ + ASSERT_DIE(birdloop_wakeup_masked == loop); + birdloop_wakeup_masked = NULL; + if (birdloop_wakeup_masked_count) + wakeup_do_kick(loop->thread); + + birdloop_wakeup_masked_count = 0; +} + +void +birdloop_yield(void) +{ + usleep(100); +} + +void +ev_send_this_thread(event *e) +{ + if (this_thread == &main_thread) + ev_send_loop(&main_birdloop, e); + else + ev_send(&this_thread->priority_events, e); +} diff --cc sysdep/unix/io.c index 7e974ec10,9b4990204..892044c5d --- a/sysdep/unix/io.c +++ b/sysdep/unix/io.c @@@ -1827,13 -1680,6 +1827,13 @@@ sk_recvmsg(sock *s static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; } - _Bool ++bool +sk_tx_pending(sock *s) +{ + return s->ttx != s->tpos; +} + + static int sk_maybe_write(sock *s) { diff --cc sysdep/unix/krt.c index 300d695dc,0664f4c1d..d6c1a31de --- a/sysdep/unix/krt.c +++ b/sysdep/unix/krt.c @@@ -445,29 -718,10 +445,29 @@@ done: lp_flush(krt_filter_lp); } - static _Bool -static void ++static bool krt_init_scan(struct krt_proto *p) { - bmap_reset(&p->seen_map, 1024); + switch (p->sync_state) + { + case KPS_IDLE: + rt_refresh_begin(&p->p.main_channel->in_req); + bmap_reset(&p->seen_map, 1024); + p->sync_state = KPS_SCANNING; + return 1; + + case KPS_SCANNING: + bug("Kernel scan double-init"); + + case KPS_PRUNING: + log(L_WARN "%s: Can't scan, still pruning", p->p.name); + return 0; + + case KPS_FLUSHING: + bug("Can't scan, flushing"); + } + + bug("Bad kernel sync state"); } static void