Merge commit 'b95dc8f29f18eb177f91fdc4bf0716fac9b15366' into mq-config-ref

author Maria Matejka <mq@ucw.cz>

Wed, 26 Jun 2024 15:19:24 +0000 (17:19 +0200)

committer Maria Matejka <mq@ucw.cz>

Wed, 26 Jun 2024 15:19:24 +0000 (17:19 +0200)
author Maria Matejka <mq@ucw.cz>
Wed, 26 Jun 2024 15:19:24 +0000 (17:19 +0200)
committer Maria Matejka <mq@ucw.cz>
Wed, 26 Jun 2024 15:19:24 +0000 (17:19 +0200)
diff --cc conf/confbase.Y

index d1d3604beba8f5cb961f9fddfcbdbba10695f320,ed3c1e6e117f77f2fb25ac7f27303106da89f1fe..56dd15e7f74c8768b1e25c5ea5b056ef1c73d6b1
--- 1/conf/confbase.Y
--- 2/conf/confbase.Y
+++ b/conf/confbase.Y
@@@ -28,8 -27,6 +28,8 @@@ CF_HD
   
   CF_DEFINES
   
- static _Bool this_sadr_from_hack_active;
++static bool this_sadr_from_hack_active;
+ +
   static void
   check_u16(uint val)
   {
diff --cc lib/io-loop.h

index 03fe25292ed4fc828bc46e6a236a0c29f8fa206f,0000000000000000000000000000000000000000..b893aa4531aa4e6b08c0943e2ab38681ba9af29d

mode 100644,000000..100644
--- 1/lib/io-loop.h
--- /dev/null
+++ b/lib/io-loop.h
@@@ -1,73 -1,0 +1,73 @@@
- _Bool task_still_in_limit(void);
- _Bool task_before_halftime(void);
+ +/*
+ + *    BIRD -- I/O and event loop
+ + *
+ + *    Can be freely distributed and used under the terms of the GNU GPL.
+ + */
+ +
+ +#ifndef _BIRD_IO_LOOP_H_
+ +#define _BIRD_IO_LOOP_H_
+ +
+ +#include "nest/bird.h"
+ +#include "lib/lists.h"
+ +#include "lib/locking.h"
+ +#include "lib/resource.h"
+ +#include "lib/event.h"
+ +#include "lib/socket.h"
+ +
+ +extern struct birdloop main_birdloop;
+ +
+ +/* Currently running birdloop */
+ +extern _Thread_local struct birdloop *this_birdloop;
+ +
+ +/* Check that the task has enough time to do a bit more */
- _Bool birdloop_inside(struct birdloop *loop);
++bool task_still_in_limit(void);
++bool task_before_halftime(void);
+ +
+ +#define MAYBE_DEFER_TASK(target, event, fmt, args...) do { /* when the task is out of its time limit: re-send the event and leave the calling function */ \
+ +  if (!task_still_in_limit()) { \
+ +    if (atomic_load_explicit(&global_runtime, memory_order_relaxed)->latency_debug & DL_SCHEDULING) \
+ +      log(L_TRACE "Deferring " fmt, ##args); \
+ +    return ev_send(target, event); /* returns from the enclosing function, not just from the macro */ \
+ +  } } while (0)
+ +
+ +/* Start a new birdloop owned by given pool and domain */
+ +struct birdloop *birdloop_new(pool *p, uint order, btime max_latency, const char *fmt, ...);
+ +
+ +/* Stop the loop. At the end, the @stopped callback is called unlocked in tail
+ + * position to finish cleanup. Run birdloop_free() from that callback to free
+ + * the loop itself. */
+ +void birdloop_stop(struct birdloop *loop, void (*stopped)(void *data), void *data);
+ +void birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *data);
+ +void birdloop_free(struct birdloop *loop);
+ +
+ +/* Run this event in this thread's priority event list */
+ +void ev_send_this_thread(event *e);
+ +
+ +/* Get birdloop's time heap */
+ +struct timeloop *birdloop_time_loop(struct birdloop *loop);
+ +#define birdloop_domain(l)  (birdloop_time_loop((l))->domain)
+ +
+ +/* Get birdloop's pool */
+ +pool *birdloop_pool(struct birdloop *loop);
+ +
+ +/* Enter and exit the birdloop */
+ +void birdloop_enter(struct birdloop *loop);
+ +void birdloop_leave(struct birdloop *loop);
+ +
++bool birdloop_inside(struct birdloop *loop);
+ +
+ +void birdloop_mask_wakeups(struct birdloop *loop);
+ +void birdloop_unmask_wakeups(struct birdloop *loop);
+ +
+ +void birdloop_link(struct birdloop *loop);
+ +void birdloop_unlink(struct birdloop *loop);
+ +
+ +void birdloop_ping(struct birdloop *loop);
+ +
+ +/* Setup sockets */
+ +void birdloop_add_socket(struct birdloop *, struct birdsock *);
+ +void birdloop_remove_socket(struct birdloop *, struct birdsock *);
+ +
+ +void birdloop_init(void);
+ +
+ +#endif /* _BIRD_IO_LOOP_H_ */
diff --cc lib/lists.h

index 86ff59c9cc0b8cca224144f64a707b345b8a7acd,7e6d5467011f9eb9c6a160d7e4ac62a923935a59..b106687f060ce675ec5f4537d264bb065d231a20
--- 1/lib/lists.h
--- 2/lib/lists.h
+++ b/lib/lists.h
@@@ -69,18 -69,6 +69,18 @@@ typedef union list {                        /* In fact two o
   
   #define EMPTY_LIST(list) (!(list).head->next)
   
- static inline _Bool
++static inline bool
+ +enlisted(node *n)
+ +{
+ +  /* A node is expected to have either both links set or none of them. */
+ +  const int links = (n->next ? 1 : 0) + (n->prev ? 1 : 0);
+ +
+ +  if (links == 0)
+ +    return 0;
+ +
+ +  if (links == 2)
+ +    return 1;
+ +
+ +  /* Exactly one link set: the node is corrupted. */
+ +  if (links == 1)
+ +    bug("Garbled event list node");
+ +
+ +  bug("Maths is broken. And you should see a new heaven and a new earth: for the first heaven and the first earth had been passed away.");
+ +}
   
   #ifndef _BIRD_LISTS_C_
   #define LIST_INLINE static inline
diff --cc lib/lockfree.c

index 17c17d1898f53a44020c30529bd3f6ced9547963,0000000000000000000000000000000000000000..2d57b46a29ee2163e95b9e18347e8c568b303d87

mode 100644,000000..100644
--- 1/lib/lockfree.c
--- /dev/null
+++ b/lib/lockfree.c
@@@ -1,455 -1,0 +1,455 @@@
-     _Bool f = 0;
+ +/*
+ + *    BIRD Library -- Generic lock-free structures
+ + *
+ + *    (c) 2023--2024 Maria Matejka <mq@jmq.cz>
+ + *    (c) 2023--2024 CZ.NIC, z.s.p.o.
+ + *
+ + *    Can be freely distributed and used under the terms of the GNU GPL.
+ + */
+ +
+ +#include "lib/birdlib.h"
+ +#include "lib/lockfree.h"
+ +
+ +#define LOCAL_DEBUG
+ +
+ +void lfuc_unlock_deferred(struct deferred_call *dc)  /* deferred-call hook installed by lfuc_unlock() */
+ +{
+ +  SKIP_BACK_DECLARE(struct lfuc_unlock_queue_item, luqi, dc, dc);  /* presumably declares luqi as the queue item embedding dc -- confirm against the macro */
+ +  lfuc_unlock_immediately(luqi->c, luqi->el, luqi->ev);
+ +}
+ +
+ +#if 0
+ +#define lfjour_debug(...) log(L_TRACE __VA_ARGS__)
+ +#define lfjour_debug_detailed(...) log(L_TRACE __VA_ARGS__)
+ +#elif 0
+ +#define lfjour_debug(...) log(L_TRACE __VA_ARGS__)
+ +#define lfjour_debug_detailed(...)
+ +#else
+ +#define lfjour_debug(...)
+ +#define lfjour_debug_detailed(...)
+ +#endif
+ +
+ +#define LBI(j, b, p)  ((struct lfjour_item *)(((void *) (b)->_block) + ((j)->item_size * (p))))  /* pointer to the item at position p of block b */
+ +#define LBP(j, b, i)  ({ /* inverse of LBI: position of item i inside block b */ \
+ +    off_t off = ((void *) (i)) - ((void *) (b)->_block); \
+ +    u32 s = (j)->item_size; \
+ +    ASSERT_DIE(off < page_size); \
+ +    ASSERT_DIE((off % s) == 0); \
+ +    off / s; \
+ +    })
+ +
+ +struct lfjour_item *  /* opens a new item at the journal tail; NULL when there are neither pending blocks nor recipients */
+ +lfjour_push_prepare(struct lfjour *j)
+ +{
+ +  ASSERT_DIE(!j->domain || DG_IS_LOCKED(j->domain));
+ +  ASSERT_DIE(!j->open);  /* only one item may be open at a time */
+ +
+ +  if (EMPTY_TLIST(lfjour_block, &j->pending) &&
+ +      EMPTY_TLIST(lfjour_recipient, &j->recipients))
+ +    return NULL;
+ +
+ +  struct lfjour_block *block = NULL;
+ +  u32 end = 0;
+ +
+ +  if (!EMPTY_TLIST(lfjour_block, &j->pending))
+ +  {
+ +    block = j->pending.last;
+ +    end = atomic_load_explicit(&block->end, memory_order_relaxed);
+ +    if (end >= j->item_count)  /* the last block is full, a fresh one is needed */
+ +    {
+ +      ASSERT_DIE(end == j->item_count);
+ +      block = NULL;
+ +      end = 0;
+ +    }
+ +  }
+ +
+ +  if (!block)
+ +  {
+ +    block = alloc_page();
+ +    lfjour_debug("lfjour(%p)_push_prepare: allocating block %p", j, block);
+ +    *block = (struct lfjour_block) {};  /* start with a zeroed block header */
+ +    lfjour_block_add_tail(&j->pending, block);
+ +  }
+ +
+ +  struct lfjour_item *i = LBI(j, block, end);
+ +  *i = (struct lfjour_item) {
+ +    .seq = j->next_seq++,  /* items get consecutive sequence numbers */
+ +  };
+ +
+ +  return j->open = i;  /* stays open until lfjour_push_commit() resets j->open */
+ +}
+ +
+ +void  /* publishes the item opened by lfjour_push_prepare() */
+ +lfjour_push_commit(struct lfjour *j)
+ +{
+ +  ASSERT_DIE(!j->domain || DG_IS_LOCKED(j->domain));
+ +  ASSERT_DIE(j->open);
+ +  struct lfjour_block *b = PAGE_HEAD(j->open);
+ +  ASSERT_DIE(b == j->pending.last);
+ +
+ +  lfjour_debug("lfjour(%p)_push_commit of %p, seq=%lu", j, j->open, j->open->seq);
+ +
+ +  u32 end = atomic_fetch_add_explicit(&b->end, 1, memory_order_release);  /* fetch_add yields the old end, i.e. the position of the committed item */
+ +  ASSERT_DIE(j->open == LBI(j, b, end));
+ +
+ +  if (end == 0)  /* first item of a block: flag the previous block as no longer being the last one */
+ +  {
+ +    struct lfjour_block *prev = b->n.prev;
++    bool f = 0;
+ +    if (prev)  /* NOTE(review): the exchange below lives inside ASSERT_DIE and thus relies on ASSERT_DIE always evaluating its argument -- confirm */
+ +      ASSERT_DIE(atomic_compare_exchange_strong_explicit(&prev->not_last, &f, 1,
+ +          memory_order_release, memory_order_relaxed));
+ +  }
+ +
+ +  /* Store the first item to announce (only if this is actually the first one). */
+ +  struct lfjour_item *null_item = NULL;
+ +  if (atomic_compare_exchange_strong_explicit(
+ +      &j->first, &null_item, j->open,
+ +      memory_order_acq_rel, memory_order_relaxed))
+ +  {
+ +    lfjour_debug("lfjour(%p) first set", j);
+ +  }
+ +
+ +  j->open = NULL;
+ +
+ +  if (!ev_active(&j->announce_kick_event))  /* send the kick event unless it is already active */
+ +    ev_send_loop(j->loop, &j->announce_kick_event);
+ +}
+ +
+ +static struct lfjour_item *  /* item following @last, or the journal's first item when @last is NULL; NULL if there is none */
+ +lfjour_get_next(struct lfjour *j, const struct lfjour_item *last)
+ +{
+ +  /* This is lockless, no domain checks. */
+ +  if (!last)
+ +  {
+ +    struct lfjour_item *first = atomic_load_explicit(&j->first, memory_order_acquire);
+ +    return first;
+ +  }
+ +
+ +  struct lfjour_block *block = PAGE_HEAD(last);
+ +  ASSERT_DIE(block);
+ +  u32 end = atomic_load_explicit(&block->end, memory_order_acquire);
+ +  u32 pos = LBP(j, block, last);  /* position of @last inside its block */
+ +  ASSERT_DIE(pos < end);
+ +
+ +  /* Next is in the same block. */
+ +  if (++pos < end)
+ +    return LBI(j, block, pos);
+ +
+ +  /* There is another block. */
+ +  if (atomic_load_explicit(&block->not_last, memory_order_acquire))
+ +  {
+ +    /* To avoid rare race conditions, we shall check the current block end once again */
+ +    u32 new_end = atomic_load_explicit(&block->end, memory_order_acquire);
+ +    ASSERT_DIE(new_end >= end);
+ +    if (new_end > end)
+ +      return LBI(j, block, pos);
+ +
+ +    /* Nothing in the previous one, let's move to the next block.
+ +     * This is OK to do non-atomically because of the not_last flag. */
+ +    block = block->n.next;
+ +    return LBI(j, block, 0);
+ +  }
+ +
+ +  /* There is nothing more. */
+ +  return NULL;
+ +}
+ +
+ +struct lfjour_item *  /* next item for recipient r, stored in r->cur; NULL when nothing is available */
+ +lfjour_get(struct lfjour_recipient *r)
+ +{
+ +  struct lfjour *j = lfjour_of_recipient(r);
+ +
+ +  const struct lfjour_item *last = r->cur;
+ +  struct lfjour_item *next = NULL;
+ +
+ +  if (last)  /* continue right after the item currently being processed */
+ +    next = lfjour_get_next(j, r->cur);
+ +  else
+ +  {
+ +    /* The last pointer may get cleaned up under our hands.
+ +     * Indicating that we're using it, by RCU read. */
+ +
+ +    rcu_read_lock();
+ +    last = atomic_load_explicit(&r->last, memory_order_acquire);
+ +    next = lfjour_get_next(j, last);
+ +    rcu_read_unlock();
+ +  }
+ +
+ +  if (last)
+ +  {
+ +    lfjour_debug_detailed("lfjour(%p)_get(recipient=%p) returns %p, seq=%lu, last %p",
+ +      j, r, next, next ? next->seq : 0ULL, last);
+ +  }
+ +  else
+ +  {
+ +    lfjour_debug("lfjour(%p)_get(recipient=%p) returns %p, seq=%lu, clean",
+ +      j, r, next, next ? next->seq : 0ULL);
+ +  }
+ +
+ +  if (!next)
+ +    return NULL;
+ +
+ +  if (!r->first_holding_seq)  /* remember the first item held since the last full release */
+ +    r->first_holding_seq = next->seq;
+ +
+ +  return r->cur = next;
+ +}
+ +
+ +/**
+ + * lfjour_release - mark journal items as processed by a recipient
+ + * @r: the recipient; it must be processing an item (r->cur is set)
+ + * @it: the last item the recipient is done with
+ + *
+ + * If @it precedes the item currently being processed, this is a partial
+ + * release: only the last pointer and the first held sequence number move
+ + * forward and the recipient keeps holding the rest. Otherwise the current
+ + * item is released, the holding state is reset and the journal cleanup is
+ + * scheduled when a block may have become free.
+ + */
+ +void lfjour_release(struct lfjour_recipient *r, const struct lfjour_item *it)
+ +{
+ +  /* Resolve the journal first; the debug messages below refer to it,
+ +   * so it must be declared before any of them. */
+ +  struct lfjour *j = lfjour_of_recipient(r);
+ +
+ +  /* Find out what we actually released last */
+ +  rcu_read_lock();
+ +  const struct lfjour_item *last = atomic_load_explicit(&r->last, memory_order_acquire);
+ +  struct lfjour_block *last_block = last ? PAGE_HEAD(last) : NULL;
+ +  rcu_read_unlock();
+ +
+ +  /* This is lockless, no domain checks. */
+ +  ASSERT_DIE(r->cur);
+ +
+ +  /* Partial or full release? */
+ +  ASSERT_DIE(r->first_holding_seq);
+ +  ASSERT_DIE(it->seq >= r->first_holding_seq);
+ +  if (it->seq < r->cur->seq)
+ +  {
+ +    lfjour_debug("lfjour(%p)_release(recipient=%p) of %p, partial upto seq=%lu",
+ +      j, r, it, it->seq);
+ +    r->first_holding_seq = it->seq + 1;
+ +    atomic_store_explicit(&r->last, it, memory_order_release);
+ +    return;
+ +  }
+ +
+ +  struct lfjour_block *block = PAGE_HEAD(r->cur);
+ +  u32 end = atomic_load_explicit(&block->end, memory_order_acquire);
+ +
+ +  u32 pos = LBP(j, block, r->cur);
+ +  ASSERT_DIE(pos < end);
+ +
+ +  /* Releasing this export for cleanup routine */
+ +  if (pos + 1 == end)
+ +  {
+ +    lfjour_debug("lfjour(%p)_release(recipient=%p) of %p, seq=%lu (end)",
+ +      j, r, r->cur, r->cur->seq);
+ +  }
+ +  else
+ +  {
+ +    lfjour_debug_detailed("lfjour(%p)_release(recipient=%p) of %p, seq=%lu (mid)",
+ +      j, r, r->cur, r->cur->seq);
+ +  }
+ +
+ +  atomic_store_explicit(&r->last, r->cur, memory_order_release);
+ +
+ +  /* The last block may be available to free: either we have reached the
+ +   * current end of this block, or the previous release was in another block. */
+ +  if ((pos + 1 == end) || (last && (last_block != block)))
+ +    lfjour_schedule_cleanup(j);
+ +
+ +  r->first_holding_seq = 0;
+ +  r->cur = NULL;
+ +}
+ +
+ +/* Announce the journal to its recipients right away.
+ + * Asserts that the caller runs inside the journal's loop. */
+ +void
+ +lfjour_announce_now(struct lfjour *j)
+ +{
+ +  ASSERT_DIE(birdloop_inside(j->loop));
+ +
+ +  /* Cancel the settle timer and postpone the kick event before announcing. */
+ +  settle_cancel(&j->announce_timer);
+ +  ev_postpone(&j->announce_kick_event);
+ +
+ +  if (EMPTY_TLIST(lfjour_recipient, &j->recipients))
+ +  {
+ +    /* No recipients at all: just schedule the cleanup. */
+ +    lfjour_schedule_cleanup(j);
+ +    return;
+ +  }
+ +
+ +  /* Send the event of every recipient that has one. */
+ +  WALK_TLIST(lfjour_recipient, r, &j->recipients)
+ +  {
+ +    if (r->event)
+ +      ev_send(r->target, r->event);
+ +  }
+ +}
+ +
+ +/* Settle timer hook; the timer is the announce_timer member of the journal. */
+ +static void
+ +lfjour_announce_settle_hook(struct settle *s)
+ +{
+ +  struct lfjour *journal = SKIP_BACK(struct lfjour, announce_timer, s);
+ +  lfjour_announce_now(journal);
+ +}
+ +
+ +/* Hook of announce_kick_event; kicks the journal's settle timer. */
+ +static void
+ +lfjour_announce_kick_hook(void *_j)
+ +{
+ +  struct lfjour *journal = _j;
+ +  settle_kick(&journal->announce_timer, journal->loop);
+ +}
+ +
+ +/* Distance between the next sequence number to assign and the sequence
+ + * number of the journal's first item; zero when there is no first item. */
+ +u64
+ +lfjour_pending_items(struct lfjour *j)
+ +{
+ +  ASSERT_DIE(!j->domain || DG_IS_LOCKED(j->domain));
+ +
+ +  struct lfjour_item *oldest = atomic_load_explicit(&j->first, memory_order_relaxed);
+ +  if (oldest)
+ +  {
+ +    ASSERT_DIE(j->next_seq > oldest->seq);
+ +    return j->next_seq - oldest->seq;
+ +  }
+ +
+ +  return 0;
+ +}
+ +
+ +void  /* appends recipient r to the recipient list of journal j */
+ +lfjour_register(struct lfjour *j, struct lfjour_recipient *r)
+ +{
+ +  ASSERT_DIE(!j->domain || DG_IS_LOCKED(j->domain));
+ +  ASSERT_DIE(!r->event == !r->target);  /* event and target must be set together or not at all */
+ +
+ +  atomic_store_explicit(&r->last, NULL, memory_order_relaxed);  /* nothing processed yet */
+ +  ASSERT_DIE(!r->cur);
+ +
+ +  lfjour_recipient_add_tail(&j->recipients, r);
+ +}
+ +
+ +void  /* removes recipient r from the recipient list of its journal */
+ +lfjour_unregister(struct lfjour_recipient *r)
+ +{
+ +  struct lfjour *j = lfjour_of_recipient(r);
+ +  ASSERT_DIE(!j->domain || DG_IS_LOCKED(j->domain));
+ +
+ +  if (r->cur)  /* release the item the recipient is still processing */
+ +    lfjour_release(r, r->cur);
+ +
+ +  lfjour_recipient_rem_node(&j->recipients, r);
+ +  lfjour_schedule_cleanup(j);  /* presumably so that items this recipient was holding back can be freed -- confirm */
+ +}
+ +
+ +/* CLEANUP() handler used by the cleanup hook: unlock the domain stored
+ + * in *dg, if there is any. */
+ +static inline void lfjour_cleanup_unlock_helper(struct domain_generic **dg)
+ +{
+ +  if (*dg)
+ +  {
+ +    DG_UNLOCK(*dg);
+ +  }
+ +}
+ +
+ +static void  /* hook of cleanup_event: drops journal items whose seq is not above the lowest last-released seq among recipients */
+ +lfjour_cleanup_hook(void *_j)
+ +{
+ +  struct lfjour *j = _j;
+ +
+ +  CLEANUP(lfjour_cleanup_unlock_helper) struct domain_generic *_locked = j->domain;
+ +  if (_locked) DG_LOCK(_locked);
+ +
+ +  u64 min_seq = ~((u64) 0);
+ +  const struct lfjour_item *last_item_to_free = NULL;
+ +  struct lfjour_item *first = atomic_load_explicit(&j->first, memory_order_acquire);
+ +
+ +  if (!first)
+ +  {
+ +    /* Nothing to cleanup, actually, just call the done callback */
+ +    ASSERT_DIE(EMPTY_TLIST(lfjour_block, &j->pending));
+ +    CALL(j->cleanup_done, j, 0, ~((u64) 0));
+ +    return;
+ +  }
+ +
+ +  WALK_TLIST(lfjour_recipient, r, &j->recipients)
+ +  {
+ +    const struct lfjour_item *last = atomic_load_explicit(&r->last, memory_order_acquire);
+ +
+ +    if (!last)
+ +      /* No last export means that the channel has exported nothing since last cleanup */
+ +      return;  /* NOTE(review): unlike the empty-journal exit above, this exit does not call cleanup_done -- confirm this is intended */
+ +
+ +    else if (min_seq > last->seq)
+ +    {
+ +      min_seq = last->seq;
+ +      last_item_to_free = last;
+ +    }
+ +  }
+ +
+ +  /* Here we're sure that no receiver is going to use the first pointer soon.
+ +   * It is only used when the receiver's last pointer is NULL, which is avoided by the code above.
+ +   * Thus, we can just move the journal's first pointer forward. */
+ +  struct lfjour_item *next = last_item_to_free ? lfjour_get_next(j, last_item_to_free) : NULL;
+ +  atomic_store_explicit(&j->first, next, memory_order_release);
+ +
+ +  lfjour_debug("lfjour(%p) set first=%p (was %p)", j, next, first);
+ +
+ +  WALK_TLIST(lfjour_recipient, r, &j->recipients)
+ +  {
+ +    const struct lfjour_item *last = last_item_to_free;
+ +    /* This either succeeds if this item is the most-behind-one,
+ +     * or fails and gives us the actual last for debug output. */
+ +    if (atomic_compare_exchange_strong_explicit(
+ +        &r->last, &last, NULL,
+ +        memory_order_acq_rel, memory_order_acquire))
+ +    {
+ +      lfjour_debug("lfjour(%p)_cleanup(recipient=%p): store last=NULL", j, r);
+ +    }
+ +    else
+ +    {
+ +      lfjour_debug("lfjour(%p)_cleanup(recipient=%p): keep last=%p", j, r, last);
+ +    }
+ +  }
+ +
+ +  /* Now some recipients may have old last-pointers. We have to wait
+ +   * until they finish their routine, before we start cleaning up. */
+ +  synchronize_rcu();
+ +
+ +  u64 orig_first_seq = first->seq;  /* saved now: the item itself may be freed in the loop below */
+ +
+ +  /* Now we do the actual cleanup */
+ +  while (first && (first->seq <= min_seq))
+ +  {
+ +    j->item_done(j, first);
+ +
+ +    /* Find next journal item */
+ +    struct lfjour_item *next = lfjour_get_next(j, first);  /* must be computed before the block holding first is freed */
+ +    if (PAGE_HEAD(next) != PAGE_HEAD(first))
+ +    {
+ +      /* This was the last one in its block */
+ +      struct lfjour_block *block = PAGE_HEAD(first);
+ +      lfjour_debug("lfjour(%p)_cleanup: freeing block %p", j, block);
+ +      ASSERT_DIE(block == j->pending.first);
+ +
+ +      /* Free this block */
+ +      lfjour_block_rem_node(&j->pending, block);
+ +
+ +      /* Wait for possible pending readers of the block */
+ +      synchronize_rcu();
+ +
+ +      /* Now we can finally drop the block */
+ +#ifdef LOCAL_DEBUG
+ +      memset(block, 0xbe, page_size);
+ +#endif
+ +      free_page(block);
+ +
+ +      /* If no more blocks are remaining, we shall reset
+ +       * the sequence numbers */
+ +
+ +      if (EMPTY_TLIST(lfjour_block, &j->pending))
+ +      {
+ +      lfjour_debug("lfjour(%p)_cleanup: seq reset", j);
+ +      WALK_TLIST(lfjour_recipient, r, &j->recipients)
+ +        atomic_fetch_or_explicit(&r->recipient_flags, LFJOUR_R_SEQ_RESET, memory_order_acq_rel);
+ +
+ +      j->next_seq = 1;
+ +      }
+ +    }
+ +
+ +    /* And now move on to the next item */
+ +    first = next;
+ +  }
+ +
+ +  CALL(j->cleanup_done, j, orig_first_seq, first ? first->seq : ~((u64) 0));
+ +}
+ +
+ +void  /* finishes journal setup; j->loop and j->item_size must already be set by the caller */
+ +lfjour_init(struct lfjour *j, struct settle_config *scf)
+ +{
+ +  /* Expecting all other fields to be initialized to zeroes by the caller */
+ +  ASSERT_DIE(j->loop);
+ +  ASSERT_DIE(j->item_size >= sizeof(struct lfjour_item));
+ +
+ +  j->item_size = BIRD_CPU_ALIGN(j->item_size);
+ +  j->item_count = (page_size - sizeof(struct lfjour_block)) / j->item_size;  /* items fitting into one page after the block header */
+ +
+ +  j->next_seq = 1;
+ +  j->announce_kick_event = (event) {
+ +    .hook = lfjour_announce_kick_hook,
+ +    .data = j,
+ +  };
+ +  j->announce_timer = SETTLE_INIT(scf, lfjour_announce_settle_hook, j);
+ +  j->cleanup_event = (event) {
+ +    .hook = lfjour_cleanup_hook,
+ +    .data = j,
+ +  };
+ +}
diff --cc lib/lockfree.h

index f99704b366b26b6d1e555398bcb34526951442ae,0000000000000000000000000000000000000000..ab7f7d0e2302cb8b23b9d8e2d2d055ab0e98a4b5

mode 100644,000000..100644
--- 1/lib/lockfree.h
--- /dev/null
+++ b/lib/lockfree.h
@@@ -1,284 -1,0 +1,284 @@@
- static inline _Bool
+ +/*
+ + *    BIRD Library -- Generic lock-free structures
+ + *
+ + *    (c) 2023--2024 Maria Matejka <mq@jmq.cz>
+ + *    (c) 2023--2024 CZ.NIC, z.s.p.o.
+ + *
+ + *    Can be freely distributed and used under the terms of the GNU GPL.
+ + */
+ +
+ +#ifndef _BIRD_LOCKFREE_H_
+ +#define _BIRD_LOCKFREE_H_
+ +
+ +#include "lib/defer.h"
+ +#include "lib/event.h"
+ +#include "lib/rcu.h"
+ +#include "lib/settle.h"
+ +#include "lib/tlists.h"
+ +#include "lib/io-loop.h"
+ +
+ +#include <stdatomic.h>
+ +
+ +/**
+ + * Lock-free usecounts.
+ + */
+ +
+ +struct lfuc {
+ +  _Atomic u64 uc;
+ +};
+ +
+ +#define LFUC_PU_SHIFT      44
+ +#define LFUC_IN_PROGRESS   (1ULL << LFUC_PU_SHIFT)
+ +
+ +/**
+ + * lfuc_lock - take one more reference on an already referenced usecount
+ + * @c: the usecount structure
+ + *
+ + * Returns the usecount part of the counter as it was before the increment.
+ + */
+ +static inline u64 lfuc_lock(struct lfuc *c)
+ +{
+ +  /* Somebody already holds the underlying data structure, so nothing can
+ +   * be freed underneath our hands; bumping the counter is all it takes. */
+ +  const u64 before = atomic_fetch_add_explicit(&c->uc, 1, memory_order_acq_rel);
+ +  ASSERT_DIE(before > 0);
+ +
+ +  const u64 usecount_mask = LFUC_IN_PROGRESS - 1;
+ +  return before & usecount_mask;
+ +}
+ +
+ +/**
+ + * lfuc_lock_revive - take a reference even if the usecount is zero
+ + * @c: the usecount structure
+ + *
+ + * Unlike lfuc_lock(), this does not insist on a nonzero usecount. It may be
+ + * called on structures with already zeroed usecount when the caller is sure
+ + * that they can't collide with the prune routine. Handy for situations with
+ + * flapping routes. Use only from the same loop as which runs the prune
+ + * routine.
+ + *
+ + * Returns the usecount part of the counter as it was before the increment.
+ + */
+ +static inline u64 lfuc_lock_revive(struct lfuc *c)
+ +{
+ +  const u64 usecount_mask = LFUC_IN_PROGRESS - 1;
+ +  return atomic_fetch_add_explicit(&c->uc, 1, memory_order_acq_rel) & usecount_mask;
+ +}
+ +
+ +/**
+ + * lfuc_unlock_immediately - decrease an atomic usecount
+ + * @c: the usecount structure
+ + * @el: prune event list
+ + * @ev: prune event itself
+ + *
+ + * If the usecount reaches zero, a prune event is run to possibly free the object.
+ + * The prune event MUST use lfuc_finished() to check the object state.
+ + *
+ + * lfuc_unlock() reaches this routine through the lfuc_unlock_deferred() hook.
+ + */
+ +static inline void lfuc_unlock_immediately(struct lfuc *c, event_list *el, event *ev)
+ +{
+ +  /* Unlocking is tricky. We do it lockless so at the same time, the prune
+ +   * event may be running, therefore if the unlock gets us to zero, it must be
+ +   * the last thing in this routine, otherwise the prune routine may find the
+ +   * source's usecount zeroed, freeing it prematurely.
+ +   *
+ +   * The usecount is split into two parts:
+ +   * the top 20 bits are an in-progress indicator
+ +   * the bottom 44 bits keep the actual usecount.
+ +   *
+ +   * Therefore at most 1 million of writers can simultaneously unlock the same
+ +   * structure, while at most ~17T different places can reference it. Both limits
+ +   * are insanely high from the 2022 point of view. Let's suppose that when 17T
+ +   * routes or 1M peers/tables get real, we get also 128bit atomic variables in the
+ +   * C norm. */
+ +
+ +  /* First, we push the in-progress indicator */
+ +  u64 uc = atomic_fetch_add_explicit(&c->uc, LFUC_IN_PROGRESS, memory_order_acq_rel);
+ +
+ +  /* Then we split the indicator to its parts. Remember, we got the value
+ +   * before the operation happened so we're re-doing the operation locally
+ +   * to get a view how the indicator _would_ look if nobody else was interacting.
+ +   */
+ +  u64 pending = (uc >> LFUC_PU_SHIFT) + 1;
+ +  uc &= LFUC_IN_PROGRESS - 1;
+ +
+ +  /* Obviously, there can't be more pending unlocks than the usecount itself */
+ +  if (uc == pending)
+ +    /* If we're the last unlocker (every owner is already unlocking), schedule
+ +     * the owner's prune event */
+ +    ev_send(el, ev);
+ +  else
+ +    ASSERT_DIE(uc > pending);
+ +
+ +  /* And now, finally, simultaneously pop the in-progress indicator and the
+ +   * usecount, possibly allowing the pruning routine to free this structure */
+ +  uc = atomic_fetch_sub_explicit(&c->uc, LFUC_IN_PROGRESS + 1, memory_order_acq_rel);  /* the fetched value is not used afterwards */
+ +
+ +//  return uc - LFUC_IN_PROGRESS - 1;
+ +}
+ +
+ +struct lfuc_unlock_queue_item {
+ +  struct deferred_call dc;
+ +  struct lfuc *c;
+ +  event_list *el;
+ +  event *ev;
+ +};
+ +
+ +void lfuc_unlock_deferred(struct deferred_call *dc);
+ +
+ +static inline void lfuc_unlock(struct lfuc *c, event_list *el, event *ev)  /* deferred variant: queues a call that ends up in lfuc_unlock_immediately() */
+ +{
+ +  struct lfuc_unlock_queue_item luqi = {
+ +    .dc.hook = lfuc_unlock_deferred,
+ +    .c = c,
+ +    .el = el,
+ +    .ev = ev,
+ +  };
+ +
+ +  defer_call(&luqi.dc, sizeof luqi);  /* NOTE(review): luqi is a local; presumably defer_call() copies the given number of bytes -- confirm */
+ +}
+ +
+ +/**
+ + * lfuc_finished - auxiliary routine for prune event
+ + * @c: usecount structure
+ + *
+ + * This routine simply waits until all unlockers finish their job and leave
+ + * the critical section of lfuc_unlock_immediately() (reached through
+ + * lfuc_unlock()), calling birdloop_yield() while waiting. Then we decide
+ + * whether the usecount is indeed zero or not, and therefore whether the
+ + * structure is free to be freed.
+ + *
+ + * Returns true iff no unlock is in progress and the usecount is zero.
+ + */
-   _Atomic _Bool not_last;
++static inline bool
+ +lfuc_finished(struct lfuc *c)
+ +{
+ +  u64 uc;
+ +  /* Wait until all unlockers finish */
+ +  while ((uc = atomic_load_explicit(&c->uc, memory_order_acquire)) >> LFUC_PU_SHIFT)
+ +    birdloop_yield();
+ +
+ +  /* All of them are now done and if the usecount is now zero, then we're
+ +   * the last place to reference the object and we can call it finished. */
+ +  return (uc == 0);
+ +}
+ +
+ +/**
+ + * lfuc_init - auxiliary routine for usecount initialization
+ + * @c: usecount structure
+ + *
+ + * Called on object initialization, sets the usecount to an initial one to make
+ + * sure that the prune routine doesn't free it before somebody else references it.
+ + * The initial value also satisfies the nonzero assertion in lfuc_lock().
+ + */
+ +static inline void
+ +lfuc_init(struct lfuc *c)
+ +{
+ +  atomic_store_explicit(&c->uc, 1, memory_order_release);
+ +}
+ +
+ +
+ +/**
+ + * Lock-free journal.
+ + */
+ +
+ +/* Journal item. Put LFJOUR_ITEM_INHERIT(name) into your structure
+ + * to inherit lfjour_item */
+ +#define LFJOUR_ITEM   \
+ +  u64 seq;            \
+ +
+ +struct lfjour_item {
+ +  LFJOUR_ITEM;
+ +};
+ +
+ +#define LFJOUR_ITEM_INHERIT(name) union { \
+ +  struct lfjour_item name; \
+ +  struct { LFJOUR_ITEM; }; \
+ +}
+ +
+ +/* Journal item block. Internal structure, no need to check out. */
+ +#define TLIST_PREFIX lfjour_block
+ +#define TLIST_TYPE struct lfjour_block
+ +#define TLIST_ITEM n
+ +#define TLIST_WANT_ADD_TAIL
+ +
+ +struct lfjour_block {
+ +  TLIST_DEFAULT_NODE;
+ +  _Atomic u32 end;
- static inline _Bool lfjour_reset_seqno(struct lfjour_recipient *r)
++  _Atomic bool not_last;
+ +
+ +  struct lfjour_item _block[0];
+ +};
+ +
+ +/* Defines lfjour_block_list */
+ +#include "lib/tlists.h"
+ +
+ +/* Journal recipient. Inherit this in your implementation. */
+ +#define TLIST_PREFIX lfjour_recipient
+ +#define TLIST_TYPE struct lfjour_recipient
+ +#define TLIST_ITEM n
+ +#define TLIST_WANT_ADD_TAIL
+ +#define TLIST_WANT_WALK
+ +
+ +struct lfjour_recipient {
+ +  TLIST_DEFAULT_NODE;
+ +  event *event;                                       /* Event running when something is in the journal */
+ +  event_list *target;                         /* Event target */
+ +  const struct lfjour_item * _Atomic last;    /* Last item processed */
+ +  u64 first_holding_seq;                      /* First item not released yet */
+ +  struct lfjour_item *cur;                    /* Processing this now */
+ +  _Atomic u64 recipient_flags;                        /* LFJOUR_R_* */
+ +};
+ +
+ +enum lfjour_recipient_flags {
+ +  LFJOUR_R_SEQ_RESET = 1,                     /* Signalling of sequence number reset */
+ +};
+ +
+ +/* Defines lfjour_recipient_list */
+ +#include "lib/tlists.h"
+ +
+ +/* Journal base structure. Include this. */
+ +struct lfjour {
+ +  struct domain_generic *domain;              /* The journal itself belongs to this domain (if different from the loop) */
+ +  struct birdloop *loop;                      /* Cleanup loop */
+ +  u32 item_size, item_count;                  /* Allocation parameters */
+ +  struct lfjour_block_list pending;           /* List of packed journal blocks */
+ +  struct lfjour_item * _Atomic first;         /* First journal item to announce */
+ +  struct lfjour_item *open;                   /* Journal item in progress */
+ +  u64 next_seq;                                       /* Next export to push has this ID */
+ +  struct lfjour_recipient_list recipients;    /* Announce updates to these */
+ +  event announce_kick_event;                  /* Kicks announce_timer */
+ +  struct settle announce_timer;                       /* Announces changes to recipients */
+ +  event cleanup_event;                                /* Runs the journal cleanup routine */
+ +
+ +  /* Callback on item removal from journal */
+ +  void (*item_done)(struct lfjour *, struct lfjour_item *);
+ +
+ +  /* Callback when the cleanup routine is ending */
+ +  void (*cleanup_done)(struct lfjour *, u64 begin_seq, u64 end_seq);
+ +};
+ +
+ +struct lfjour_item *lfjour_push_prepare(struct lfjour *);
+ +void lfjour_push_commit(struct lfjour *);
+ +
+ +struct lfjour_item *lfjour_get(struct lfjour_recipient *);
+ +void lfjour_release(struct lfjour_recipient *, const struct lfjour_item *);
++static inline bool lfjour_reset_seqno(struct lfjour_recipient *r)
+ +{
+ +  /* Atomically clear the sequence reset flag and report whether it was set. */
+ +  u64 old_flags = atomic_fetch_and_explicit(&r->recipient_flags, ~LFJOUR_R_SEQ_RESET, memory_order_acq_rel);
+ +  return old_flags & LFJOUR_R_SEQ_RESET;
+ +}
+ +
+ +void lfjour_announce_now(struct lfjour *);
+ +u64 lfjour_pending_items(struct lfjour *);
+ +
+ +/* Send the journal's cleanup event to the journal's loop. */
+ +static inline void lfjour_schedule_cleanup(struct lfjour *j)
+ +{
+ +  ev_send_loop(j->loop, &j->cleanup_event);
+ +}
+ +
+ +/* Run the cleanup hook synchronously instead of through the event.
+ + * This requires the caller to own the cleanup event loop. */
+ +static inline void lfjour_do_cleanup_now(struct lfjour *j)
+ +{
+ +  event *cleanup = &j->cleanup_event;
+ +
+ +  ev_postpone(cleanup);
+ +  cleanup->hook(cleanup->data);
+ +}
+ +
+ +void lfjour_register(struct lfjour *, struct lfjour_recipient *);
+ +void lfjour_unregister(struct lfjour_recipient *);
+ +/* Length of the journal's recipient list. */
+ +static inline uint lfjour_count_recipients(struct lfjour *j)
+ +{
+ +  return TLIST_LENGTH(lfjour_recipient, &j->recipients);
+ +}
+ +
+ +void lfjour_init(struct lfjour *, struct settle_config *);
+ +
+ +
+ +/* Journal whose recipient list contains r, or NULL when
+ + * lfjour_recipient_enlisted() reports no list. */
+ +static inline struct lfjour *lfjour_of_recipient(struct lfjour_recipient *r)
+ +{
+ +  struct lfjour_recipient_list *owner = lfjour_recipient_enlisted(r);
+ +  if (!owner)
+ +    return NULL;
+ +
+ +  return SKIP_BACK(struct lfjour, recipients, owner);
+ +}
+ +#endif
diff --cc lib/locking.h

index 30450535facd8e4609b3095f1d67841a953ca026,0000000000000000000000000000000000000000..0251f87697cbb090b9fb4b658d7d78ee3f437041

mode 100644,000000..100644
--- 1/lib/locking.h
--- /dev/null
+++ b/lib/locking.h
@@@ -1,531 -1,0 +1,531 @@@
- struct domain_generic *domain_new(uint order, _Bool allow_rcu);
+ +/*
+ + *    BIRD Library -- Locking
+ + *
+ + *    (c) 2020--2021 Maria Matejka <mq@jmq.cz>
+ + *
+ + *    Can be freely distributed and used under the terms of the GNU GPL.
+ + */
+ +
+ +#ifndef _BIRD_LOCKING_H_
+ +#define _BIRD_LOCKING_H_
+ +
+ +#include "lib/birdlib.h"
+ +#include "lib/macro.h"
+ +#include "lib/rcu.h"
+ +
+ +struct domain_generic;
+ +struct pool;
+ +
+ +/*
+ + * List of lock levels. Every name below becomes one member of
+ + * struct lock_order (in this order) and one struct domain__<name> type.
+ + */
+ +#define LOCK_ORDER \
+ +  the_bird, \
+ +  meta, \
+ +  control, \
+ +  proto, \
+ +  service, \
+ +  rtable, \
+ +  attrs, \
+ +  logging, \
+ +  resource, \
+ +
+ +/* Here define the global lock order; first to last. */
+ +/* One domain pointer slot per level, laid out in LOCK_ORDER order */
+ +struct lock_order {
+ +#define LOCK_ORDER_EXPAND(p)  struct domain_generic *p;
+ +  MACRO_FOREACH(LOCK_ORDER_EXPAND, LOCK_ORDER)
+ +#undef LOCK_ORDER_EXPAND
+ +};
+ +
+ +/*
+ + * Per-level wrapper types: struct domain__<level> holds a single generic
+ + * domain pointer in a member named after the level.
+ + */
+ +#define LOCK_ORDER_EXPAND(p)  struct domain__##p { struct domain_generic *p; };
+ +  MACRO_FOREACH(LOCK_ORDER_EXPAND, LOCK_ORDER)
+ +#undef LOCK_ORDER_EXPAND
+ +
+ +extern _Thread_local struct lock_order locking_stack;
+ +extern _Thread_local struct domain_generic **last_locked;
+ +
+ +/* The wrapper type of a domain on the given level */
+ +#define DOMAIN(type) struct domain__##type
+ +/* Offset of the level's slot in struct lock_order; passed to domain_new() as the order */
+ +#define DOMAIN_ORDER(type)  OFFSETOF(struct lock_order, type)
+ +
+ +/* Create a domain on the given level, calling domain_new() with allow_rcu = 1 */
+ +#define DOMAIN_NEW(type)  (DOMAIN(type)) { .type = domain_new(DOMAIN_ORDER(type), 1) }
+ +/* Create a domain on the given level, calling domain_new() with allow_rcu = 0 */
+ +#define DOMAIN_NEW_RCU_SYNC(type)  (DOMAIN(type)) { .type = domain_new(DOMAIN_ORDER(type), 0) }
- static inline void rws_mark(rw_spinlock *p, _Bool write, _Bool lock)
++struct domain_generic *domain_new(uint order, bool allow_rcu);
+ +
+ +#define DOMAIN_FREE(type, d)  domain_free((d).type)
+ +void domain_free(struct domain_generic *);
+ +
+ +#define DOMAIN_NAME(type, d)  domain_name((d).type)
+ +const char *domain_name(struct domain_generic *);
+ +
+ +#define DOMAIN_SETUP(type, d, n, p)   domain_setup((d).type, n, p)
+ +void domain_setup(struct domain_generic *, const char *name, struct pool *);
+ +
+ +#define DOMAIN_NULL(type)   (DOMAIN(type)) {}
+ +
+ +/*
+ + * Lock / unlock a typed domain. The second argument handed to do_lock() and
+ + * do_unlock() is the slot of this level in the thread-local locking_stack.
+ + */
+ +#define LOCK_DOMAIN(type, d)  do_lock(((d).type), &(locking_stack.type))
+ +#define UNLOCK_DOMAIN(type, d)  do_unlock(((d).type), &(locking_stack.type))
+ +
+ +/* True when this thread's slot for the level holds exactly this domain */
+ +#define DOMAIN_IS_LOCKED(type, d) (((d).type) == (locking_stack.type))
+ +/* The same check for a generic domain pointer; the slot is found by DG_LSP() */
+ +#define DG_IS_LOCKED(d)       ((d) == *(DG_LSP(d)))
+ +
+ +/* Internal for locking */
+ +void do_lock(struct domain_generic *dg, struct domain_generic **lsp);
+ +void do_unlock(struct domain_generic *dg, struct domain_generic **lsp);
+ +
+ +uint dg_order(struct domain_generic *dg);
+ +
+ +/*
+ + * Generic-domain variants. DG_LSP() computes the address of the domain's slot
+ + * in this thread's locking_stack by adding dg_order(d) as a byte offset
+ + * (arithmetic on void *, a compiler extension).
+ + */
+ +#define DG_LSP(d)     ((struct domain_generic **) (((void *) &locking_stack) + dg_order(d)))
+ +#define DG_LOCK(d)    do_lock(d, DG_LSP(d))
+ +#define DG_UNLOCK(d)  do_unlock(d, DG_LSP(d))
+ +
+ +/* Use with care. To be removed in near future. */
+ +extern DOMAIN(the_bird) the_bird_domain;
+ +
+ +#define the_bird_lock()               LOCK_DOMAIN(the_bird, the_bird_domain)
+ +#define the_bird_unlock()     UNLOCK_DOMAIN(the_bird, the_bird_domain)
+ +#define the_bird_locked()     DOMAIN_IS_LOCKED(the_bird, the_bird_domain)
+ +
+ +#define ASSERT_THE_BIRD_LOCKED        ({ if (!the_bird_locked()) bug("The BIRD lock must be locked here: %s:%d", __FILE__, __LINE__); })
+ +
+ +/*
+ + * RW spinlocks
+ + */
+ +
+ +/*
+ + * Layout of the 64-bit spin word, four counters packed side by side:
+ + *   bits  0..19  readers pending
+ + *   bits 20..39  readers active
+ + *   bits 40..55  writers pending
+ + *   bits 56..63  writer active
+ + */
+ +#define RWS_READ_PENDING_POS  0
+ +#define RWS_READ_ACTIVE_POS   20
+ +#define RWS_WRITE_PENDING_POS 40
+ +#define RWS_WRITE_ACTIVE_POS  56
+ +
+ +/* Unit increment of each counter */
+ +#define RWS_READ_PENDING      (1ULL << RWS_READ_PENDING_POS)
+ +#define RWS_READ_ACTIVE               (1ULL << RWS_READ_ACTIVE_POS)
+ +#define RWS_WRITE_PENDING     (1ULL << RWS_WRITE_PENDING_POS)
+ +#define RWS_WRITE_ACTIVE      (1ULL << RWS_WRITE_ACTIVE_POS)
+ +
+ +/* Masks selecting all bits of each counter */
+ +#define RWS_READ_PENDING_MASK (RWS_READ_ACTIVE - 1)
+ +#define RWS_READ_ACTIVE_MASK  ((RWS_WRITE_PENDING - 1) & ~(RWS_READ_ACTIVE - 1))
+ +#define RWS_WRITE_PENDING_MASK        ((RWS_WRITE_ACTIVE - 1) & ~(RWS_WRITE_PENDING - 1))
+ +#define RWS_WRITE_ACTIVE_MASK (~(RWS_WRITE_ACTIVE - 1))
+ +
+ +/* The lock itself is just the atomic spin word */
+ +typedef struct {
+ +  u64 _Atomic spin;
+ +} rw_spinlock;
+ +
+ +#ifdef DEBUGGING
+ +#define MAX_RWS_AT_ONCE               32
+ +extern _Thread_local rw_spinlock *rw_spinlocks_taken[MAX_RWS_AT_ONCE];
+ +extern _Thread_local btime rw_spinlocks_time[MAX_RWS_AT_ONCE];
+ +extern _Thread_local u32 rw_spinlocks_taken_cnt;
+ +extern _Thread_local u32 rw_spinlocks_taken_write;
+ +
+ +/* Borrowed from lib/timer.h */
+ +btime current_time_now(void);
+ +
++static inline void rws_mark(rw_spinlock *p, bool write, bool lock)
+ +{
+ +  if (lock) {
+ +    ASSERT_DIE(rw_spinlocks_taken_cnt < MAX_RWS_AT_ONCE);
+ +    if (write)
+ +      rw_spinlocks_taken_write |= (1 << rw_spinlocks_taken_cnt);
+ +    else
+ +      rw_spinlocks_taken_write &= ~(1 << rw_spinlocks_taken_cnt);
+ +    rw_spinlocks_time[rw_spinlocks_taken_cnt] = current_time_now();
+ +    rw_spinlocks_taken[rw_spinlocks_taken_cnt++] = p;
+ +
+ +  }
+ +  else {
+ +    ASSERT_DIE(rw_spinlocks_taken_cnt > 0);
+ +    ASSERT_DIE(rw_spinlocks_taken[--rw_spinlocks_taken_cnt] == p);
+ +    ASSERT_DIE(!(rw_spinlocks_taken_write & (1 << rw_spinlocks_taken_cnt)) == !write);
+ +    btime tdif = current_time_now() - rw_spinlocks_time[rw_spinlocks_taken_cnt];
+ +    if (tdif > 1 S_)
+ +      log(L_WARN "Spent an alarming time %t s in spinlock %p (%s); "
+ +       "if this happens often to you, please contact the developers.",
+ +       tdif, p, write ? "write" : "read");
+ +  }
+ +}
+ +#else
+ +#define rws_mark(...)
+ +#endif
+ +
+ +/* Initialize the spinlock to the idle state: all four counters zero. */
+ +static inline void rws_init(rw_spinlock *p)
+ +{
+ +  atomic_store_explicit(&p->spin, 0, memory_order_relaxed);
+ +}
+ +
+ +/*
+ + * Take the spinlock for reading.
+ + *
+ + * The reader first registers as pending, waits (yielding) while any writer
+ + * is pending or active, and then converts its pending count into an active
+ + * one. If a writer turned out to be active at the moment of conversion,
+ + * the conversion is reverted and the wait starts over.
+ + */
+ +static inline void rws_read_lock(rw_spinlock *p)
+ +{
+ +  /* Register as a pending reader; old is the spin word before the addition */
+ +  u64 old = atomic_fetch_add_explicit(&p->spin, RWS_READ_PENDING, memory_order_acquire);
+ +
+ +  while (1)
+ +  {
+ +    /* Wait until all writers end */
+ +    while (old & (RWS_WRITE_PENDING_MASK | RWS_WRITE_ACTIVE_MASK))
+ +    {
+ +      birdloop_yield();
+ +      old = atomic_load_explicit(&p->spin, memory_order_acquire);
+ +    }
+ +
+ +    /* Convert to active */
+ +    /* A single addition both increments the active and decrements the pending counter */
+ +    old = atomic_fetch_add_explicit(&p->spin, RWS_READ_ACTIVE - RWS_READ_PENDING, memory_order_acq_rel);
+ +
+ +    if (old & RWS_WRITE_ACTIVE_MASK)
+ +      /* Oh but some writer was faster */
+ +      /* Back to pending; old still carries the writer bits so the wait loop above runs again */
+ +      old = atomic_fetch_sub_explicit(&p->spin, RWS_READ_ACTIVE - RWS_READ_PENDING, memory_order_acq_rel);
+ +    else
+ +      /* No writers, approved */
+ +      break;
+ +  }
+ +
+ +  /* Debug bookkeeping: read lock taken */
+ +  rws_mark(p, 0, 1);
+ +}
+ +
+ +/*
+ + * Release a read lock: drop one from the active reader counter.
+ + * Dies if no reader was active before the decrement.
+ + */
+ +static inline void rws_read_unlock(rw_spinlock *p)
+ +{
+ +  /* Debug bookkeeping: read lock released */
+ +  rws_mark(p, 0, 0);
+ +  u64 old = atomic_fetch_sub_explicit(&p->spin, RWS_READ_ACTIVE, memory_order_release);
+ +  ASSERT_DIE(old & RWS_READ_ACTIVE_MASK);
+ +}
+ +
+ +/*
+ + * Take the spinlock for writing.
+ + *
+ + * The writer registers as pending, waits (yielding) while any reader or
+ + * writer is active, and then tries to set the write-active bit. The lock is
+ + * won only when the bit was not set before and no reader was active at that
+ + * moment; otherwise the attempt is retried. On success, the pending
+ + * registration is dropped.
+ + */
+ +static inline void rws_write_lock(rw_spinlock *p)
+ +{
+ +  /* Register as a pending writer; old is the spin word before the addition */
+ +  u64 old = atomic_fetch_add_explicit(&p->spin, RWS_WRITE_PENDING, memory_order_acquire);
+ +
+ +  /* Wait until all active readers end */
+ +  while (1)
+ +  {
+ +    while (old & (RWS_READ_ACTIVE_MASK | RWS_WRITE_ACTIVE_MASK))
+ +    {
+ +      birdloop_yield();
+ +      old = atomic_load_explicit(&p->spin, memory_order_acquire);
+ +    }
+ +
+ +    /* Mark self as active */
+ +    u64 updated = atomic_fetch_or_explicit(&p->spin, RWS_WRITE_ACTIVE, memory_order_acquire);
+ +
+ +    /* And it's us */
+ +    /* (the bit was clear before, so this thread is the one who set it) */
+ +    if (!(updated & RWS_WRITE_ACTIVE))
+ +    {
+ +      if (updated & RWS_READ_ACTIVE_MASK)
+ +      /* But some reader was faster */
+ +      atomic_fetch_and_explicit(&p->spin, ~RWS_WRITE_ACTIVE, memory_order_release);
+ +      else
+ +      /* No readers, approved */
+ +      break;
+ +    }
+ +    /* NOTE(review): old is not refreshed on either retry path, so the inner
+ +     * wait loop is skipped and the retry spins without birdloop_yield();
+ +     * confirm this is intended. */
+ +  }
+ +
+ +  /* It's us, then we aren't actually pending */
+ +  u64 updated = atomic_fetch_sub_explicit(&p->spin, RWS_WRITE_PENDING, memory_order_acquire);
+ +  ASSERT_DIE(updated & RWS_WRITE_PENDING_MASK);
+ +  /* Debug bookkeeping: write lock taken */
+ +  rws_mark(p, 1, 1);
+ +}
+ +
+ +/*
+ + * Release a write lock: clear the write-active bit.
+ + * Dies if the bit was not set before.
+ + */
+ +static inline void rws_write_unlock(rw_spinlock *p)
+ +{
+ +  /* Debug bookkeeping: write lock released */
+ +  rws_mark(p, 1, 0);
+ +  u64 old = atomic_fetch_and_explicit(&p->spin, ~RWS_WRITE_ACTIVE, memory_order_release);
+ +  ASSERT_DIE(old & RWS_WRITE_ACTIVE);
+ +}
+ +
+ +
+ +/*
+ + * Unwind stored lock state helpers
+ + */
+ +/* State threaded through locking_unwind_helper() calls, one call per lock level */
+ +struct locking_unwind_status {
+ +  struct lock_order *desired;         /* The stored locking stack to get back to */
+ +  enum {
+ +    LOCKING_UNWIND_SAME,              /* Only checking that current and stored slots match */
+ +    LOCKING_UNWIND_UNLOCK,            /* Unlocking domains the stored stack has unlocked */
+ +  } state;
+ +};
+ +
+ +/*
+ + * Process one lock level of the unwind.
+ + *
+ + * @status: the desired locking stack and the current unwind mode
+ + * @order: byte offset of the level's slot in struct lock_order
+ + *
+ + * In unlock mode, a level which the desired stack has unlocked gets unlocked
+ + * if it is currently locked; the first level which the desired stack has
+ + * locked must be locked by the very same domain and switches the mode to
+ + * checking. In checking mode, the current and desired slots must be equal.
+ + * Any mismatch is a bug(). Returns the possibly updated status.
+ + */
+ +static inline struct locking_unwind_status locking_unwind_helper(struct locking_unwind_status status, uint order)
+ +{
+ +  struct domain_generic **current = ((void *) &locking_stack) + order;
+ +  struct domain_generic **wanted = ((void *) status.desired) + order;
+ +
+ +  if (status.state == LOCKING_UNWIND_SAME)
+ +  {
+ +    /* Check mode: the rest of the stack has to be consistent */
+ +    if (*current != *wanted)
+ +      bug("Mangled lock unwind state at order %d", order);
+ +  }
+ +  else if (*wanted)
+ +  {
+ +    /* The stored state has this level locked */
+ +    if (*current == *wanted)
+ +    {
+ +      /* And so have we; from now on just check */
+ +      status.state = LOCKING_UNWIND_SAME;
+ +    }
+ +    else
+ +    {
+ +      /* Either not locked at all or locked by another domain */
+ +      bug("Mangled lock unwind state at order %d", order);
+ +    }
+ +  }
+ +  else if (*current)
+ +  {
+ +    /* The stored state has this level unlocked while we hold it */
+ +    DG_UNLOCK(*current);
+ +  }
+ +
+ +  return status;
+ +}
+ +
+ +/*
+ + * Bring this thread's locking stack back to the stored state in @desired
+ + * by running locking_unwind_helper() over the lock levels listed in
+ + * LOCK_ORDER, starting in the unlocking mode.
+ + */
+ +static inline void locking_unwind(struct lock_order *desired)
+ +{
+ +  struct locking_unwind_status status = {
+ +    .desired = desired,
+ +    .state = LOCKING_UNWIND_UNLOCK,
+ +  };
+ +
+ +/* LOCK_ORDER_POS expands to the slot offsets of all levels; note that only
+ + * the helper macro is undefined afterwards, LOCK_ORDER_POS stays defined. */
+ +#define LOCK_ORDER_POS_HELPER(x)      DOMAIN_ORDER(x),
+ +#define LOCK_ORDER_POS                        MACRO_FOREACH(LOCK_ORDER_POS_HELPER, LOCK_ORDER)
+ +  MACRO_RPACK(locking_unwind_helper, status, LOCK_ORDER_POS);
+ +#undef LOCK_ORDER_POS_HELPER
+ +}
+ +
+ +/**
+ + *  Objects bound with domains
+ + *
+ + *  First, we need some object to have its locked and unlocked part.
+ + *  This is accomplished typically by the following pattern:
+ + *
+ + *    struct foo_public {
+ + *      ...                   // Public fields
+ + *      DOMAIN(bar) lock;     // The assigned domain
+ + *    };
+ + *
+ + *    struct foo_private {
+ + *      struct foo_public;    // Importing public fields
+ + *      struct foo_private **locked_at;       // Auxiliary field for locking routines
+ + *      ...                   // Private fields
+ + *    };
+ + *
+ + *    typedef union foo {
+ + *      struct foo_public;
+ + *      struct foo_private priv;
+ + *    } foo;
+ + *
+ + *  All persistently stored object pointers MUST point to the public parts.
+ + *  If accessing the locked object from embedded objects, great care must
+ + *  be applied to always SKIP_BACK to the public object version, not the
+ + *  private one.
+ + *
+ + *  To access the private object parts, either the private object pointer
+ + *  is explicitly given to us, therefore assuming somewhere else the domain
+ + *  has been locked, or we have to lock the domain ourselves. To do that,
+ + *  there are some handy macros.
+ + */
+ +
+ +#define LOBJ_LOCK_SIMPLE(_obj, _level) \
+ +  ({ LOCK_DOMAIN(_level, (_obj)->lock); &(_obj)->priv; })
+ +
+ +#define LOBJ_UNLOCK_SIMPLE(_obj, _level) \
+ +  UNLOCK_DOMAIN(_level, (_obj)->lock)
+ +
+ +/*
+ + *  These macros can be used to define specific macros for given class.
+ + *
+ + *  #define FOO_LOCK_SIMPLE(foo)      LOBJ_LOCK_SIMPLE(foo, bar)
+ + *  #define FOO_UNLOCK_SIMPLE(foo)    LOBJ_UNLOCK_SIMPLE(foo, bar)
+ + *
+ + *  Then these can be used like this:
+ + *
+ + *  void foo_frobnicate(foo *f)
+ + *  {
+ + *    // Unlocked context
+ + *    ...
+ + *    struct foo_private *fp = FOO_LOCK_SIMPLE(f);
+ + *    // Locked context
+ + *    ...
+ + *    FOO_UNLOCK_SIMPLE(f);
+ + *    // Unlocked context
+ + *    ...
+ + *  }
+ + *
+ + *  These simple calls have two major drawbacks. First, if you return
+ + *  from locked context, you don't unlock, which may lock you dead.
+ + *  And second, the foo_private pointer is still syntactically valid
+ + *  even after unlocking.
+ + *
+ + *  To fight this, we need more magic and the switch should stay in that
+ + *  position.
+ + *
+ + *  First, we need an auxiliary _function_ for unlocking. This function
+ + *  is intended to be called in a local variable cleanup context.
+ + */
+ +
+ +#define LOBJ_UNLOCK_CLEANUP_NAME(_stem) _lobj__##_stem##_unlock_cleanup
+ +
+ +#define LOBJ_UNLOCK_CLEANUP(_stem, _level) \
+ +  static inline void LOBJ_UNLOCK_CLEANUP_NAME(_stem)(struct _stem##_private **obj) { \
+ +    if (!*obj) return; \
+ +    ASSERT_DIE(LOBJ_IS_LOCKED((*obj), _level)); \
+ +    ASSERT_DIE((*obj)->locked_at == obj); \
+ +    (*obj)->locked_at = NULL; \
+ +    UNLOCK_DOMAIN(_level, (*obj)->lock); \
+ +  }
+ +
+ +#define LOBJ_LOCK(_obj, _pobj, _stem, _level) \
+ +  CLEANUP(LOBJ_UNLOCK_CLEANUP_NAME(_stem)) struct _stem##_private *_pobj = LOBJ_LOCK_SIMPLE(_obj, _level); _pobj->locked_at = &_pobj;
+ +
+ +/*
+ + *  And now the usage of these macros. You first need to declare the auxiliary
+ + *  cleanup function.
+ + *
+ + *  LOBJ_UNLOCK_CLEANUP(foo, bar);
+ + *
+ + *  And then declare the lock-local macro:
+ + *
+ + *  #define FOO_LOCK(foo, fpp)        LOBJ_LOCK(foo, fpp, foo, bar)
+ + *
+ + *  This construction then allows you to lock much more safely:
+ + *
+ + *  void foo_frobnicate_safer(foo *f)
+ + *  {
+ + *    // Unlocked context
+ + *    ...
+ + *    do {
+ + *      FOO_LOCK(f, fpp);
+ + *    // Locked context, fpp is valid here
+ + *
+ + *    if (something) return;  // This implicitly unlocks
+ + *    if (whatever) break;    // This unlocks too
+ + *
+ + *      // Finishing context with no unlock at all
+ + *    } while (0);
+ + *
+ + *    // Here is fpp invalid and the object is back unlocked.
+ + *    ...
+ + *  }
+ + *
+ + *  There is no explicit unlock statement. To unlock, simply leave the block
+ + *  with locked context.
+ + *
+ + *  This may be made even nicer to use by employing a for-cycle.
+ + */
+ +
+ +#define LOBJ_LOCKED(_obj, _pobj, _stem, _level) \
+ +  for (CLEANUP(LOBJ_UNLOCK_CLEANUP_NAME(_stem)) struct _stem##_private *_pobj = LOBJ_LOCK_SIMPLE(_obj, _level); \
+ +      _pobj ? (_pobj->locked_at = &_pobj) : NULL; \
+ +      LOBJ_UNLOCK_CLEANUP_NAME(_stem)(&_pobj), _pobj = NULL)
+ +
+ +/*
+ + *  This for-cycle employs heavy magic to hide as much of the boilerplate
+ + *  from the user as possibly needed. Here is how it works.
+ + *
+ + *  First, the for-1 clause is executed, setting up _pobj, to the private
+ + *  object pointer. It has a cleanup hook set.
+ + *
+ + *  Then, the for-2 clause is checked. As _pobj is non-NULL, _pobj->locked_at
+ + *  is initialized to the _pobj address to ensure that the cleanup hook unlocks
+ + *  the right object.
+ + *
+ + *  Now the user block is executed. If it ends by break or return, the cleanup
+ + *  hook fires for _pobj, triggering object unlock.
+ + *
+ + *  If the user block executed completely, the for-3 clause is run, executing
+ + *  the cleanup hook directly and then deactivating it by setting _pobj to NULL.
+ + *
+ + *  Finally, the for-2 clause is checked again but now with _pobj being NULL,
+ + *  causing the loop to end. As the object has already been unlocked, nothing
+ + *  happens after leaving the context.
+ + *
+ + *  #define FOO_LOCKED(foo, fpp)      LOBJ_LOCKED(foo, fpp, foo, bar)
+ + *
+ + *  Then the previous code can be modified like this:
+ + *
+ + *  void foo_frobnicate_safer(foo *f)
+ + *  {
+ + *    // Unlocked context
+ + *    ...
+ + *    FOO_LOCKED(f, fpp)
+ + *    {
+ + *    // Locked context, fpp is valid here
+ + *
+ + *    if (something) return;  // This implicitly unlocks
+ + *    if (whatever) break;    // This unlocks too
+ + *
+ + *      // Finishing context with no unlock at all
+ + *    }
+ + *
+ + *    // Unlocked context
+ + *    ...
+ + *
+ + *    // Locking once again without an explicit block
+ + *    FOO_LOCKED(f, fpp)
+ + *    do_something(fpp);
+ + *
+ + *    // Here is fpp invalid and the object is back unlocked.
+ + *    ...
+ + *  }
+ + *
+ + *
+ + *  For many reasons, a lock-check macro is handy.
+ + *
+ + *  #define FOO_IS_LOCKED(foo)        LOBJ_IS_LOCKED(foo, bar)
+ + */
+ +
+ +#define LOBJ_IS_LOCKED(_obj, _level)  DOMAIN_IS_LOCKED(_level, (_obj)->lock)
+ +
+ +/*
+ + *  An example implementation is available in lib/locking_test.c
+ + */
+ +
+ +
+ +/*
+ + *  Please don't use this macro unless you at least try to prove that
+ + *  it's completely safe. It's a can of worms.
+ + *
+ + *  NEVER RETURN OR BREAK FROM THIS MACRO, it will crash.
+ + */
+ +
+ +#define LOBJ_UNLOCKED_TEMPORARILY(_obj, _pobj, _stem, _level) \
+ +  for (union _stem *_obj = SKIP_BACK(union _stem, priv, _pobj), **_lataux = (union _stem **) _pobj->locked_at; \
+ +      _obj ? (_pobj->locked_at = NULL, LOBJ_UNLOCK_SIMPLE(_obj, _level), _obj) : NULL; \
+ +      LOBJ_LOCK_SIMPLE(_obj, _level), _pobj->locked_at = (struct _stem##_private **) _lataux, _obj = NULL)
+ +
+ +/*
+ + *  Get the locked object when the lock is already taken
+ + */
+ +
+ +#define LOBJ_PRIV(_obj, _level) \
+ +  ({ ASSERT_DIE(DOMAIN_IS_LOCKED(_level, (_obj)->lock)); &(_obj)->priv; })
+ +
+ +
+ +/*
+ + * RCU retry unwinder
+ + *
+ + * Start a retriable operation with RCU_ANCHOR() and pass the _i object along
+ + * with the code which may then call RCU_RETRY() to return back to RCU_ANCHOR
+ + * and try again.
+ + */
+ +
+ +/* State of one RCU_ANCHOR() */
+ +struct rcu_unwinder {
+ +  struct lock_order locking_stack;    /* Copy of the thread's locking stack taken at the anchor */
+ +  u32 retry;                          /* Incremented on every non-fast retry */
+ +  u8 fast;                            /* Set by RCU_RETRY_FAST(); such a retry skips yielding */
+ +  jmp_buf buf;                        /* Where RCU_RETRY() jumps back to */
+ +};
+ +
+ +/*
+ + * Cleanup hook attached by RCU_ANCHOR() to the unwinder variable;
+ + * it calls rcu_read_unlock(). The unwinder itself is not used.
+ + */
+ +static inline void _rcu_unwinder_unlock_(struct rcu_unwinder *o UNUSED)
+ +{
+ +  rcu_read_unlock();
+ +}
+ +
+ +#define RCU_UNWIND_WARN       4096
+ +
+ +/*
+ + * RCU_ANCHOR(_i): declare the unwinder _i (a pointer to a local struct
+ + * rcu_unwinder with a cleanup hook calling rcu_read_unlock()), store the
+ + * current locking stack into it and call rcu_read_lock().
+ + *
+ + * When RCU_RETRY(_i) jumps back here, rcu_read_unlock() is called, the
+ + * locking stack is unwound to the stored state and, unless the retry was
+ + * requested as fast, the thread yields. Every RCU_UNWIND_WARN-th non-fast
+ + * retry is logged. The retry counter is u32, hence %u in the log format.
+ + */
+ +#define RCU_ANCHOR(_i)                                                \
+ +  CLEANUP(_rcu_unwinder_unlock_) struct rcu_unwinder _s##_i = {};     \
+ +  struct rcu_unwinder *_i = &_s##_i;                                  \
+ +  if (setjmp(_i->buf)) {                                              \
+ +    rcu_read_unlock();                                                \
+ +    locking_unwind(&_i->locking_stack);                               \
+ +    if (_i->fast) _i->fast = 0;                                       \
+ +    else {                                                            \
+ +      birdloop_yield();                                               \
+ +      if (!(++_i->retry % RCU_UNWIND_WARN))                           \
+ +        log(L_WARN "Suspiciously many RCU_ANCHORs retried (%u)"       \
+ +            " at %s:%d", _i->retry, __FILE__, __LINE__);              \
+ +    }                                                                 \
+ +  }                                                                   \
+ +  _i->locking_stack = locking_stack;                                  \
+ +  rcu_read_lock();                                                    \
+ +
+ +/* Jump back to the anchor of _i; calling this with a NULL unwinder is a bug() */
+ +#define RCU_RETRY(_i) do { if (_i) longjmp(_i->buf, 1); else bug("No rcu retry allowed here"); } while (0)
+ +
+ +/* The same, but mark the retry as fast so that the anchor does not yield */
+ +#define RCU_RETRY_FAST(_i) do { (_i)->fast++; RCU_RETRY(_i); } while (0)
+ +
+ +/* A NULL unwinder, to be passed where no retry is possible */
+ +#define RCU_WONT_RETRY        ((struct rcu_unwinder *) NULL)
+ +#endif
diff --cc lib/locking_test.c

index 38faed61b0cadb4ca52d78a605963feb103fb17c,0000000000000000000000000000000000000000..8792194bc1d9f7ee6dab683d1d3c2f55436f7545

mode 100644,000000..100644
--- 1/lib/locking_test.c
--- /dev/null
+++ b/lib/locking_test.c
@@@ -1,180 -1,0 +1,180 @@@
-   for (_Bool sorted = 0; !sorted++; )
+ +#include "test/birdtest.h"
+ +#include "test/bt-utils.h"
+ +
+ +#include "lib/locking.h"
+ +#include <stdatomic.h>
+ +#include <pthread.h>
+ +
+ +/* Public fields of the test object: a name, an atomic counter and the lock */
+ +#define FOO_PUBLIC \
+ +  const char *name;   \
+ +  _Atomic uint counter;       \
+ +  DOMAIN(proto) lock; \
+ +
+ +/* Private view: the public fields plus fields reachable only with the lock held */
+ +struct foo_private {
+ +  struct { FOO_PUBLIC; };
+ +  struct foo_private **locked_at;     /* Auxiliary field used by the LOBJ_* locking macros */
+ +  uint private_counter;               /* Incremented under the lock by inc_private() */
+ +};
+ +
+ +/* The object itself: public fields overlaid with the private view */
+ +typedef union foo {
+ +  struct { FOO_PUBLIC; };
+ +  struct foo_private priv;
+ +} foo;
+ +
+ +LOBJ_UNLOCK_CLEANUP(foo, proto);
+ +#define FOO_LOCK(_foo, _fpp)  LOBJ_LOCK(_foo, _fpp, foo, proto)
+ +#define FOO_LOCKED(_foo, _fpp)        LOBJ_LOCKED(_foo, _fpp, foo, proto)
+ +#define FOO_IS_LOCKED(_foo)   LOBJ_IS_LOCKED(_foo, proto)
+ +
+ +/* Atomically bump the public counter without locking; returns the new value. */
+ +static uint
+ +inc_public(foo *f)
+ +{
+ +  uint before = atomic_fetch_add_explicit(&f->counter, 1, memory_order_relaxed);
+ +  return before + 1;
+ +}
+ +
+ +/*
+ + * Bump the private counter with the object locked; returns the new value.
+ + * The return from inside FOO_LOCKED unlocks implicitly.
+ + */
+ +static uint
+ +inc_private(foo *f)
+ +{
+ +  FOO_LOCKED(f, fp)
+ +  {
+ +    fp->private_counter += 1;
+ +    return fp->private_counter;
+ +  }
+ +
+ +  /* The locked block above always returns */
+ +  bug("Returning always");
+ +}
+ +
+ +#define BLOCKCOUNT  4096
+ +#define THREADS           16
+ +#define REPEATS           128
+ +
+ +/*
+ + * Worker of the locking test: REPEATS rounds of BLOCKCOUNT increments each,
+ + * odd rounds hitting the public counter, even rounds the private one.
+ + */
+ +static void *
+ +thread_run(void *_foo)
+ +{
+ +  foo *f = _foo;
+ +
+ +  for (int round = 0; round < REPEATS; round++)
+ +  {
+ +    if (round % 2)
+ +    {
+ +      for (int n = 0; n < BLOCKCOUNT; n++)
+ +        inc_public(f);
+ +    }
+ +    else
+ +    {
+ +      for (int n = 0; n < BLOCKCOUNT; n++)
+ +        inc_private(f);
+ +    }
+ +  }
+ +
+ +  return NULL;
+ +}
+ +
+ +/*
+ + * Test of domain locking: THREADS threads increment the public (atomic)
+ + * and the private (locked) counter the same number of times; both counters
+ + * must end up equal to the expected total.
+ + */
+ +static int
+ +t_locking(void)
+ +{
+ +  pthread_t thr[THREADS];
+ +  foo f = { .lock = DOMAIN_NEW(proto), };
+ +
+ +  for (int i=0; i<THREADS; i++)
+ +    bt_assert(pthread_create(&thr[i], NULL, thread_run, &f) == 0);
+ +
+ +  for (int i=0; i<THREADS; i++)
+ +    bt_assert(pthread_join(thr[i], NULL) == 0);
+ +
+ +  /* All threads are joined, so the private counter is read without locking */
+ +  bt_assert(f.priv.private_counter == atomic_load_explicit(&f.counter, memory_order_relaxed));
+ +  /* Every thread spends half of its rounds on each counter */
+ +  bt_assert(f.priv.private_counter == THREADS * BLOCKCOUNT * REPEATS / 2);
+ +
+ +  return 1;
+ +}
+ +
+ +#define RWS_DATASIZE  333
+ +#define RWS_THREADS   128
+ +
+ +struct rws_test_data {
+ +  int data[RWS_DATASIZE];
+ +  rw_spinlock rws[RWS_DATASIZE];
+ +};
+ +
+ +/*
+ + * Worker of the rw spinlock test: a concurrent bubble sort of d->data where
+ + * every element is guarded by its own rw spinlock. Each round first checks
+ + * under read locks whether the array is sorted and then does one swapping
+ + * pass under write locks. The thread ends after a round whose check pass
+ + * has found no inversion.
+ + */
+ +static void *
+ +rwspin_thread_run(void *_rtd)
+ +{
+ +  struct rws_test_data *d = _rtd;
+ +
+ +  /* The condition enters the body when sorted was false and sets it to true
+ +   * on the way; the check pass below resets it when it finds an inversion. */
++  for (bool sorted = 0; !sorted++; )
+ +  {
+ +    /* Check pass, stopping at the first inversion found */
+ +    for (int i=0; (i<RWS_DATASIZE-1) && sorted; i++)
+ +    {
+ +      rws_read_lock(&d->rws[i]);
+ +      rws_read_lock(&d->rws[i+1]);
+ +
+ +      /* A negative value would be the in-flux marker written by the swap pass */
+ +      ASSERT_DIE(d->data[i] >= 0);
+ +      ASSERT_DIE(d->data[i+1] >= 0);
+ +      if (d->data[i] > d->data[i+1])
+ +      sorted = 0;
+ +
+ +      rws_read_unlock(&d->rws[i+1]);
+ +      rws_read_unlock(&d->rws[i]);
+ +    }
+ +
+ +    /* Swap pass over all neighbouring pairs */
+ +    for (int i=0; (i<RWS_DATASIZE-1); i++)
+ +    {
+ +      rws_write_lock(&d->rws[i]);
+ +      rws_write_lock(&d->rws[i+1]);
+ +
+ +      int first = d->data[i];
+ +      int second = d->data[i+1];
+ +
+ +      ASSERT_DIE(first >= 0);
+ +      ASSERT_DIE(second >= 0);
+ +
+ +      /* Mark both places as in flux; nobody may see this while we hold the write locks */
+ +      d->data[i] = d->data[i+1] = -1;
+ +
+ +      if (first > second)
+ +      {
+ +      d->data[i] = second;
+ +      d->data[i+1] = first;
+ +      }
+ +      else
+ +      {
+ +      d->data[i] = first;
+ +      d->data[i+1] = second;
+ +      }
+ +
+ +      rws_write_unlock(&d->rws[i+1]);
+ +      rws_write_unlock(&d->rws[i]);
+ +    }
+ +  }
+ +
+ +  return NULL;
+ +}
+ +
+ +/*
+ + * Test of rw spinlocks: RWS_THREADS threads concurrently sort an array
+ + * initialized in descending order; afterwards it must hold 0 .. RWS_DATASIZE-1
+ + * in ascending order.
+ + */
+ +static int
+ +t_rwspin(void)
+ +{
+ +  struct rws_test_data d;
+ +
+ +  /* Setup an array to sort */
+ +  for (int i=0; i<RWS_DATASIZE; i++)
+ +    d.data[i] = RWS_DATASIZE-i-1;
+ +
+ +  /* Spinlock for every place */
+ +  for (int i=0; i<RWS_DATASIZE; i++)
+ +    rws_init(&d.rws[i]);
+ +
+ +  /* Start the threads */
+ +  pthread_t thr[RWS_THREADS];
+ +  for (int i=0; i<RWS_THREADS; i++)
+ +    bt_assert(pthread_create(&thr[i], NULL, rwspin_thread_run, &d) == 0);
+ +
+ +  /* Wait for the threads */
+ +  for (int i=0; i<RWS_THREADS; i++)
+ +    bt_assert(pthread_join(thr[i], NULL) == 0);
+ +
+ +  /* Check the result: every value sits at its own index */
+ +  for (int i=0; i<RWS_DATASIZE; i++)
+ +    bt_assert(d.data[i] == i);
+ +
+ +  return 1;
+ +}
+ +
+ +
+ +/* Test entry point: initialize the test framework and run both locking tests. */
+ +int
+ +main(int argc, char **argv)
+ +{
+ +  bt_init(argc, argv);
+ +  bt_bird_init();
+ +
+ +  bt_test_suite(t_locking, "Testing locks");
+ +  bt_test_suite(t_rwspin, "Testing rw spinlock");
+ +
+ +  return bt_exit_value();
+ +}
diff --cc lib/netindex.c

index 6c0bba62960cc381fa0e618dac7d3660cb2e22cd,0000000000000000000000000000000000000000..7596bffd5ba10d8add8b06f902068f1549025870

mode 100644,000000..100644
--- 1/lib/netindex.c
--- /dev/null
+++ b/lib/netindex.c
@@@ -1,342 -1,0 +1,342 @@@
- static _Bool
+ +/*
+ + *    BIRD Internet Routing Daemon -- Semi-global index of nets
+ + *
+ + *    (c) 2023       Maria Matejka <mq@jmq.cz>
+ + *
+ + *    Can be freely distributed and used under the terms of the GNU GPL.
+ + */
+ +
+ +#include "lib/birdlib.h"
+ +#include "lib/netindex_private.h"
+ +
+ +/* Initial number of entries in the index -> netindex block */
+ +#define NETINDEX_INIT_BLOCK_SIZE      128
+ +
+ +/* Parameters of the NETINDEX hash, consumed by the SPINHASH_* macros */
+ +/* The key is the stored hash value together with the net address */
+ +#define NETINDEX_KEY(n)               (n)->hash, (n)->addr
+ +#define NETINDEX_NEXT(n)      (n)->next
+ +/* Keys are equal when both the hash values and the nets are equal */
+ +#define NETINDEX_EQ(h,n,i,o)  ((h == i) && net_equal(n,o))
+ +/* The hash function just returns the hash value carried in the key */
+ +#define NETINDEX_FN(h,n)      (h)
+ +#define NETINDEX_ORDER                12 /* Initial */
+ +
+ +#define NETINDEX_REHASH               netindex_rehash
+ +/* NOTE(review): presumably the rehash thresholds, steps and order limits
+ + * as expected by the hash macros -- confirm against their definition */
+ +#define NETINDEX_PARAMS               /8, *2, 2, 2, 12, 28
+ +
+ +/*
+ + * Rehash hook of the netindex spinhash (the function is named
+ + * netindex_rehash via the NETINDEX_REHASH macro). Asks the hash how many
+ + * steps to resize by, resizes up or down accordingly and finishes the
+ + * rehash. Progress is logged at the trace level.
+ + */
+ +static void NETINDEX_REHASH(void *_v) {
+ +  log(L_TRACE "Netindex rehash: begin");
+ +  netindex_spinhash *v = _v;
+ +  int step;
+ +  /* Stores the requested resize into step */
+ +  SPINHASH_REHASH_PREPARE(v,NETINDEX,struct netindex,step);
+ +
+ +  log(L_TRACE "Netindex rehash: step=%d", step);
+ +  /* Zero step: return right away, without SPINHASH_REHASH_FINISH */
+ +  if (!step)  return;
+ +
+ +  /* Positive step goes up, negative step goes down by its absolute value */
+ +  if (step > 0) SPINHASH_REHASH_UP(v,NETINDEX,struct netindex,step);
+ +  if (step < 0) SPINHASH_REHASH_DOWN(v,NETINDEX,struct netindex,-step);
+ +
+ +  log(L_TRACE "Netindex rehash: time to finish");
+ +  SPINHASH_REHASH_FINISH(v,NETINDEX);
+ +  log(L_TRACE "Netindex rehash: done");
+ +}
+ +
+ +static void netindex_hash_cleanup(void *netindex_hash);
+ +
+ +/*
+ + * Do a lock-revive immediately followed by an unlock on the use count of
+ + * the given netindex, and hand the netindex back. NULL is passed through
+ + * untouched.
+ + */
+ +static struct netindex *
+ +net_lock_revive_unlock(netindex_hash *h, struct netindex *i)
+ +{
+ +  if (i)
+ +  {
+ +    lfuc_lock_revive(&i->uc);
+ +    lfuc_unlock(&i->uc, h->cleanup_list, &h->cleanup_event);
+ +  }
+ +
+ +  return i;
+ +}
+ +
+ +/*
+ + * Index initialization
+ + */
+ +/*
+ + * Create a new netindex hash.
+ + *
+ + * @sp: parent pool; the hash gets its own pool below it
+ + * @cleanup_target: event list for the hash cleanup event and the spinhash
+ + * @type: net type of the addresses to be indexed
+ + *
+ + * The hash gets its own attrs-level domain, which is held locked during
+ + * the setup. Returns the public part of the new hash.
+ + */
+ +netindex_hash *
+ +netindex_hash_new(pool *sp, event_list *cleanup_target, u8 type)
+ +{
+ +  DOMAIN(attrs) dom = DOMAIN_NEW_RCU_SYNC(attrs);
+ +  LOCK_DOMAIN(attrs, dom);
+ +
+ +  pool *p = rp_new(sp, dom.attrs, "Network index");
+ +
+ +  /* Zeroed allocation, every field not set below starts as zero / NULL */
+ +  struct netindex_hash_private *nh = mb_allocz(p, sizeof *nh);
+ +  nh->lock = dom;
+ +  nh->pool = p;
+ +  nh->net_type = type;
+ +
+ +  /* A slab is used only for net types with a nonzero address length */
+ +  if (net_addr_length[type])
+ +    nh->slab = sl_new(p, sizeof (struct netindex) + net_addr_length[type]);
+ +  else
+ +    nh->slab = NULL;
+ +
+ +  SPINHASH_INIT(nh->hash, NETINDEX, p, cleanup_target);
+ +
+ +  /* The initial index -> netindex block, all empty */
+ +  atomic_store_explicit(&nh->block_size, NETINDEX_INIT_BLOCK_SIZE, memory_order_release);
+ +  atomic_store_explicit(&nh->block,
+ +      mb_allocz(p, NETINDEX_INIT_BLOCK_SIZE * sizeof *nh->block),
+ +      memory_order_release);
+ +
+ +  /* Map of used indexes */
+ +  hmap_init(&nh->id_map, p, 128);
+ +
+ +  /* The cleanup event gets the private hash as its argument */
+ +  nh->cleanup_list = cleanup_target;
+ +  nh->cleanup_event = (event) {
+ +    .hook = netindex_hash_cleanup,
+ +    nh
+ +  };
+ +
+ +  UNLOCK_DOMAIN(attrs, dom);
+ +  return SKIP_BACK(netindex_hash, priv, nh);
+ +}
+ +
+ +/*
+ + * Second phase of the cleanup for a batch of netindexes which have already
+ + * been taken out of the block and the hash.
+ + *
+ + * @nh: the hash, locked by the caller
+ + * @block: the index -> netindex block
+ + * @removed: the batch of removed netindexes
+ + * @cnt: number of netindexes in the batch
+ + *
+ + * After synchronize_rcu(), a netindex which is still finished is freed and
+ + * its index released; a netindex which got used in the meantime is put back
+ + * into the block and the hash. Returns the number of netindexes put back.
+ + */
+ +static uint
+ +netindex_hash_cleanup_removed(struct netindex_hash_private *nh, struct netindex * _Atomic *block, struct netindex **removed, uint cnt)
+ +{
+ +  synchronize_rcu();
+ +
+ +  uint kept = 0;
+ +  for (uint q = 0; q < cnt; q++)
+ +  {
+ +    struct netindex *ni = removed[q];
+ +
+ +    /* Now no reader can possibly still have the old pointer,
+ +     * unless somebody found it inbetween and ref'd it. */
+ +    if (!lfuc_finished(&ni->uc))
+ +    {
+ +      /* Collision, return the netindex back. */
+ +      /* The block slot must have stayed empty since the removal */
+ +      ASSERT_DIE(NULL == atomic_exchange_explicit(&block[ni->index], ni, memory_order_acq_rel));
+ +      SPINHASH_INSERT(nh->hash, NETINDEX, ni);
+ +      kept++;
+ +      continue;
+ +    }
+ +
+ +    /* Now the netindex is definitely obsolete, we can free it */
+ +    hmap_clear(&nh->id_map, ni->index);
+ +
+ +    /* Free by the same allocator the netindex came from */
+ +    if (nh->slab)
+ +      sl_free(ni);
+ +    else
+ +      mb_free(ni);
+ +  }
+ +
+ +  return kept;
+ +}
+ +
+ +/*
+ + * Cleanup event hook of the netindex hash; @_nh is the private hash.
+ + *
+ + * Walks the whole block, takes every finished netindex out of the block and
+ + * the hash and hands them in batches of up to REMOVED_MAX to
+ + * netindex_hash_cleanup_removed() to be freed or put back.
+ + *
+ + * If the hash has been requested to be deleted and no netindex was kept,
+ + * the hash itself is freed together with its pool and domain, and the
+ + * deletion event is sent to the requested target.
+ + */
+ +static void
+ +netindex_hash_cleanup(void *_nh)
+ +{
+ +  struct netindex_hash_private *nh = _nh;
+ +
+ +  /* Local copy of the domain; needed after nh is freed with its pool */
+ +  DOMAIN(attrs) dom = nh->lock;
+ +  LOCK_DOMAIN(attrs, dom);
+ +
+ +  /* Number of netindexes still in use */
+ +  uint kept = 0;
+ +
+ +  uint bs = atomic_load_explicit(&nh->block_size, memory_order_relaxed);
+ +  struct netindex * _Atomic *block = atomic_load_explicit(&nh->block, memory_order_relaxed);
+ +
+ +#define REMOVED_MAX 256
+ +  struct netindex *removed[REMOVED_MAX];
+ +  uint removed_cnt = 0;
+ +
+ +  for (uint i = 0; i < bs; i++)
+ +  {
+ +    struct netindex *ni = atomic_load_explicit(&block[i], memory_order_acquire);
+ +    if (!ni)
+ +      continue;
+ +
+ +    /* We may use the acquired netindex pointer as we are
+ +     * the only process which deletes them */
+ +    ASSERT_DIE(i == ni->index);
+ +
+ +    /* Check finished */
+ +    if (!lfuc_finished(&ni->uc))
+ +    {
+ +      kept++;
+ +      continue;
+ +    }
+ +
+ +    /* Looks finished, try dropping */
+ +    ASSERT_DIE(ni == atomic_exchange_explicit(&block[i], NULL, memory_order_acq_rel));
+ +    SPINHASH_REMOVE(nh->hash, NETINDEX, ni);
+ +
+ +    /* Store into the removed-block */
+ +    removed[removed_cnt++] = ni;
+ +
+ +    /* If removed-block is full, flush it */
+ +    if (removed_cnt == REMOVED_MAX)
+ +    {
+ +      kept += netindex_hash_cleanup_removed(nh, block, removed, removed_cnt);
+ +      removed_cnt = 0;
+ +    }
+ +  }
+ +
+ +  /* Flush remaining netindexes */
+ +  if (removed_cnt)
+ +    kept += netindex_hash_cleanup_removed(nh, block, removed, removed_cnt);
+ +
+ +  /* Return now unless we're deleted */
+ +  if (kept || !nh->deleted_event)
+ +  {
+ +    UNLOCK_DOMAIN(attrs, dom);
+ +    return;
+ +  }
+ +
+ +  ev_postpone(&nh->cleanup_event);
+ +
+ +  /* Remember whom to notify before nh gets freed */
+ +  event *e = nh->deleted_event;
+ +  event_list *t = nh->deleted_target;
+ +
+ +  /* Check cleanliness */
+ +  SPINHASH_WALK(nh->hash, NETINDEX, i)
+ +    bug("Stray netindex in deleted hash");
+ +  SPINHASH_WALK_END;
+ +
+ +  /* Cleanup the spinhash itself */
+ +  SPINHASH_FREE(nh->hash);
+ +
+ +  /* Pool free is enough to drop everything else */
+ +  /* (nh itself was allocated from this pool; it must not be touched below) */
+ +  rp_free(nh->pool);
+ +
+ +  /* And only the lock remains */
+ +  UNLOCK_DOMAIN(attrs, dom);
+ +  DOMAIN_FREE(attrs, dom);
+ +
+ +  /* Notify the requestor */
+ +  ev_send(t, e);
+ +}
+ +
+ +/*
+ + * Request deletion of the hash. The event @e and its target list @t are
+ + * stored in the hash and the cleanup event is sent; the cleanup hook frees
+ + * the hash and sends @e to @t once it finds no netindex in use.
+ + */
+ +void
+ +netindex_hash_delete(netindex_hash *h, event *e, event_list *t)
+ +{
+ +  /* hp is the private part of the hash */
+ +  NH_LOCK(h, hp);
+ +
+ +  hp->deleted_event = e;
+ +  hp->deleted_target = t;
+ +
+ +  ev_send(hp->cleanup_list, &hp->cleanup_event);
+ +}
+ +
+ +/*
+ + * Private index manipulation
+ + */
+ +/*
+ + * Look the net up in the hash. The net type must match the hash.
+ + * The returned pointer is neither validated against the block nor locked;
+ + * the callers in this file do that themselves.
+ + */
+ +static struct netindex *
+ +net_find_index_fragile(netindex_hash *nh, const net_addr *n)
+ +{
+ +  ASSERT_DIE(n->type == nh->net_type);
+ +
+ +  /* Compute the hash once and pass it as a part of the key */
+ +  u32 h = net_hash(n);
+ +  return SPINHASH_FIND(nh->hash, NETINDEX, h, n);
+ +}
+ +
++static bool
+ +net_validate_index(netindex_hash *h, struct netindex *ni)
+ +{
+ +  struct netindex * _Atomic *block = atomic_load_explicit(&h->block, memory_order_relaxed);
+ +  u32 bs = atomic_load_explicit(&h->block_size, memory_order_relaxed);
+ +
+ +  ASSERT_DIE(ni->index < bs);
+ +  struct netindex *bni = atomic_load_explicit(&block[ni->index], memory_order_acquire);
+ +  return (bni == ni);
+ +}
+ +
+ +/*
+ + * Create a netindex for the net and publish it in the hash and the block.
+ + *
+ + * @hp: the private part of the hash; must not be under deletion
+ + * @n: the net to be indexed
+ + *
+ + * The lowest free index is taken from the id map. If it does not fit into
+ + * the current block, the block is replaced by a larger one; the old block
+ + * is freed only after synchronize_rcu(). Returns the new netindex.
+ + */
+ +static struct netindex *
+ +net_new_index_locked(struct netindex_hash_private *hp, const net_addr *n)
+ +{
+ +  ASSERT_DIE(!hp->deleted_event);
+ +
+ +  /* Allocate the index number */
+ +  u32 i = hmap_first_zero(&hp->id_map);
+ +  hmap_set(&hp->id_map, i);
+ +
+ +  /* Allocate the netindex, by the slab if there is one */
+ +  struct netindex *ni = hp->slab ?
+ +    sl_alloc(hp->slab) :
+ +    mb_alloc(hp->pool, n->length + sizeof *ni);
+ +
+ +  *ni = (struct netindex) {
+ +    .hash = net_hash(n),
+ +    .index = i,
+ +  };
+ +  net_copy(ni->addr, n);
+ +
+ +  SPINHASH_INSERT(hp->hash, NETINDEX, ni);
+ +
+ +  /* Find the block size needed to hold index i */
+ +  struct netindex * _Atomic *block = atomic_load_explicit(&hp->block, memory_order_relaxed);
+ +  u32 bs = atomic_load_explicit(&hp->block_size, memory_order_relaxed);
+ +  u32 nbs = bs;
+ +  while (nbs <= i)
+ +    nbs *= 2;
+ +
+ +  if (nbs > bs)
+ +  {
+ +    /* Allocate all nbs entries: the new size is what gets zeroed below
+ +     * and published, and it may be more than twice the old size. */
+ +    struct netindex * _Atomic *nb = mb_alloc(hp->pool, nbs * sizeof *nb);
+ +    memcpy(nb, block, bs * sizeof *nb);
+ +    memset(&nb[bs], 0, (nbs - bs) * sizeof *nb);
+ +
+ +    /* Publish the new block and size; nobody else may have changed them */
+ +    ASSERT_DIE(block == atomic_exchange_explicit(&hp->block, nb, memory_order_acq_rel));
+ +    ASSERT_DIE(bs == atomic_exchange_explicit(&hp->block_size, nbs, memory_order_acq_rel));
+ +
+ +    /* Wait for the readers before freeing the old block */
+ +    synchronize_rcu();
+ +
+ +    mb_free(block);
+ +    block = nb;
+ +
+ +    hp->block_epoch++;
+ +  }
+ +
+ +  /* Finally store the netindex into its block slot */
+ +  ASSERT_DIE(i < nbs);
+ +  atomic_store_explicit(&block[i], ni, memory_order_release);
+ +
+ +  return ni;
+ +}
+ +
+ +
+ +/*
+ + * Public entry points
+ + */
+ +
+ +/* Lock the use count of the netindex; the hash argument is not used. */
+ +void
+ +net_lock_index(netindex_hash *h UNUSED, struct netindex *i)
+ +{
+ +  lfuc_lock(&i->uc);
+ +}
+ +
+ +/*
+ + * Unlock the use count of the netindex, passing the hash's cleanup list
+ + * and cleanup event to the unlock routine.
+ + */
+ +void
+ +net_unlock_index(netindex_hash *h, struct netindex *i)
+ +{
+ +  lfuc_unlock(&i->uc, h->cleanup_list, &h->cleanup_event);
+ +}
+ +
+ +/*
+ + * Find an existing netindex of the net.
+ + * Returns NULL when there is none in the hash or when the one found is
+ + * no longer the one stored in the block.
+ + */
+ +struct netindex *
+ +net_find_index(netindex_hash *h, const net_addr *n)
+ +{
+ +  RCU_ANCHOR(u);
+ +
+ +  struct netindex *ni = net_find_index_fragile(h, n);
+ +  if (!ni)
+ +    return NULL;
+ +
+ +  if (!net_validate_index(h, ni))
+ +    return NULL;
+ +
+ +  return net_lock_revive_unlock(h, ni);
+ +}
+ +
+ +/*
+ + * Find the netindex of the net, creating it if there is none yet.
+ + * The lookup is first tried without locking the hash.
+ + */
+ +struct netindex *
+ +net_get_index(netindex_hash *h, const net_addr *n)
+ +{
+ +  struct netindex *existing = net_find_index(h, n);
+ +  if (existing)
+ +    return existing;
+ +
+ +  NH_LOCK(h, hp);
+ +
+ +  /* Look once more with the hash locked, somebody may have added it inbetween */
+ +  struct netindex *ni = net_find_index_fragile(h, n);
+ +  if (!ni)
+ +    ni = net_new_index_locked(hp, n);
+ +
+ +  return net_lock_revive_unlock(h, ni);
+ +}
+ +
+ +struct netindex net_index_out_of_range;
+ +
+ +/*
+ + * Translate an index number to its netindex.
+ + * Returns &net_index_out_of_range when the index is beyond the current
+ + * block and NULL when the block slot is empty.
+ + */
+ +struct netindex *
+ +net_resolve_index(netindex_hash *h, u32 i)
+ +{
+ +  RCU_ANCHOR(u);
+ +
+ +  struct netindex * _Atomic *cur_block = atomic_load_explicit(&h->block, memory_order_relaxed);
+ +  u32 cur_size = atomic_load_explicit(&h->block_size, memory_order_relaxed);
+ +
+ +  if (i >= cur_size)
+ +    return &net_index_out_of_range;
+ +
+ +  struct netindex *ni = atomic_load_explicit(&cur_block[i], memory_order_acquire);
+ +  if (!ni)
+ +    return NULL;
+ +
+ +  return net_lock_revive_unlock(h, ni);
+ +}
diff --cc lib/rcu.c

index 212d166a8ac7a366cdc471b95af4460649a303cc,0000000000000000000000000000000000000000..25c575f1f44897d9e0a85ca2d9579ad654bb691a

mode 100644,000000..100644
--- 1/lib/rcu.c
--- /dev/null
+++ b/lib/rcu.c
@@@ -1,110 -1,0 +1,110 @@@
-     _Bool critical = 0;
+ +/*
+ + *    BIRD Library -- Read-Copy-Update Basic Operations
+ + *
+ + *    (c) 2021 Maria Matejka <mq@jmq.cz>
+ + *    (c) 2021 CZ.NIC z.s.p.o.
+ + *
+ + *    Can be freely distributed and used under the terms of the GNU GPL.
+ + *    Note: all the relevant patents shall be expired.
+ + *
+ + *    Using the Supplementary Material for User-Level Implementations of Read-Copy-Update
+ + *    by Matthieu Desnoyers, Paul E. McKenney, Alan S. Stern, Michel R. Dagenais and Jonathan Walpole
+ + *    obtained from https://www.efficios.com/pub/rcu/urcu-supp-accepted.pdf
+ + */
+ +
+ +#include "lib/rcu.h"
+ +#include "lib/io-loop.h"
+ +#include "lib/locking.h"
+ +
+ +_Atomic u64 rcu_global_phase = RCU_GP_PHASE;
+ +_Thread_local struct rcu_thread this_rcu_thread;
+ +_Thread_local uint rcu_blocked;
+ +
+ +static struct rcu_thread * _Atomic rcu_thread_list = NULL;
+ +
+ +static _Atomic uint rcu_thread_spinlock = 0;
+ +
+ +/*
+ + * Check whether thread @t blocks a grace period started in @phase.
+ + *
+ + * The control word holds the read-lock nesting count in its low bits
+ + * (RCU_NEST_MASK) and the phase the reader entered in above them. A thread
+ + * is critical when it is inside a read section (nonzero nesting count) and
+ + * its recorded phase is not newer than @phase.
+ + *
+ + * The control word is loaded as u64 to match the type of rcu_thread.ctl;
+ + * a narrower local would cut off the upper phase bits before the comparison
+ + * with the 64-bit @phase.
+ + *
+ + * Returns nonzero when the thread is critical, zero otherwise.
+ + */
+ +static int
+ +rcu_critical(struct rcu_thread *t, u64 phase)
+ +{
+ +  u64 val = atomic_load_explicit(&t->ctl, memory_order_acquire);
+ +  return
+ +    (val & RCU_NEST_MASK) /* Active */
+ +    && ((val & ~(u64) RCU_NEST_MASK) <= phase); /* In an older phase */
+ +}
+ +
+ +/*
+ + * Wait until no registered thread is inside a read section that began
+ + * before this call.
+ + *
+ + * Advances the global phase and then repeatedly walks rcu_thread_list under
+ + * rcu_thread_spinlock, yielding via birdloop_yield() between attempts, until
+ + * rcu_critical() reports no thread for the pre-increment phase.
+ + * Calls bug() when neither rcu_blocked is set nor the locking-stack check
+ + * below passes.
+ + */
+ +void
+ +synchronize_rcu(void)
+ +{
+ +  if (!rcu_blocked && (last_locked > &locking_stack.meta))
+ +    bug("Forbidden to synchronize RCU unless an appropriate lock is taken");
+ +
+ +  /* Increment phase; fetch_add returns the value before the increment */
+ +  u64 phase = atomic_fetch_add_explicit(&rcu_global_phase, RCU_GP_PHASE, memory_order_acq_rel);
+ +
+ +  while (1) {
+ +    /* Spinlock */
+ +    while (atomic_exchange_explicit(&rcu_thread_spinlock, 1, memory_order_acq_rel))
+ +      birdloop_yield();
+ +
+ +    /* Check all threads */
++    bool critical = 0;
+ +    for (struct rcu_thread * _Atomic *tp = &rcu_thread_list, *t;
+ +      t = atomic_load_explicit(tp, memory_order_acquire);
+ +      tp = &t->next)
+ +      /* Found a critical; the assignment inside the condition is intentional */
+ +      if (critical = rcu_critical(t, phase))
+ +      break;
+ +
+ +    /* Unlock; the exchange must return the locked value */
+ +    ASSERT_DIE(atomic_exchange_explicit(&rcu_thread_spinlock, 0, memory_order_acq_rel));
+ +
+ +    /* Done if no critical */
+ +    if (!critical)
+ +      return;
+ +
+ +    /* Wait and retry if critical */
+ +    birdloop_yield();
+ +  }
+ +}
+ +
+ +/*
+ + * Register the calling thread for RCU by pushing its this_rcu_thread
+ + * record to the head of rcu_thread_list.
+ + */
+ +void
+ +rcu_thread_start(void)
+ +{
+ +  /* Insert this thread to the thread list, no spinlock is needed */
+ +  struct rcu_thread *next = atomic_load_explicit(&rcu_thread_list, memory_order_acquire);
+ +  /* On a failed compare-exchange, next is refreshed with the current head
+ +   * and stored into this_rcu_thread.next again before the retry */
+ +  do atomic_store_explicit(&this_rcu_thread.next, next, memory_order_relaxed);
+ +  while (!atomic_compare_exchange_strong_explicit(
+ +      &rcu_thread_list, &next, &this_rcu_thread,
+ +      memory_order_acq_rel, memory_order_acquire));
+ +}
+ +
+ +/*
+ + * Unregister the calling thread: unlink this_rcu_thread from
+ + * rcu_thread_list while holding rcu_thread_spinlock.
+ + * Calls bug() if the record is not found in the list.
+ + */
+ +void
+ +rcu_thread_stop(void)
+ +{
+ +  /* Spinlock */
+ +  while (atomic_exchange_explicit(&rcu_thread_spinlock, 1, memory_order_acq_rel))
+ +    birdloop_yield();
+ +
+ +  /* Find this thread; tp always points to the link leading to t */
+ +  for (struct rcu_thread * _Atomic *tp = &rcu_thread_list, *t;
+ +      t = atomic_load_explicit(tp, memory_order_acquire);
+ +      tp = &t->next)
+ +    if (t == &this_rcu_thread)
+ +    {
+ +      /* Remove this thread */
+ +      atomic_store_explicit(tp, atomic_load_explicit(&t->next, memory_order_acquire), memory_order_release);
+ +
+ +      /* Unlock and go */
+ +      ASSERT_DIE(atomic_exchange_explicit(&rcu_thread_spinlock, 0, memory_order_acq_rel));
+ +      return;
+ +    }
+ +
+ +  /* Reached only when the list ended without a match */
+ +  bug("Failed to find a stopped rcu thread");
+ +}
+ +
+ +/* Initialize RCU by registering the calling thread via rcu_thread_start(). */
+ +void
+ +rcu_init(void)
+ +{
+ +  rcu_thread_start();
+ +}
diff --cc lib/rcu.h

index 214a568b165696e64588e28c11dfeaec6a702fd5,0000000000000000000000000000000000000000..a92440771d0b1d9bf50c7bb7357663216c230d9c

mode 100644,000000..100644
--- 1/lib/rcu.h
--- /dev/null
+++ b/lib/rcu.h
@@@ -1,72 -1,0 +1,72 @@@
- static inline _Bool rcu_read_active(void)
+ +/*
+ + *    BIRD Library -- Read-Copy-Update Basic Operations
+ + *
+ + *    (c) 2021 Maria Matejka <mq@jmq.cz>
+ + *    (c) 2021 CZ.NIC z.s.p.o.
+ + *
+ + *    Can be freely distributed and used under the terms of the GNU GPL.
+ + *    Note: all the relevant patents shall be expired.
+ + */
+ +
+ +#ifndef _BIRD_RCU_H_
+ +#define _BIRD_RCU_H_
+ +
+ +#include "lib/birdlib.h"
+ +#include "lib/lists.h"
+ +#include <stdatomic.h>
+ +
+ +#define RCU_GP_PHASE  0x100
+ +#define RCU_NEST_MASK (RCU_GP_PHASE-1)
+ +#define RCU_NEST_CNT  1
+ +
+ +extern _Atomic u64 rcu_global_phase;
+ +
+ +/*
+ + * Per-thread RCU record. Both control words keep the read-lock nesting
+ + * count in the bits covered by RCU_NEST_MASK; rcu_read_lock() stores the
+ + * global phase plus RCU_NEST_CNT into them on the outermost lock.
+ + */
+ +struct rcu_thread {
+ +  struct rcu_thread * _Atomic next;	/* Link in the list of registered threads */
+ +  u64 local_ctl;			/* Non-atomic control word, kept in step with ctl by lock/unlock */
+ +  _Atomic u64 ctl;			/* Atomic control word */
+ +};
+ +
+ +extern _Thread_local struct rcu_thread this_rcu_thread;
+ +extern _Thread_local uint rcu_blocked;
+ +
+ +/*
+ + * Enter an RCU read section; calls may nest.
+ + *
+ + * Every call raises the nesting count. Only the outermost call additionally
+ + * reloads rcu_global_phase and records it, together with a nesting count of
+ + * RCU_NEST_CNT, in both ctl and local_ctl.
+ + */
+ +static inline void rcu_read_lock(void)
+ +{
+ +  /* Increment the nesting counter */
+ +  atomic_store_explicit(&this_rcu_thread.ctl, (this_rcu_thread.local_ctl += RCU_NEST_CNT), memory_order_release);
+ +
+ +  /* Just nested */
+ +  u64 local_nest = this_rcu_thread.local_ctl & RCU_NEST_MASK;
+ +  if (local_nest > RCU_NEST_CNT)
+ +    return;
+ +
+ +  ASSUME(local_nest == RCU_NEST_CNT);
+ +
+ +  /* Update the phase */
+ +  u64 new = atomic_load_explicit(&rcu_global_phase, memory_order_acquire) + RCU_NEST_CNT;
+ +  atomic_store_explicit(&this_rcu_thread.ctl, new, memory_order_release);
+ +  this_rcu_thread.local_ctl = new;
+ +}
+ +
+ +/*
+ + * Leave one level of an RCU read section.
+ + *
+ + * Both the atomic and the local control word are lowered by RCU_NEST_CNT,
+ + * mirroring the increment done in rcu_read_lock(), so the two stay in step
+ + * whatever value RCU_NEST_CNT has.
+ + */
+ +static inline void rcu_read_unlock(void)
+ +{
+ +  /* Just decrement the nesting counter; when unlocked, nobody cares */
+ +  atomic_fetch_sub_explicit(&this_rcu_thread.ctl, RCU_NEST_CNT, memory_order_acq_rel);
+ +  this_rcu_thread.local_ctl -= RCU_NEST_CNT;
+ +}
+ +
+ +/* Tell whether the calling thread has a nonzero read-lock nesting count
+ + * in its local control word. */
++static inline bool rcu_read_active(void)
+ +{
+ +  return !!(this_rcu_thread.local_ctl & RCU_NEST_MASK);
+ +}
+ +
+ +void synchronize_rcu(void);
+ +
+ +/* Registering and unregistering a birdloop. To be called from birdloop implementation */
+ +void rcu_thread_start(void);
+ +void rcu_thread_stop(void);
+ +
+ +/* Run this from resource init */
+ +void rcu_init(void);
+ +
+ +#endif
diff --cc lib/rcu_test.c

index 7cc69710447d47754e1c921a32f629ecc5a1857e,0000000000000000000000000000000000000000..5b89c78319c584b24761d58283649c61bf86e8de

mode 100644,000000..100644
--- 1/lib/rcu_test.c
--- /dev/null
+++ b/lib/rcu_test.c
@@@ -1,202 -1,0 +1,202 @@@
-     _Bool seen = 0;
+ +/*
+ + *    BIRD Library -- Read-Copy-Update basic test
+ + *
+ + *    (c) 2023 Maria Matejka <mq@jmq.cz>
+ + *    (c) 2023 CZ.NIC z.s.p.o.
+ + *
+ + *    Can be freely distributed and used under the terms of the GNU GPL.
+ + */
+ +
+ +#include "test/birdtest.h"
+ +
+ +#include "lib/rcu.h"
+ +#include "lib/io-loop.h"
+ +
+ +#include <pthread.h>
+ +
+ +#define WRITERS               3
+ +#define READERS               28
+ +
+ +#define WRITER_ROUNDS 20
+ +
+ +/*
+ + * Test object linked into the shared list headed by bin. Each writer owns
+ + * one row of ball[][] and uses one element per round.
+ + */
+ +static struct block {
+ +  struct block * _Atomic next;	/* Next object in the shared list */
+ +  u64 value;			/* Writer order plus round * WRITERS while in use; poison pattern otherwise */
+ +} ball[WRITERS][WRITER_ROUNDS];
+ +
+ +static struct block *_Atomic bin;
+ +static _Atomic uint seen = 0;
+ +
+ +/*
+ + * Reader thread body.
+ + *
+ + * Waits until the shared list becomes non-empty, announces itself through
+ + * the seen counter, and then keeps walking the list inside RCU read sections
+ + * until the list is empty again. Each walk asserts that value % WRITERS + 1
+ + * strictly increases along the list and never exceeds WRITERS.
+ + */
+ +static void *
+ +t_rcu_basic_reader(void *_ UNUSED)
+ +{
+ +  rcu_thread_start();
+ +
+ +  /* Wait for the first writer to insert something */
+ +  while (atomic_load_explicit(&bin, memory_order_acquire) == NULL)
+ +    birdloop_yield();
+ +
+ +  /* Writers wait until this counter reaches READERS */
+ +  atomic_fetch_add_explicit(&seen, 1, memory_order_release);
+ +
+ +  while (atomic_load_explicit(&bin, memory_order_acquire))
+ +  {
+ +    rcu_read_lock();
+ +
+ +    uint mod = 0;
+ +    for (struct block * _Atomic *bp = &bin, *b;
+ +      b = atomic_load_explicit(bp, memory_order_acquire);
+ +      bp = &b->next)
+ +    {
+ +      /* A poisoned block value would most likely break this ordering check */
+ +      uint val = b->value % WRITERS + 1;
+ +      ASSERT_DIE(val > mod);
+ +      mod = val;
+ +    }
+ +
+ +    ASSERT_DIE(mod <= WRITERS);
+ +
+ +    rcu_read_unlock();
+ +  }
+ +
+ +  rcu_thread_stop();
+ +  return NULL;
+ +}
+ +
+ +static _Atomic uint spinlock = 0;
+ +
+ +/* Acquire the test-local writer spinlock, yielding while it is held by
+ + * somebody else. */
+ +static inline void
+ +spin_lock(void)
+ +{
+ +  while (atomic_exchange_explicit(&spinlock, 1, memory_order_acq_rel))
+ +    birdloop_yield();
+ +}
+ +
+ +/* Release the test-local writer spinlock; asserts that it was held. */
+ +static inline void
+ +spin_unlock(void)
+ +{
+ +  ASSERT_DIE(atomic_exchange_explicit(&spinlock, 0, memory_order_acq_rel));
+ +}
+ +
+ +/*
+ + * Writer thread body; @order_ptr carries the writer's number cast to a pointer.
+ + *
+ + * The writer inserts its first block into the shared list, waits until all
+ + * readers have announced themselves, then WRITER_ROUNDS - 1 times replaces
+ + * its current block by a fresh one. After every replacement, and after the
+ + * final removal, it calls synchronize_rcu() and only then overwrites the
+ + * retired block with poison values.
+ + */
+ +static void *
+ +t_rcu_basic_writer(void *order_ptr)
+ +{
+ +  rcu_thread_start();
+ +
+ +  uint order = (uintptr_t) order_ptr;
+ +  struct block *cur = &ball[order][0];
+ +
+ +  /* Insert the object; the list is kept sorted by value % WRITERS */
+ +  spin_lock();
+ +  for (struct block * _Atomic *bp = &bin; bp; )
+ +  {
+ +    struct block *b = atomic_load_explicit(bp, memory_order_acquire);
+ +    if (b && ((b->value % WRITERS) < order))
+ +      bp = &b->next;
+ +    else
+ +    {
+ +      /* The block must still carry the fill pattern set by t_rcu_basic() */
+ +      ASSERT_DIE(cur->value == 0xbabababababababa);
+ +      cur->value = order;
+ +      atomic_store_explicit(&cur->next, b, memory_order_relaxed);
+ +      atomic_store_explicit(bp, cur, memory_order_release);
+ +      break;
+ +    }
+ +  }
+ +  spin_unlock();
+ +
+ +  /* Wait for readers; this is the file-scope atomic counter named seen */
+ +  while (atomic_load_explicit(&seen, memory_order_acquire) != READERS)
+ +    birdloop_yield();
+ +
+ +  /* Update the object */
+ +  for (uint i=1; i<WRITER_ROUNDS; i++)
+ +  {
+ +    struct block *next = &ball[order][i];
+ +    ASSERT_DIE(next->value == 0xbabababababababa);
+ +    next->value = order + i*WRITERS;
+ +
+ +    spin_lock();
-   _Bool seen = 0;
+ +    /* Local flag; shadows the file-scope counter of the same name */
++    bool seen = 0;
+ +    for (struct block * _Atomic *bp = &bin, *b;
+ +      b = atomic_load_explicit(bp, memory_order_acquire);
+ +      bp = &b->next)
+ +      if (b == cur)
+ +      {
+ +      /* Swap next in for cur: next inherits the successor of cur */
+ +      struct block *link = atomic_load_explicit(&b->next, memory_order_relaxed);
+ +      atomic_store_explicit(&next->next, link, memory_order_relaxed);
+ +      atomic_store_explicit(bp, next, memory_order_release);
+ +      seen = 1;
+ +      break;
+ +      }
+ +    ASSERT_DIE(seen);
+ +    spin_unlock();
+ +
+ +    synchronize_rcu();
+ +
+ +    /* Poison the retired block after synchronize_rcu() has returned */
+ +    ASSERT_DIE(cur->value + WRITERS == next->value);
+ +    cur->value = 0xd4d4d4d4d4d4d4d4;
+ +    atomic_store_explicit(&cur->next, ((void *) 0xd8d8d8d8d8d8d8d8), memory_order_relaxed);
+ +
+ +    cur = next;
+ +  }
+ +
+ +  /* Remove the object */
+ +  spin_lock();
+ +  /* Local flag; from here on it shadows the file-scope counter */
++  bool seen = 0;
+ +  for (struct block * _Atomic *bp = &bin, *b;
+ +      b = atomic_load_explicit(bp, memory_order_acquire);
+ +      bp = &b->next)
+ +    if (b == cur)
+ +    {
+ +      struct block *link = atomic_load_explicit(&b->next, memory_order_relaxed);
+ +      atomic_store_explicit(bp, link, memory_order_relaxed);
+ +      seen = 1;
+ +      break;
+ +    }
+ +  ASSERT_DIE(seen);
+ +  spin_unlock();
+ +
+ +  synchronize_rcu();
+ +
+ +  cur->value = 0xd4d4d4d4d4d4d4d4;
+ +  atomic_store_explicit(&cur->next, ((void *) 0xd8d8d8d8d8d8d8d8), memory_order_relaxed);
+ +
+ +  rcu_thread_stop();
+ +  return NULL;
+ +}
+ +
+ +/*
+ + * Basic RCU test: run READERS reader threads against WRITERS writer threads
+ + * and check afterwards that every block has been retired and poisoned.
+ + *
+ + * Thread creation and joining are asserted to succeed. Writers wait until
+ + * exactly READERS readers have announced themselves, so a silently failed
+ + * pthread_create() would leave the test hanging instead of failing.
+ + *
+ + * Returns 1 on success; any failure aborts through ASSERT_DIE().
+ + */
+ +static int
+ +t_rcu_basic(void)
+ +{
+ +  /* Fill pattern checked by the writers before they use a block */
+ +  memset(ball, 0xba, sizeof ball);
+ +
+ +  pthread_t readers[READERS];
+ +  pthread_t writers[WRITERS];
+ +
+ +  for (uint i=0; i<READERS; i++)
+ +    ASSERT_DIE(pthread_create(&readers[i], NULL, t_rcu_basic_reader, NULL) == 0);
+ +
+ +  /* The writer number is passed by value inside the pointer argument */
+ +  for (uintptr_t i=0; i<WRITERS; i++)
+ +    ASSERT_DIE(pthread_create(&writers[i], NULL, t_rcu_basic_writer, (void *) i) == 0);
+ +
+ +  for (uint i=0; i<WRITERS; i++)
+ +    ASSERT_DIE(pthread_join(writers[i], NULL) == 0);
+ +
+ +  for (uint i=0; i<READERS; i++)
+ +    ASSERT_DIE(pthread_join(readers[i], NULL) == 0);
+ +
+ +  /* Every block must have been used, retired and poisoned by its writer */
+ +  for (uint w = 0; w < WRITERS; w++)
+ +    for (uint r = 0; r < WRITER_ROUNDS; r++)
+ +    {
+ +      ASSERT_DIE(ball[w][r].value == 0xd4d4d4d4d4d4d4d4);
+ +      ASSERT_DIE(atomic_load_explicit(&ball[w][r].next, memory_order_relaxed) == (void *) 0xd8d8d8d8d8d8d8d8);
+ +    }
+ +
+ +  return 1;
+ +}
+ +
+ +/* Test entry point: initialize the test framework, run the single RCU test
+ + * suite and return the framework's exit value. */
+ +int main(int argc, char **argv)
+ +{
+ +  bt_init(argc, argv);
+ +
+ +  bt_test_suite(t_rcu_basic, "Basic RCU check");
+ +
+ +  return bt_exit_value();
+ +}
diff --cc lib/route.h

index 0a95663569a5751087316c1f0a53059f5833565b,0000000000000000000000000000000000000000..a0cb75da0dabc8f6048c6c7b0c05442fe2baf06a

mode 100644,000000..100644
--- 1/lib/route.h
--- /dev/null
+++ b/lib/route.h
@@@ -1,610 -1,0 +1,610 @@@
- ea_unset_attr(ea_list **to, _Bool local, const struct ea_class *def)
+ +/*
+ + *    BIRD Internet Routing Daemon -- Routing data structures
+ + *
+ + *    (c) 1998--2000 Martin Mares <mj@ucw.cz>
+ + *    (c) 2022 Maria Matejka <mq@jmq.cz>
+ + *
+ + *    Can be freely distributed and used under the terms of the GNU GPL.
+ + */
+ +
+ +#ifndef _BIRD_LIB_ROUTE_H_
+ +#define _BIRD_LIB_ROUTE_H_
+ +
+ +#undef RT_SOURCE_DEBUG
+ +
+ +#include "lib/type.h"
+ +#include "lib/rcu.h"
+ +#include "lib/hash.h"
+ +#include "lib/event.h"
+ +#include "lib/lockfree.h"
+ +
+ +struct network;
+ +struct proto;
+ +struct cli;
+ +struct rtable_private;
+ +struct rte_storage;
+ +
+ +#define RTE_IN_TABLE_WRITABLE \
+ +  byte pflags;                                /* Protocol-specific flags; may change in-table (!) */ \
+ +  byte flags;                         /* Table-specific flags */ \
+ +  u8 stale_cycle;                     /* Auxiliary value for route refresh; may change in-table (!) */ \
+ +
+ +typedef struct rte {
+ +  RTE_IN_TABLE_WRITABLE;
+ +  u8 generation;                      /* If this route import is based on other previously exported route,
+ +                                         this value should be 1 + MAX(generation of the parent routes).
+ +                                         Otherwise the route is independent and this value is zero. */
+ +  u32 id;                             /* Table specific route id */
+ +  struct ea_list *attrs;              /* Attributes of this route */
+ +  const net_addr *net;                        /* Network this RTE belongs to */
+ +  struct rte_src *src;                        /* Route source that created the route */
+ +  struct rt_import_hook *sender;      /* Import hook used to send the route to the routing table */
+ +  btime lastmod;                      /* Last modified (set by table) */
+ +} rte;
+ +
+ +#define REF_FILTERED  2               /* Route is rejected by import filter */
+ +#define REF_OBSOLETE  16              /* Route is obsolete, pending propagation */
+ +#define REF_PENDING   32              /* Route has not propagated completely yet */
+ +
+ +/* Route is valid for propagation (may depend on other flags in the future), accepts NULL */
+ +static inline int rte_is_valid(const rte *r) { return r && !(r->flags & REF_FILTERED); }
+ +
+ +/* Route just has REF_FILTERED flag; unlike rte_is_valid(), this dereferences
+ + * r unconditionally and therefore does not accept NULL */
+ +static inline int rte_is_filtered(const rte *r) { return !!(r->flags & REF_FILTERED); }
+ +
+ +/* Strip the route of the table-specific values: the result copies only
+ + * attrs, net and src from r; all remaining fields are zero-initialized
+ + * by the compound literal. */
+ +static inline rte rte_init_from(const rte *r)
+ +{
+ +  return (rte) {
+ +    .attrs = r->attrs,
+ +    .net = r->net,
+ +    .src = r->src,
+ +  };
+ +}
+ +
+ +int rte_same(const rte *, const rte *);
+ +
+ +struct rte_src {
+ +  struct rte_src *next;                       /* Hash chain */
+ +  struct rte_owner *owner;            /* Route source owner */
+ +  u64 private_id;                     /* Private ID, assigned by the protocol */
+ +  u32 global_id;                      /* Globally unique ID of the source */
+ +  struct lfuc uc;                     /* Use count */
+ +};
+ +
+ +struct rte_owner_class {
+ +  void (*get_route_info)(const rte *, byte *buf); /* Get route information (for `show route' command) */
+ +  int (*rte_better)(const rte *, const rte *);
+ +  int (*rte_mergable)(const rte *, const rte *);
+ +  u32 (*rte_igp_metric)(const rte *);
+ +};
+ +
+ +struct rte_owner {
+ +  struct rte_owner_class *class;
+ +  int (*rte_recalculate)(struct rtable_private *, struct network *, struct rte_storage *new, struct rte_storage *, struct rte_storage *);
+ +  HASH(struct rte_src) hash;
+ +  const char *name;
+ +  u32 hash_key;
+ +  u32 uc;
+ +  u32 debug;
+ +  event_list *list;
+ +  event *prune;
+ +  event *stop;
+ +};
+ +
+ +extern DOMAIN(attrs) attrs_domain;
+ +
+ +#define RTA_LOCK       LOCK_DOMAIN(attrs, attrs_domain)
+ +#define RTA_UNLOCK     UNLOCK_DOMAIN(attrs, attrs_domain)
+ +
+ +#define RTE_SRC_PU_SHIFT      44
+ +#define RTE_SRC_IN_PROGRESS   (1ULL << RTE_SRC_PU_SHIFT)
+ +
+ +/* Get a route source. This also locks the source, therefore the caller has to
+ + * unlock the source after the route has been propagated. */
+ +struct rte_src *rt_get_source_o(struct rte_owner *o, u32 id);
+ +#define rt_get_source(p, id)  rt_get_source_o(&(p)->sources, (id))
+ +
+ +struct rte_src *rt_find_source_global(u32 id);
+ +
+ +#ifdef RT_SOURCE_DEBUG
+ +#define rt_lock_source _rt_lock_source_internal
+ +#define rt_unlock_source _rt_unlock_source_internal
+ +#endif
+ +
+ +/* Take a reference on the route source by locking its use count. */
+ +static inline void rt_lock_source(struct rte_src *src)
+ +{
+ +  lfuc_lock(&src->uc);
+ +}
+ +
+ +/* Drop a reference on the route source; the owner's event list and prune
+ + * event are handed over to lfuc_unlock(). */
+ +static inline void rt_unlock_source(struct rte_src *src)
+ +{
+ +  lfuc_unlock(&src->uc, src->owner->list, src->owner->prune);
+ +}
+ +
+ +#ifdef RT_SOURCE_DEBUG
+ +#undef rt_lock_source
+ +#undef rt_unlock_source
+ +
+ +#define rt_lock_source(x) ( log(L_INFO "Lock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_lock_source_internal(x) )
+ +#define rt_unlock_source(x) ( log(L_INFO "Unlock source %uG at %s:%d", (x)->global_id, __FILE__, __LINE__), _rt_unlock_source_internal(x) )
+ +#endif
+ +
+ +void rt_init_sources(struct rte_owner *, const char *name, event_list *list);
+ +void rt_destroy_sources(struct rte_owner *, event *);
+ +
+ +void rt_dump_sources(struct rte_owner *);
+ +
+ +/*
+ + *    Route Attributes
+ + *
+ + *    Beware: All standard BGP attributes must be represented here instead
+ + *    of making them local to the route. This is needed to ensure proper
+ + *    construction of BGP route attribute lists.
+ + */
+ +
+ +/* Nexthop structure */
+ +struct nexthop {
+ +  ip_addr gw;                         /* Next hop */
+ +  struct iface *iface;                        /* Outgoing interface */
+ +  byte flags;
+ +  byte weight;
+ +  byte labels;                                /* Number of all labels */
+ +  u32 label[0];
+ +};
+ +
+ +/* For packing one into eattrs */
+ +struct nexthop_adata {
+ +  struct adata ad;
+ +  /* There is either a set of nexthops or a special destination (RTD_*) */
+ +  union {
+ +    struct nexthop nh;
+ +    uint dest;
+ +  };
+ +};
+ +
+ +/* For MPLS label stack generation */
+ +struct nexthop_adata_mpls {
+ +  struct nexthop_adata nhad;
+ +  u32 label_space[MPLS_MAX_LABEL_STACK];
+ +};
+ +
+ +#define NEXTHOP_DEST_SIZE     (OFFSETOF(struct nexthop_adata, dest) + sizeof(uint) - OFFSETOF(struct adata, data))
+ +#define NEXTHOP_DEST_LITERAL(x)       ((struct nexthop_adata) { \
+ +      .ad.length = NEXTHOP_DEST_SIZE, .dest = (x), })
+ +
+ +#define RNF_ONLINK            0x1     /* Gateway is onlink regardless of IP ranges */
+ +
+ +
+ +#define RTS_STATIC 1                  /* Normal static route */
+ +#define RTS_INHERIT 2                 /* Route inherited from kernel */
+ +#define RTS_DEVICE 3                  /* Device route */
+ +#define RTS_STATIC_DEVICE 4           /* Static device route */
+ +#define RTS_REDIRECT 5                        /* Learned via redirect */
+ +#define RTS_RIP 6                     /* RIP route */
+ +#define RTS_OSPF 7                    /* OSPF route */
+ +#define RTS_OSPF_IA 8                 /* OSPF inter-area route */
+ +#define RTS_OSPF_EXT1 9                       /* OSPF external route type 1 */
+ +#define RTS_OSPF_EXT2 10              /* OSPF external route type 2 */
+ +#define RTS_BGP 11                    /* BGP route */
+ +#define RTS_PIPE 12                   /* Inter-table wormhole */
+ +#define RTS_BABEL 13                  /* Babel route */
+ +#define RTS_RPKI 14                   /* Route Origin Authorization */
+ +#define RTS_PERF 15                   /* Perf checker */
+ +#define RTS_L3VPN 16                  /* MPLS L3VPN */
+ +#define RTS_AGGREGATED 17             /* Aggregated route */
+ +#define RTS_MAX 18
+ +
+ +#define RTD_NONE 0                    /* Undefined next hop */
+ +#define RTD_UNICAST 1                 /* A standard next hop */
+ +#define RTD_BLACKHOLE 2                       /* Silently drop packets */
+ +#define RTD_UNREACHABLE 3             /* Reject as unreachable */
+ +#define RTD_PROHIBIT 4                        /* Administratively prohibited */
+ +#define RTD_MAX 5
+ +
+ +extern const char * rta_dest_names[RTD_MAX];
+ +
+ +/* Name of destination type n from rta_dest_names[], or "???" when n is
+ + * not below RTD_MAX. */
+ +static inline const char *rta_dest_name(uint n)
+ +{ return (n < RTD_MAX) ? rta_dest_names[n] : "???"; }
+ +
+ +
+ +/*
+ + *    Extended Route Attributes
+ + */
+ +
+ +typedef struct eattr {
+ +  word id;                            /* EA_CODE(PROTOCOL_..., protocol-dependent ID) */
+ +  byte flags;                         /* Protocol-dependent flags */
+ +  byte type;                          /* Attribute type */
+ +  byte rfu:5;
+ +  byte originated:1;                  /* The attribute has originated locally */
+ +  byte fresh:1;                               /* An uncached attribute (e.g. modified in export filter) */
+ +  byte undef:1;                               /* Explicitly undefined */
+ +
+ +  PADDING(unused, 3, 3);
+ +
+ +  union bval u;
+ +} eattr;
+ +
+ +
+ +#define EA_CODE_MASK 0xffff
+ +#define EA_ALLOW_UNDEF 0x10000                /* ea_find: allow EAF_TYPE_UNDEF */
+ +#define EA_BIT(n) ((n) << 24)         /* Used in bitfield accessors */
+ +#define EA_BIT_GET(ea) ((ea) >> 24)
+ +
+ +typedef struct ea_list {
+ +  struct ea_list *next;                       /* In case we have an override list */
+ +  byte flags;                         /* Flags: EALF_... */
+ +  byte stored:5;                      /* enum ea_stored */
+ +  byte rfu:3;
+ +  word count;                         /* Number of attributes */
+ +  eattr attrs[0];                     /* Attribute definitions themselves */
+ +} ea_list;
+ +
+ +enum ea_stored {
+ +  EALS_NONE = 0,                      /* This is a temporary ea_list */
+ +  EALS_PREIMPORT = 1,                 /* State when route entered rte_update() */
+ +  EALS_FILTERED = 2,                  /* State after filters */
+ +  EALS_IN_TABLE = 3,                  /* State in table */
+ +  EALS_KEY = 4,                               /* EA list used as key */
+ +  EALS_CUSTOM = 0x10,                 /* OR this with custom values */
+ +  EALS_MAX = 0x20,
+ +};
+ +
+ +struct ea_storage {
+ +  struct ea_storage *next_hash;               /* Next in hash chain */
+ +  _Atomic u64 uc;                     /* Use count */
+ +  u32 hash_key;                               /* List hash */
+ +  PADDING(unused, 0, 4);              /* Sorry, we need u64 for the usecount */
+ +  ea_list l[0];                               /* The list itself */
+ +};
+ +
+ +#define EALF_SORTED 1                 /* Attributes are sorted by code */
+ +#define EALF_BISECT 2                 /* Use interval bisection for searching */
+ +#define EALF_HUGE   8                 /* List is too big to fit into slab */
+ +
+ +struct ea_class {
+ +#define EA_CLASS_INSIDE \
+ +  const char *name;                   /* Name (both print and filter) */ \
+ +  struct symbol *sym;                 /* Symbol to export to configs */ \
+ +  uint id;                            /* Autoassigned attribute ID */ \
+ +  uint uc;                            /* Reference count */ \
+ +  btype type;                         /* Data type ID */ \
+ +  u16 flags;                          /* Protocol-dependent flags */ \
+ +  uint readonly:1;                    /* This attribute can't be changed by filters */ \
+ +  uint conf:1;                                /* Requested by config */ \
+ +  uint hidden:1;                      /* Technical attribute, do not show, do not expose to filters */ \
+ +  void (*format)(const eattr *ea, byte *buf, uint size); \
+ +  void (*stored)(const eattr *ea);    /* When stored into global hash */ \
+ +  void (*freed)(const eattr *ea);     /* When released from global hash */ \
+ +
+ +  EA_CLASS_INSIDE;
+ +};
+ +
+ +struct ea_class_ref {
+ +  resource r;
+ +  struct ea_class *class;
+ +};
+ +
+ +void ea_register_init(struct ea_class *);
+ +struct ea_class_ref *ea_register_alloc(pool *, struct ea_class);
+ +struct ea_class_ref *ea_ref_class(pool *, struct ea_class *); /* Reference for an attribute alias */
+ +
+ +#define EA_REGISTER_ALL_HELPER(x)     ea_register_init(x);
+ +#define EA_REGISTER_ALL(...)          MACRO_FOREACH(EA_REGISTER_ALL_HELPER, __VA_ARGS__)
+ +
+ +struct ea_class *ea_class_find_by_id(uint id);
+ +struct ea_class *ea_class_find_by_name(const char *name);
+ +static inline struct ea_class *ea_class_self(struct ea_class *self) { return self; }
+ +#define ea_class_find(_arg)   _Generic((_arg), \
+ +  uint: ea_class_find_by_id, \
+ +  word: ea_class_find_by_id, \
+ +  char *: ea_class_find_by_name, \
+ +  const char *: ea_class_find_by_name, \
+ +  struct ea_class *: ea_class_self)(_arg)
+ +
+ +/* Iteration state passed to ea_walk() */
+ +struct ea_walk_state {
+ +  ea_list *eattrs;                    /* Current ea_list, initially set by caller */
+ +  eattr *ea;                          /* Current eattr, initially NULL */
+ +  u32 visited[4];                     /* Bitfield, limiting max to 128 */
+ +};
+ +
+ +/*
+ + * Type-generic attribute lookup in list _l, dispatching on the type of _arg:
+ + * numeric IDs (uint, word) go to ea_find_by_id(), class pointers to
+ + * ea_find_by_class() and names to ea_find_by_name(). Const-qualified
+ + * pointers are accepted too, as both targets take const arguments.
+ + */
+ +#define ea_find(_l, _arg)     _Generic((_arg), \
+ +  uint: ea_find_by_id, \
+ +  word: ea_find_by_id, \
+ +  struct ea_class *: ea_find_by_class, \
+ +  const struct ea_class *: ea_find_by_class, \
+ +  char *: ea_find_by_name, \
+ +  const char *: ea_find_by_name)(_l, _arg)
+ +eattr *ea_find_by_id(ea_list *, unsigned ea);
+ +/* Find the attribute of class def in list l by looking up the class ID. */
+ +static inline eattr *ea_find_by_class(ea_list *l, const struct ea_class *def)
+ +{ return ea_find_by_id(l, def->id); }
+ +/* Find an attribute in list l by class name; returns NULL when no class
+ + * of that name is found. */
+ +static inline eattr *ea_find_by_name(ea_list *l, const char *name)
+ +{
+ +  const struct ea_class *def = ea_class_find_by_name(name);
+ +  return def ? ea_find_by_class(l, def) : NULL;
+ +}
+ +
+ +/*
+ + * Typed attribute getters. _ident is anything ea_class_find() accepts.
+ + *
+ + * The helper variables carry long, getter-specific names so that caller
+ + * expressions passed as _l, _ident or _def cannot be captured by them.
+ + * Short names such as "ea" or "cls" would shadow identically named caller
+ + * variables used inside the arguments.
+ + */
+ +
+ +/* Value of an embedded attribute, or _def when the attribute is not found */
+ +#define ea_get_int(_l, _ident, _def)  ({ \
+ +    struct ea_class *_ea_get_int_cls = ea_class_find((_ident)); \
+ +    ASSERT_DIE(_ea_get_int_cls->type & EAF_EMBEDDED); \
+ +    const eattr *_ea_get_int_ea = ea_find((_l), _ea_get_int_cls->id); \
+ +    (_ea_get_int_ea ? _ea_get_int_ea->u.data : (_def)); \
+ +    })
+ +
+ +/* IP address stored in a T_IP attribute, or _def when it is not found */
+ +#define ea_get_ip(_l, _ident, _def)  ({ \
+ +    struct ea_class *_ea_get_ip_cls = ea_class_find((_ident)); \
+ +    ASSERT_DIE(_ea_get_ip_cls->type == T_IP); \
+ +    const eattr *_ea_get_ip_ea = ea_find((_l), _ea_get_ip_cls->id); \
+ +    (_ea_get_ip_ea ? *((const ip_addr *) _ea_get_ip_ea->u.ptr->data) : (_def)); \
+ +    })
+ +
+ +/* adata pointer of a non-embedded attribute, or &null_adata when not found */
+ +#define ea_get_adata(_l, _ident)      ({ \
+ +    struct ea_class *_ea_get_adata_cls = ea_class_find((_ident)); \
+ +    ASSERT_DIE(!(_ea_get_adata_cls->type & EAF_EMBEDDED)); \
+ +    const eattr *_ea_get_adata_ea = ea_find((_l), _ea_get_adata_cls->id); \
+ +    (_ea_get_adata_ea ? _ea_get_adata_ea->u.ptr : &null_adata); \
+ +    })
+ +
+ +eattr *ea_walk(struct ea_walk_state *s, uint id, uint max);
+ +void ea_dump(ea_list *);
+ +int ea_same(ea_list *x, ea_list *y);  /* Test whether two ea_lists are identical */
+ +uint ea_hash(ea_list *e);             /* Calculate attributes hash value */
+ +ea_list *ea_append(ea_list *to, ea_list *what);
+ +void ea_format_bitfield(const struct eattr *a, byte *buf, int bufsize, const char **names, int min, int max);
+ +
+ +/* Normalize ea_list; allocates the result from tmp_linpool */
+ +ea_list *ea_normalize(ea_list *e, u32 upto);
+ +
+ +uint ea_list_size(ea_list *);
+ +void ea_list_copy(ea_list *dest, ea_list *src, uint size);
+ +
+ +#define EA_LOCAL_LIST(N)  struct { ea_list l; eattr a[N]; }
+ +
+ +#define EA_LITERAL_EMBEDDED(_class, _flags, _val) ({ \
+ +    btype _type = (_class)->type; \
+ +    ASSERT_DIE(_type & EAF_EMBEDDED); \
+ +    EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.i = _val); \
+ +    })
+ +
+ +#define EA_LITERAL_STORE_ADATA(_class, _flags, _buf, _len) ({ \
+ +    btype _type = (_class)->type; \
+ +    ASSERT_DIE(!(_type & EAF_EMBEDDED)); \
+ +    EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.ad = tmp_store_adata((_buf), (_len))); \
+ +    })
+ +
+ +#define EA_LITERAL_DIRECT_ADATA(_class, _flags, _adata) ({ \
+ +    btype _type = (_class)->type; \
+ +    ASSERT_DIE(!(_type & EAF_EMBEDDED)); \
+ +    EA_LITERAL_GENERIC((_class)->id, _type, _flags, .u.ad = _adata); \
+ +    })
+ +
+ +#define EA_LITERAL_GENERIC(_id, _type, _flags, ...) \
+ +  ((eattr) { .id = _id, .type = _type, .flags = _flags, __VA_ARGS__ })
+ +
+ +/*
+ + * Prepend attribute a to the list *to.
+ + *
+ + * Allocates a one-element ea_list by tmp_alloc(), links the previous list
+ + * behind it through its next pointer and stores the new head into *to.
+ + * Calls bug() when the attribute ID is zero.
+ + * Returns a pointer to the attribute stored in the new list.
+ + */
+ +static inline eattr *
+ +ea_set_attr(ea_list **to, eattr a)
+ +{
+ +  if (!a.id)
+ +    bug("You have forgotten to register your EA class");
+ +
+ +  EA_LOCAL_LIST(1) *ea = tmp_alloc(sizeof(*ea));
+ +  *ea = (typeof(*ea)) {
+ +    .l.flags = EALF_SORTED,
+ +    .l.count = 1,
+ +    .l.next = *to,
+ +    .a[0] = a,
+ +  };
+ +
+ +  *to = &ea->l;
+ +  return &ea->a[0];
+ +}
+ +
+ +/*
+ + * Prepend an explicitly undefined attribute of class def to the list *to.
+ + * The local flag is copied into both the fresh and the originated bit;
+ + * type and flags are set to zero.
+ + */
+ +static inline void
++ea_unset_attr(ea_list **to, bool local, const struct ea_class *def)
+ +{
+ +  ea_set_attr(to, EA_LITERAL_GENERIC(def->id, 0, 0,
+ +      .fresh = local, .originated = local, .undef = 1));
+ +}
+ +
+ +/* Prepend an embedded attribute of class def with value data to *to;
+ + * EA_LITERAL_EMBEDDED asserts that the class type is embedded.
+ + * Note that despite the name, data is declared as u64. */
+ +static inline void
+ +ea_set_attr_u32(ea_list **to, const struct ea_class *def, uint flags, u64 data)
+ +{ ea_set_attr(to, EA_LITERAL_EMBEDDED(def, flags, data)); }
+ +
+ +/* Prepend a non-embedded attribute of class def to *to; the len bytes at
+ + * data are stored through tmp_store_adata(). EA_LITERAL_STORE_ADATA asserts
+ + * that the class type is not embedded. */
+ +static inline void
+ +ea_set_attr_data(ea_list **to, const struct ea_class *def, uint flags, const void *data, uint len)
+ +{ ea_set_attr(to, EA_LITERAL_STORE_ADATA(def, flags, data, len)); }
+ +
+ +/*
+ + * Copy the attribute of class def from list from into list *to.
+ + *
+ + * When the attribute is not found in from, an explicitly undefined
+ + * attribute is prepended to *to instead. Otherwise the value is copied
+ + * either as an embedded value or as adata, depending on the type of the
+ + * found attribute.
+ + */
+ +static inline void
+ +ea_copy_attr(ea_list **to, ea_list *from, const struct ea_class *def)
+ +{
+ +  eattr *e = ea_find_by_class(from, def);
+ +
+ +  /* Guard clause instead of an unbraced nested if/else */
+ +  if (!e)
+ +  {
+ +    ea_unset_attr(to, 0, def);
+ +    return;
+ +  }
+ +
+ +  if (e->type & EAF_EMBEDDED)
+ +    ea_set_attr_u32(to, def, e->flags, e->u.data);
+ +  else
+ +    ea_set_attr_data(to, def, e->flags, e->u.ptr->data, e->u.ptr->length);
+ +}
+ +
+ +/*
+ + *    Common route attributes
+ + */
+ +
+ +/* Preference: first-order comparison */
+ +extern struct ea_class ea_gen_preference;
+ +static inline u32 rt_get_preference(const rte *rt)
+ +{ return ea_get_int(rt->attrs, &ea_gen_preference, 0); }
+ +
+ +/* IGP metric: second-order comparison */
+ +extern struct ea_class ea_gen_igp_metric;
+ +u32 rt_get_igp_metric(const rte *rt);
+ +#define IGP_METRIC_UNKNOWN 0x80000000 /* Default igp_metric used when no other
+ +                                         protocol-specific metric is available */
+ +
+ +/* From: Advertising router */
+ +extern struct ea_class ea_gen_from;
+ +
+ +
+ +/* MPLS Label, Policy and Class */
+ +extern struct ea_class ea_gen_mpls_label,
+ +       ea_gen_mpls_policy, ea_gen_mpls_class;
+ +
+ +
+ +/* Source: An old method to devise the route source protocol and kind.
+ + * To be superseded in a near future by something more informative. */
+ +extern struct ea_class ea_gen_source;
+ +static inline u32 rt_get_source_attr(const rte *rt)
+ +{ return ea_get_int(rt->attrs, &ea_gen_source, 0); }
+ +
+ +/* Flowspec validation result */
+ +enum flowspec_valid {
+ +  FLOWSPEC_UNKNOWN    = 0,
+ +  FLOWSPEC_VALID      = 1,
+ +  FLOWSPEC_INVALID    = 2,
+ +  FLOWSPEC__MAX,
+ +};
+ +
+ +extern const char * flowspec_valid_names[FLOWSPEC__MAX];
+ +static inline const char *flowspec_valid_name(enum flowspec_valid v)
+ +{ return (v < FLOWSPEC__MAX) ? flowspec_valid_names[v] : "???"; }
+ +
+ +extern struct ea_class ea_gen_flowspec_valid;
+ +static inline enum flowspec_valid rt_get_flowspec_valid(const rte *rt)
+ +{ return ea_get_int(rt->attrs, &ea_gen_flowspec_valid, FLOWSPEC_UNKNOWN); }
+ +
+ +/* Next hop: For now, stored as adata */
+ +extern struct ea_class ea_gen_nexthop;
+ +
+ +/* Store a special destination (RTD_*) as the next hop attribute of *to.
+ + * The value is wrapped in a nexthop_adata literal of NEXTHOP_DEST_SIZE
+ + * length and passed on to ea_set_attr_data(). */
+ +static inline void ea_set_dest(struct ea_list **to, uint flags, uint dest)
+ +{
+ +  struct nexthop_adata nhad = NEXTHOP_DEST_LITERAL(dest);
+ +  ea_set_attr_data(to, &ea_gen_nexthop, flags, &nhad.ad.data, nhad.ad.length);
+ +}
+ +
+ +/* Next hop structures */
+ +
+ +#define NEXTHOP_ALIGNMENT     (_Alignof(struct nexthop))
+ +#define NEXTHOP_MAX_SIZE      (sizeof(struct nexthop) + sizeof(u32)*MPLS_MAX_LABEL_STACK)
+ +#define NEXTHOP_SIZE(_nh)     NEXTHOP_SIZE_CNT(((_nh)->labels))
+ +#define NEXTHOP_SIZE_CNT(cnt) BIRD_ALIGN((sizeof(struct nexthop) + sizeof(u32) * (cnt)), NEXTHOP_ALIGNMENT)
+ +#define nexthop_size(nh)      NEXTHOP_SIZE((nh))
+ +
+ +#define NEXTHOP_NEXT(_nh)     ((void *) (_nh) + NEXTHOP_SIZE(_nh))
+ +#define NEXTHOP_END(_nhad)    ((_nhad)->ad.data + (_nhad)->ad.length)
+ +#define NEXTHOP_VALID(_nh, _nhad) ((void *) (_nh) < (void *) NEXTHOP_END(_nhad))
+ +#define NEXTHOP_ONE(_nhad)    (NEXTHOP_NEXT(&(_nhad)->nh) == NEXTHOP_END(_nhad))
+ +
+ +#define NEXTHOP_WALK(_iter, _nhad) for ( \
+ +    struct nexthop *_iter = &(_nhad)->nh; \
+ +    (void *) _iter < (void *) NEXTHOP_END(_nhad); \
+ +    _iter = NEXTHOP_NEXT(_iter))
+ +
+ +
+ +static inline int nexthop_same(struct nexthop_adata *x, struct nexthop_adata *y)
+ +{ return adata_same(&x->ad, &y->ad); }
+ +struct nexthop_adata *nexthop_merge(struct nexthop_adata *x, struct nexthop_adata *y, int max, linpool *lp);
+ +struct nexthop_adata *nexthop_sort(struct nexthop_adata *x, linpool *lp);
+ +int nexthop_is_sorted(struct nexthop_adata *x);
+ +
+ +#define NEXTHOP_IS_REACHABLE(nhad)    ((nhad)->ad.length > NEXTHOP_DEST_SIZE)
+ +
+ +/* Get the next hop data of route r: the nexthop_adata enclosing the adata
+ + * of the ea_gen_nexthop attribute, or NULL when the attribute is not found. */
+ +static inline struct nexthop_adata *
+ +rte_get_nexthops(rte *r)
+ +{
+ +  eattr *nhea = ea_find(r->attrs, &ea_gen_nexthop);
+ +  return nhea ? SKIP_BACK(struct nexthop_adata, ad, nhea->u.ptr) : NULL;
+ +}
+ +
+ +/* Route has regular, reachable nexthop (i.e. not RTD_UNREACHABLE and like);
+ + * a route without any next hop attribute is reported as not reachable */
+ +static inline int rte_is_reachable(rte *r)
+ +{
+ +  struct nexthop_adata *nhad = rte_get_nexthops(r);
+ +  return nhad && NEXTHOP_IS_REACHABLE(nhad);
+ +}
+ +
+ +/*
+ + * Classify the destination stored in next hop attribute nhea.
+ + *
+ + * Returns RTD_NONE when nhea is NULL, RTD_UNICAST when the attribute data
+ + * is longer than a bare destination (NEXTHOP_IS_REACHABLE), and the stored
+ + * special destination value otherwise.
+ + */
+ +static inline int nhea_dest(eattr *nhea)
+ +{
+ +  if (!nhea)
+ +    return RTD_NONE;
+ +
+ +  /* nhea is known to be non-NULL here, so no second check is needed */
+ +  struct nexthop_adata *nhad = (struct nexthop_adata *) nhea->u.ptr;
+ +  if (NEXTHOP_IS_REACHABLE(nhad))
+ +    return RTD_UNICAST;
+ +  else
+ +    return nhad->dest;
+ +}
+ +
+ +/* Destination type of route r, as classified by nhea_dest() from its
+ + * ea_gen_nexthop attribute. */
+ +static inline int rte_dest(const rte *r)
+ +{
+ +  return nhea_dest(ea_find(r->attrs, &ea_gen_nexthop));
+ +}
+ +
+ +void rta_init(void);
+ +
+ +ea_list *ea_lookup_slow(ea_list *r, u32 squash_upto, enum ea_stored oid);
+ +
+ +/* Get the ea_storage that contains list r as its l[0] member.
+ + * Asserts that r is marked as stored (nonzero stored field). */
+ +static inline struct ea_storage *ea_get_storage(ea_list *r)
+ +{
+ +  ASSERT_DIE(r->stored);
+ +  return SKIP_BACK(struct ea_storage, l[0], r);
+ +}
+ +
+ +/* Take one more reference on stored list r and return r.
+ + * Asserts that the use count was already nonzero before the increment. */
+ +static inline ea_list *ea_ref(ea_list *r)
+ +{
+ +  ASSERT_DIE(0 < atomic_fetch_add_explicit(&ea_get_storage(r)->uc, 1, memory_order_acq_rel));
+ +  return r;
+ +}
+ +
+ +/*
+ + * Get a referenced stored list for r with storage kind oid (must be nonzero).
+ + *
+ + * Fast path: when r is already stored as oid, or when the bit for r->stored
+ + * is set in squash_upto, just another reference on r is taken. Everything
+ + * else is delegated to ea_lookup_slow().
+ + */
+ +static inline ea_list *ea_lookup(ea_list *r, u32 squash_upto, enum ea_stored oid)
+ +{
+ +  ASSERT_DIE(oid);
+ +  if ((r->stored == oid) || BIT32_TEST(&squash_upto, r->stored))
+ +    return ea_ref(r);
+ +  else
+ +    return ea_lookup_slow(r, squash_upto, oid);
+ +}
+ +
+ +struct ea_free_deferred {
+ +  struct deferred_call dc;
+ +  ea_list *attrs;
+ +};
+ +
+ +void ea_free_deferred(struct deferred_call *dc);
+ +
+ +/*
+ + * Schedule list r to be freed by a deferred call to ea_free_deferred().
+ + * Returns r unchanged; NULL is passed through without scheduling anything.
+ + */
+ +static inline ea_list *ea_free_later(ea_list *r)
+ +{
+ +  if (!r)
+ +    return NULL;
+ +
+ +  /* NOTE(review): efd lives on the stack; presumably defer_call() copies
+ +   * the given number of bytes before returning -- confirm */
+ +  struct ea_free_deferred efd = {
+ +    .dc.hook = ea_free_deferred,
+ +    .attrs = r,
+ +  };
+ +
+ +  defer_call(&efd.dc, sizeof efd);
+ +  return r;
+ +}
+ +
+ +#define ea_free ea_free_later
+ +
+ +/* Like ea_lookup(), but the obtained reference is immediately scheduled
+ + * for release by ea_free_later(). */
+ +static inline ea_list *ea_lookup_tmp(ea_list *r, u32 squash_upto, enum ea_stored oid)
+ +{
+ +  return ea_free_later(ea_lookup(r, squash_upto, oid));
+ +}
+ +
+ +/* Like ea_ref(), but the obtained reference is immediately scheduled for
+ + * release by ea_free_later(). Asserts that r is marked as stored. */
+ +static inline ea_list *ea_ref_tmp(ea_list *r)
+ +{
+ +  ASSERT_DIE(r->stored);
+ +  return ea_free_later(ea_ref(r));
+ +}
+ +
+ +/*
+ + * Follow the next pointers from r until a list is reached whose stored
+ + * value has its bit set in strip_to (which must be nonzero).
+ + * Returns that list, or NULL when the chain ends without a match.
+ + */
+ +static inline ea_list *ea_strip_to(ea_list *r, u32 strip_to)
+ +{
+ +  ASSERT_DIE(strip_to);
+ +  while (r && !BIT32_TEST(&strip_to, r->stored))
+ +    r = r->next;
+ +
+ +  return r;
+ +}
+ +
+ +void ea_dump(ea_list *);
+ +void ea_dump_all(void);
+ +void ea_show_list(struct cli *, ea_list *);
+ +
+ +#endif
diff --cc lib/socket.h

index 4c80b96a4d4de0022996bb33b1b34e9e41c80059,231c10d86d7c6fb71715921ae2711452941c46b1..302c8e1e96f4f156b470c4a6457499a9f0ac7146
--- 1/lib/socket.h
--- 2/lib/socket.h
+++ b/lib/socket.h
@@@ -86,12 -84,8 +86,12 @@@ typedef struct birdsock 
   sock *sock_new(pool *);                       /* Allocate new socket */
   #define sk_new(X) sock_new(X)         /* Wrapper to avoid name collision with OpenSSL */
   
- -int sk_open(sock *);                  /* Open socket */
+ +int sk_open(sock *, struct birdloop *);               /* Open socket */
+ +void sk_reloop(sock *, struct birdloop *);    /* Move socket to another loop. Both loops must be locked. */
+ +static inline void sk_close(sock *s) { rfree(&s->r); }        /* Explicitly close socket */
+ +
   int sk_rx_ready(sock *s);
- _Bool sk_tx_pending(sock *s);
++bool sk_tx_pending(sock *s);
   int sk_send(sock *, uint len);                /* Send data, <0=err, >0=ok, 0=sleep */
   int sk_send_to(sock *, uint len, ip_addr to, uint port); /* sk_send to given destination */
   void sk_reallocate(sock *);           /* Free and allocate tbuf & rbuf */
diff --cc nest/mpls.c

index a362ef80b49ea3f4405e15f0252b684ab8daf51a,9cdcd572adef497b583341a920117d13016a71e8..b81f82c1f5860a7df98fb6009b31479a0b651efe
--- 1/nest/mpls.c
--- 2/nest/mpls.c
+++ b/nest/mpls.c
@@@ -874,31 -760,10 +874,31 @@@ mpls_fec_map_reconfigure(struct mpls_fe
     /* Remove old unused handles */
   
     if (old_d && !old_d->label_count)
- -    mpls_free_handle(m->domain, old_d);
+ +    mpls_free_handle(old_d);
   
     if (old_s && !old_s->label_count)
- -    mpls_free_handle(m->domain, old_s);
+ +    mpls_free_handle(old_s);
+ +}
+ +
+ +static void
+ +mpls_fec_map_cleanup(void *_m)
+ +{
+ +  struct mpls_fec_map *m = _m;
-   _Bool finished = (m->channel->channel_state == CS_STOP);
++  bool finished = (m->channel->channel_state == CS_STOP);
+ +  HASH_WALK_DELSAFE(m->label_hash, next_l, fec)
+ +    if (lfuc_finished(&fec->uc))
+ +      mpls_free_fec(m, fec);
+ +    else
+ +      finished = 0;
+ +  HASH_WALK_DELSAFE_END;
+ +
+ +  DBGL("FEC Map %p Cleanup: %sfinished", m, finished ? "" : "not ");
+ +
+ +  if (finished)
+ +  {
+ +    ev_postpone(m->cleanup_event);
+ +    channel_del_obstacle(m->channel);
+ +  }
   }
   
   void
@@@ -1435,17 -1290,9 +1435,17 @@@ mpls_show_ranges(struct mpls_show_range
       mpls_show_ranges_dom(cmd, cmd->domain->domain);
     else
     {
- -    struct mpls_domain *m;
- -    WALK_LIST(m, mpls_domains)
+ +    struct mpls_domain_pub *m;
-     _Bool first = 1;
++    bool first = 1;
+ +    WALK_LIST(m, MPLS_GLOBAL->domains)
+ +    {
+ +      if (first)
+ +      first = 0;
+ +      else
+ +      cli_msg(-1026, "");
+ +
         mpls_show_ranges_dom(cmd, m);
+ +    }
     }
   
     cli_msg(0, "");
diff --cc nest/proto.c

index 279ac069858e09a65f4cc43bb4443b3e265aed0a,88f4813ef572de5ca5a92cd360e4ca3dbf062815..81d1e9240e1338a3656a1ad8982aa2ee1bed15f4
--- 1/nest/proto.c
--- 2/nest/proto.c
+++ b/nest/proto.c
@@@ -667,129 -478,67 +667,129 @@@ channel_start_export(struct channel *c
   }
   
   static void
- -channel_reload_loop(void *ptr)
+ +channel_check_stopped(struct channel *c)
   {
- -  struct channel *c = ptr;
+ +  switch (c->channel_state)
+ +  {
+ +    case CS_STOP:
+ +      if (c->obstacles || !EMPTY_LIST(c->roa_subscriptions) || c->in_req.hook)
+ +      return;
   
- -  /* Start reload */
- -  if (!c->reload_active)
- -    c->reload_pending = 0;
+ +      ASSERT_DIE(rt_export_get_state(&c->out_req) == TES_DOWN);
+ +      ASSERT_DIE(!rt_export_feed_active(&c->reimporter));
   
- -  if (!rt_reload_channel(c))
- -  {
- -    ev_schedule_work(c->reload_event);
- -    return;
+ +      channel_set_state(c, CS_DOWN);
+ +      proto_send_event(c->proto, c->proto->event);
+ +
+ +      break;
+ +    case CS_PAUSE:
+ +      if (c->obstacles || !EMPTY_LIST(c->roa_subscriptions))
+ +      return;
+ +
+ +      ASSERT_DIE(rt_export_get_state(&c->out_req) == TES_DOWN);
+ +      ASSERT_DIE(!rt_export_feed_active(&c->reimporter));
+ +
+ +      channel_set_state(c, CS_START);
+ +      break;
     }
   
- -  /* Restart reload */
- -  if (c->reload_pending)
- -    channel_request_reload(c);
+ +  DBG("%s.%s: Channel requests/hooks stopped (in state %s)\n", c->proto->name, c->name, c_states[c->channel_state]);
   }
   
- -static void
- -channel_reset_import(struct channel *c)
+ +void
+ +channel_add_obstacle(struct channel *c)
   {
- -  /* Need to abort feeding */
- -  ev_postpone(c->reload_event);
- -  rt_reload_channel_abort(c);
- -
- -  rt_prune_sync(c->in_table, 1);
+ +  c->obstacles++;
   }
   
- -static void
- -channel_reset_export(struct channel *c)
+ +void
+ +channel_del_obstacle(struct channel *c)
   {
- -  /* Just free the routes */
- -  rt_prune_sync(c->out_table, 1);
+ +  if (!--c->obstacles)
+ +    channel_check_stopped(c);
   }
   
- -/* Called by protocol to activate in_table */
   void
- -channel_setup_in_table(struct channel *c)
+ +channel_import_stopped(struct rt_import_request *req)
   {
- -  struct rtable_config *cf = mb_allocz(c->proto->pool, sizeof(struct rtable_config));
+ +  SKIP_BACK_DECLARE(struct channel, c, in_req, req);
   
- -  cf->name = "import";
- -  cf->addr_type = c->net_type;
- -  cf->internal = 1;
+ +  mb_free(c->in_req.name);
+ +  c->in_req.name = NULL;
   
- -  c->in_table = cf->table = rt_setup(c->proto->pool, cf);
+ +  bmap_free(&c->imported_map);
   
- -  c->reload_event = ev_new_init(c->proto->pool, channel_reload_loop, c);
+ +  channel_check_stopped(c);
   }
   
- -/* Called by protocol to activate out_table */
- -void
- -channel_setup_out_table(struct channel *c)
+ +static u32
+ +channel_reimport_next_feed_index(struct rt_export_feeder *f, u32 try_this)
   {
- -  struct rtable_config *cf = mb_allocz(c->proto->pool, sizeof(struct rtable_config));
- -  cf->name = "export";
- -  cf->addr_type = c->net_type;
- -  cf->internal = 1;
+ +  SKIP_BACK_DECLARE(struct channel, c, reimporter, f);
+ +  while (!bmap_test(&c->imported_map, try_this))
+ +    if (!(try_this & (try_this - 1))) /* return every power of two to check for maximum */
+ +      return try_this;
+ +    else
+ +      try_this++;
+ +
+ +  return try_this;
+ +}
+ +
+ +static void
+ +channel_do_reload(void *_c)
+ +{
+ +  struct channel *c = _c;
+ +
+ +  RT_FEED_WALK(&c->reimporter, f)
+ +  {
-     _Bool seen = 0;
++    bool seen = 0;
+ +    for (uint i = 0; i < f->count_routes; i++)
+ +    {
+ +      rte *r = &f->block[i];
+ +
+ +      if (r->flags & REF_OBSOLETE)
+ +      break;
+ +
+ +      if (r->sender == c->in_req.hook)
+ +      {
+ +      /* Strip the table-specific information */
+ +      rte new = rte_init_from(r);
+ +
+ +      /* Strip the later attribute layers */
+ +      new.attrs = ea_strip_to(new.attrs, BIT32_ALL(EALS_PREIMPORT));
+ +
+ +      /* And reload the route */
+ +      rte_update(c, r->net, &new, new.src);
   
- -  c->out_table = rt_setup(c->proto->pool, cf);
+ +      seen = 1;
+ +      }
+ +    }
+ +
+ +    if (!seen)
+ +      bmap_clear(&c->imported_map, f->ni->index);
+ +
+ +    /* Local data needed no more */
+ +    tmp_flush();
+ +
+ +    MAYBE_DEFER_TASK(proto_work_list(c->proto), &c->reimport_event,
+ +      "%s.%s reimport", c->proto->name, c->name);
+ +  }
+ +}
+ +
+ +/* Called by protocol to activate in_table */
+ +static void
+ +channel_setup_in_table(struct channel *c)
+ +{
+ +  c->reimporter = (struct rt_export_feeder) {
+ +    .name = mb_sprintf(c->proto->pool, "%s.%s.reimport", c->proto->name, c->name),
+ +    .trace_routes = c->debug,
+ +    .next_feed_index = channel_reimport_next_feed_index,
+ +  };
+ +  c->reimport_event = (event) {
+ +    .hook = channel_do_reload,
+ +    .data = c,
+ +  };
+ +  rt_feeder_subscribe(&c->table->export_all, &c->reimporter);
   }
   
   
diff --cc nest/route.h

index c640a8a210e6b2813d78d4e2539f8111bcf7029f,659783a8b5942c0d7b282065be64809d750487ba..3b56ccb6b4ed8feda4d751d851ccda730c6c609e
--- 1/nest/route.h
--- 2/nest/route.h
+++ b/nest/route.h
@@@ -46,300 -27,108 +46,300 @@@ struct f_trie_walk_state
   struct cli;
   
   /*
- - *    Generic data structure for storing network prefixes. Also used
- - *    for the master routing table. Currently implemented as a hash
- - *    table.
+ + *    Master Routing Tables. Generally speaking, each of them contains a FIB
+ + *    with each entry pointing to a list of route entries representing routes
+ + *    to given network (with the selected one at the head).
+ + *
+ + *    Each of the RTEs contains variable data (the preference and protocol-dependent
+ + *    metrics) and a pointer to a route attribute block common to many routes.
    *
- - *    Available operations:
- - *            - insertion of new entry
- - *            - deletion of entry
- - *            - searching for entry by network prefix
- - *            - asynchronous retrieval of fib contents
+ + *    It's guaranteed that there is at most one RTE for every (prefix,proto) pair.
    */
   
- -struct fib_node {
- -  struct fib_node *next;              /* Next in hash chain */
- -  struct fib_iterator *readers;               /* List of readers of this node */
- -  net_addr addr[0];
+ +struct rtable_config {
+ +  node n;
+ +  char *name;
+ +  union rtable *table;
+ +  struct proto_config *krt_attached;  /* Kernel syncer attached to this table */
+ +  uint addr_type;                     /* Type of address data stored in table (NET_*) */
+ +  uint gc_threshold;                  /* Maximum number of operations before GC is run */
+ +  uint gc_period;                     /* Approximate time between two consecutive GC runs */
+ +  u32 debug;                          /* Debugging flags (D_*) */
+ +  byte sorted;                                /* Routes of network are sorted according to rte_better() */
+ +  byte trie_used;                     /* Rtable has attached trie */
+ +  struct rt_cork_threshold {
+ +    u64 low, high;
+ +  } cork_threshold;                   /* Cork threshold values */
+ +  struct settle_config export_settle; /* Export announcement settler */
+ +  struct settle_config export_rr_settle;/* Export announcement settler config valid when any
+ +                                         route refresh is running */
+ +  struct settle_config digest_settle; /* Settle times for digests */
+ +  struct rtable_config *roa_aux_table;        /* Auxiliary table config for ROA connections */
+ +  struct rt_stream_config {
+ +    struct rtable_config *src;
+ +    void (*setup)(union rtable *);
+ +    void (*stop)(union rtable *);
+ +  } master;                           /* Data source (this table is aux) */
   };
   
- -struct fib_iterator {                 /* See lib/slists.h for an explanation */
- -  struct fib_iterator *prev, *next;   /* Must be synced with struct fib_node! */
- -  byte efef;                          /* 0xff to distinguish between iterator and node */
- -  byte pad[3];
- -  struct fib_node *node;              /* Or NULL if freshly merged */
- -  uint hash;
+ +/*
+ + *    Route export journal
+ + *
+ + *    The journal itself is held in struct rt_exporter.
+ + *    Workflow:
+ + *      (1) Initialize by rt_exporter_init()
+ + *      (2) Push data by rt_exporter_push() (the export item is copied)
+ + *      (3) Shutdown by rt_exporter_shutdown(), event is called after cleanup
+ + *
+ + *    Subscribers:
+ + *      (1) Initialize by rt_export_subscribe()
+ + *      (2a) Get data by rt_export_get();
+ + *      (2b) Release data after processing by rt_export_release()
+ + *      (3) Request refeed by rt_export_refeed()
+ + *      (4) Unsubscribe by rt_export_unsubscribe()
+ + */
+ +
+ +struct rt_export_request {
+ +  /* Formal name */
+ +  char *name;
+ +
+ +  /* Memory */
+ +  pool *pool;
+ +
+ +  /* State information */
+ +  enum rt_export_state {
+ +#define RT_EXPORT_STATES \
+ +    DOWN, \
+ +    FEEDING, \
+ +    PARTIAL, \
+ +    READY, \
+ +    STOP, \
+ +
+ +#define RT_EXPORT_STATES_ENUM_HELPER(p) TES_##p,
+ +    MACRO_FOREACH(RT_EXPORT_STATES_ENUM_HELPER, RT_EXPORT_STATES)
+ +    TES_MAX
+ +#undef RT_EXPORT_STATES_ENUM_HELPER
+ +  } _Atomic export_state;
+ +  btime last_state_change;
+ +
+ +  /* Table feeding contraption */
+ +  struct rt_export_feeder {
+ +    /* Formal name */
+ +    char *name;
+ +
+ +    /* Enlisting */
+ +    struct rt_exporter * _Atomic exporter;
+ +    DOMAIN(rtable) domain;                    /* Lock this instead of RCU */
+ +
+ +    /* Prefiltering, useful for more scenarios */
+ +    struct rt_prefilter {
+ +      /* Network prefilter mode (TE_ADDR_*) */
+ +      enum {
+ +      TE_ADDR_NONE = 0,       /* No address matching */
+ +      TE_ADDR_EQUAL,          /* Exact query - show route <addr> */
+ +      TE_ADDR_FOR,            /* Longest prefix match - show route for <addr> */
+ +      TE_ADDR_IN,             /* Interval query - show route in <addr> */
+ +      TE_ADDR_TRIE,           /* Query defined by trie */
+ +      TE_ADDR_HOOK,           /* Query processed by supplied custom hook */
+ +      } mode;
+ +
+ +      union {
+ +      const struct f_trie *trie;
+ +      const net_addr *addr;
+ +      int (*hook)(const struct rt_prefilter *, const net_addr *);
+ +      };
+ +    } prefilter;
+ +
+ +#define TLIST_PREFIX  rt_export_feeder
+ +#define TLIST_TYPE    struct rt_export_feeder
+ +#define TLIST_ITEM    n
+ +#define TLIST_WANT_WALK
+ +#define TLIST_WANT_ADD_TAIL
+ +
+ +    /* Feeding itself */
+ +    u32 feed_index;                           /* Index of the feed in progress */
+ +    u32 (*next_feed_index)(struct rt_export_feeder *, u32 try_this);
+ +    struct rt_feeding_request {
+ +      struct rt_feeding_request *next;                /* Next in request chain */
+ +      void (*done)(struct rt_feeding_request *);/* Called when this refeed finishes */
+ +      struct rt_prefilter prefilter;          /* Reload only matching nets */
+ +      PACKED enum {
+ +      RFRS_INACTIVE = 0,      /* Inactive request */
+ +      RFRS_PENDING,           /* Request enqueued, do not touch */
+ +      RFRS_RUNNING,           /* Request active, do not touch */
+ +      } state;
+ +    } *feeding, *feed_pending;
+ +    TLIST_DEFAULT_NODE;
+ +    u8 trace_routes;
+ +  } feeder;
+ +
+ +  /* Regular updates */
+ +  struct bmap seq_map;                /* Which lfjour items are already processed */
+ +  struct bmap feed_map;               /* Which nets were already fed (for initial feeding) */
+ +  struct lfjour_recipient r;
+ +  struct rt_export_union *cur;
+ +
+ +  /* Statistics */
+ +  struct rt_export_stats {
+ +    u32 updates_received;     /* Number of route updates received */
+ +    u32 withdraws_received;   /* Number of route withdraws received */
+ +  } stats;
+ +
+ +  /* Tracing */
+ +  u8 trace_routes;
+ +  void (*dump)(struct rt_export_request *req);
+ +  void (*fed)(struct rt_export_request *req);
   };
   
- -typedef void (*fib_init_fn)(struct fib *, void *);
- -
- -struct fib {
- -  pool *fib_pool;                     /* Pool holding all our data */
- -  slab *fib_slab;                     /* Slab holding all fib nodes */
- -  struct fib_node **hash_table;               /* Node hash table */
- -  uint hash_size;                     /* Number of hash table entries (a power of two) */
- -  uint hash_order;                    /* Binary logarithm of hash_size */
- -  uint hash_shift;                    /* 32 - hash_order */
- -  uint addr_type;                     /* Type of address data stored in fib (NET_*) */
- -  uint node_size;                     /* FIB node size, 0 for nonuniform */
- -  uint node_offset;                   /* Offset of fib_node struct inside of user data */
- -  uint entries;                               /* Number of entries */
- -  uint entries_min, entries_max;      /* Entry count limits (else start rehashing) */
- -  fib_init_fn init;                   /* Constructor */
+ +#include "lib/tlists.h"
+ +
+ +struct rt_export_union {
+ +  enum rt_export_kind {
+ +    RT_EXPORT_STOP = 1,
+ +    RT_EXPORT_FEED,
+ +    RT_EXPORT_UPDATE,
+ +  } kind;
+ +  const struct rt_export_item {
+ +    LFJOUR_ITEM_INHERIT(li);          /* Member of lockfree journal */
+ +    char data[0];                     /* Memcpy helper */
+ +    const rte *new, *old;             /* Route update */
+ +  } *update;
+ +  const struct rt_export_feed {
+ +    uint count_routes, count_exports;
+ +    struct netindex *ni;
+ +    rte *block;
+ +    u64 *exports;
+ +    char data[0];
+ +  } *feed;
+ +  struct rt_export_request *req;
   };
   
- -static inline void * fib_node_to_user(struct fib *f, struct fib_node *e)
- -{ return e ? (void *) ((char *) e - f->node_offset) : NULL; }
+ +struct rt_exporter {
+ +  struct lfjour journal;                      /* Journal for update keeping */
+ +  TLIST_LIST(rt_export_feeder) feeders;               /* List of active feeder structures */
-   _Bool _Atomic feeders_lock;                 /* Spinlock for the above list */
++  bool _Atomic feeders_lock;                  /* Spinlock for the above list */
+ +  u8 trace_routes;                            /* Debugging flags (D_*) */
+ +  u8 net_type;                                        /* Which net this exporter provides */
+ +  DOMAIN(rtable) domain;                      /* Lock this instead of RCU */
+ +  u32 _Atomic max_feed_index;                 /* Stop feeding at this index */
+ +  const char *name;                           /* Name for logging */
+ +  netindex_hash *netindex;                    /* Table for net <-> id conversion */
+ +  void (*stopped)(struct rt_exporter *);      /* Callback when exporter can stop */
+ +  void (*cleanup_done)(struct rt_exporter *, u64 end);        /* Callback when cleanup has been done */
-   struct rt_export_feed *(*feed_net)(struct rt_exporter *, struct rcu_unwinder *, u32, _Bool (*)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *, const struct rt_export_item *first);
++  struct rt_export_feed *(*feed_net)(struct rt_exporter *, struct rcu_unwinder *, u32, bool (*)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *, const struct rt_export_item *first);
+ +  void (*feed_cleanup)(struct rt_exporter *, struct rt_export_feeder *);
+ +};
   
- -static inline struct fib_node * fib_user_to_node(struct fib *f, void *e)
- -{ return e ? (void *) ((char *) e + f->node_offset) : NULL; }
+ +extern struct rt_export_feed rt_feed_index_out_of_range;
   
- -void fib_init(struct fib *f, pool *p, uint addr_type, uint node_size, uint node_offset, uint hash_order, fib_init_fn init);
- -void *fib_find(struct fib *, const net_addr *);       /* Find or return NULL if doesn't exist */
- -void *fib_get_chain(struct fib *f, const net_addr *a); /* Find first node in linked list from hash table */
- -void *fib_get(struct fib *, const net_addr *);        /* Find or create new if nonexistent */
- -void *fib_route(struct fib *, const net_addr *); /* Longest-match routing lookup */
- -void fib_delete(struct fib *, void *);        /* Remove fib entry */
- -void fib_free(struct fib *);          /* Destroy the fib */
- -void fib_check(struct fib *);         /* Consistency check for debugging */
+ +/* Exporter API */
+ +void rt_exporter_init(struct rt_exporter *, struct settle_config *);
+ +struct rt_export_item *rt_exporter_push(struct rt_exporter *, const struct rt_export_item *);
+ +struct rt_export_feed *rt_alloc_feed(uint routes, uint exports);
+ +void rt_exporter_shutdown(struct rt_exporter *, void (*stopped)(struct rt_exporter *));
   
- -void fit_init(struct fib_iterator *, struct fib *); /* Internal functions, don't call */
- -struct fib_node *fit_get(struct fib *, struct fib_iterator *);
- -void fit_put(struct fib_iterator *, struct fib_node *);
- -void fit_put_next(struct fib *f, struct fib_iterator *i, struct fib_node *n, uint hpos);
- -void fit_put_end(struct fib_iterator *i);
- -void fit_copy(struct fib *f, struct fib_iterator *dst, struct fib_iterator *src);
+ +/* Standalone feeds */
+ +void rt_feeder_subscribe(struct rt_exporter *, struct rt_export_feeder *);
+ +void rt_feeder_unsubscribe(struct rt_export_feeder *);
+ +void rt_export_refeed_feeder(struct rt_export_feeder *, struct rt_feeding_request *);
   
+ +struct rt_export_feed *rt_export_next_feed(struct rt_export_feeder *);
+ +#define RT_FEED_WALK(_feeder, _f)     \
+ +  for (const struct rt_export_feed *_f; _f = rt_export_next_feed(_feeder); ) \
   
- static inline _Bool rt_export_feed_active(struct rt_export_feeder *f)
- -#define FIB_WALK(fib, type, z) do {                           \
- -      struct fib_node *fn_, **ff_ = (fib)->hash_table;        \
- -      uint count_ = (fib)->hash_size;                         \
- -      type *z;                                                \
- -      while (count_--)                                        \
- -        for (fn_ = *ff_++; z = fib_node_to_user(fib, fn_); fn_=fn_->next)
++static inline bool rt_export_feed_active(struct rt_export_feeder *f)
+ +{ return !!atomic_load_explicit(&f->exporter, memory_order_acquire); }
   
- -#define FIB_WALK_END } while (0)
+ +/* Full blown exports */
+ +void rtex_export_subscribe(struct rt_exporter *, struct rt_export_request *);
+ +void rtex_export_unsubscribe(struct rt_export_request *);
   
- -#define FIB_ITERATE_INIT(it, fib) fit_init(it, fib)
+ +const struct rt_export_union * rt_export_get(struct rt_export_request *);
+ +void rt_export_release(const struct rt_export_union *);
+ +void rt_export_retry_later(const struct rt_export_union *);
+ +void rt_export_processed(struct rt_export_request *, u64);
+ +void rt_export_refeed_request(struct rt_export_request *rer, struct rt_feeding_request *rfr);
   
- -#define FIB_ITERATE_START(fib, it, type, z) do {              \
- -      struct fib_node *fn_ = fit_get(fib, it);                \
- -      uint count_ = (fib)->hash_size;                         \
- -      uint hpos_ = (it)->hash;                                \
- -      type *z;                                                \
- -      for(;;) {                                               \
- -        if (!fn_)                                             \
- -          {                                                   \
- -             if (++hpos_ >= count_)                           \
- -               break;                                         \
- -             fn_ = (fib)->hash_table[hpos_];                  \
- -             continue;                                        \
- -          }                                                   \
- -        z = fib_node_to_user(fib, fn_);
+ +static inline enum rt_export_state rt_export_get_state(struct rt_export_request *r)
+ +{ return atomic_load_explicit(&r->export_state, memory_order_acquire); }
+ +const char *rt_export_state_name(enum rt_export_state state);
   
- -#define FIB_ITERATE_END fn_ = fn_->next; } } while(0)
+ +static inline void rt_export_walk_cleanup(const struct rt_export_union **up)
+ +{
+ +  if (*up)
+ +    rt_export_release(*up);
+ +}
   
- -#define FIB_ITERATE_PUT(it) fit_put(it, fn_)
+ +#define RT_EXPORT_WALK(_reader, _u)   \
+ +  for (CLEANUP(rt_export_walk_cleanup) const struct rt_export_union *_u;\
+ +      _u = rt_export_get(_reader);                                    \
+ +      rt_export_release(_u))                                          \
+ +
+ +/* Convenience common call to request refeed */
+ +#define rt_export_refeed(h, r)        _Generic((h), \
+ +    struct rt_export_feeder *: rt_export_refeed_feeder, \
+ +    struct rt_export_request *: rt_export_refeed_request, \
+ +    void *: bug)(h, r)
+ +
+ +/* Subscription to regular table exports needs locking */
+ +#define rt_export_subscribe(_t, _kind, f) do { \
+ +  RT_LOCKED(_t, tp) { \
+ +    rt_lock_table(tp); \
+ +    rtex_export_subscribe(&tp->export_##_kind, f); \
+ +  }} while (0) \
+ +
+ +#define rt_export_unsubscribe(_kind, _fx) do { \
+ +  struct rt_export_request *_f = _fx; \
+ +  struct rt_exporter *e = atomic_load_explicit(&_f->feeder.exporter, memory_order_acquire); \
+ +  RT_LOCKED(SKIP_BACK(rtable, export_##_kind, e), _tp) { \
+ +    rtex_export_unsubscribe(_f); \
+ +    rt_unlock_table(_tp); \
+ +  }} while (0) \
+ +
+ +static inline int rt_prefilter_net(const struct rt_prefilter *p, const net_addr *n)
+ +{
+ +  switch (p->mode)
+ +  {
+ +    case TE_ADDR_NONE:        return 1;
+ +    case TE_ADDR_IN:  return net_in_netX(n, p->addr);
+ +    case TE_ADDR_EQUAL:       return net_equal(n, p->addr);
+ +    case TE_ADDR_FOR: return net_in_netX(p->addr, n);
+ +    case TE_ADDR_TRIE:        return trie_match_net(p->trie, n);
+ +    case TE_ADDR_HOOK:        return p->hook(p, n);
+ +  }
+ +
+ +  bug("Crazy prefilter application attempt failed wildly.");
+ +}
+ +
- static inline _Bool
++static inline bool
+ +rt_net_is_feeding_feeder(struct rt_export_feeder *ref, const net_addr *n)
+ +{
+ +  if (!rt_prefilter_net(&ref->prefilter, n))
+ +    return 0;
+ +
+ +  if (!ref->feeding)
+ +    return 1;
   
- -#define FIB_ITERATE_PUT_NEXT(it, fib) fit_put_next(fib, it, fn_, hpos_)
+ +  for (struct rt_feeding_request *rfr = ref->feeding; rfr; rfr = rfr->next)
+ +    if (rt_prefilter_net(&rfr->prefilter, n))
+ +      return 1;
   
- -#define FIB_ITERATE_PUT_END(it) fit_put_end(it)
+ +  return 0;
+ +}
   
- static inline _Bool
- -#define FIB_ITERATE_UNLINK(it, fib) fit_get(fib, it)
++static inline bool
+ +rt_net_is_feeding_request(struct rt_export_request *req, const net_addr *n)
+ +{
+ +  struct netindex *ni = NET_TO_INDEX(n);
+ +  return
+ +    !bmap_test(&req->feed_map, ni->index)
+ +    && rt_net_is_feeding_feeder(&req->feeder, n);
+ +}
   
- -#define FIB_ITERATE_COPY(dst, src, fib) fit_copy(fib, dst, src)
+ +#define rt_net_is_feeding(h, n)       _Generic((h), \
+ +    struct rt_export_feeder *: rt_net_is_feeding_feeder, \
+ +    struct rt_export_request *: rt_net_is_feeding_request, \
+ +    void *: bug)(h, n)
   
   
   /*
@@@ -400,266 -181,105 +400,266 @@@ struct rtable_private 
                                          * delete as soon as use_count becomes 0 and remove
                                          * obstacle from this routing table.
                                          */
- -  struct event *rt_event;             /* Routing table event */
+ +  struct rt_export_request best_req;  /* Internal request from best route announcement cleanup */
+ +  struct rt_uncork_callback nhu_uncork;       /* Helper event to schedule NHU on uncork */
+ +  struct rt_uncork_callback hcu_uncork;       /* Helper event to schedule HCU on uncork */
     struct timer *prune_timer;          /* Timer for periodic pruning / GC */
+ +  struct event *prune_event;          /* Event for prune execution */
     btime last_rt_change;                       /* Last time when route changed */
- -  btime base_settle_time;             /* Start time of rtable settling interval */
     btime gc_time;                      /* Time of last GC */
     uint gc_counter;                    /* Number of operations since last GC */
+ +  uint rr_counter;                    /* Number of currently running route refreshes,
+ +                                         in fact sum of (stale_set - stale_pruned) over all importers
+ +                                         + one for each TIS_FLUSHING importer */
+ +  uint wait_counter;                  /* Number of imports in TIS_WAITING state */
     byte prune_state;                   /* Table prune state, 1 -> scheduled, 2-> running */
     byte prune_trie;                    /* Prune prefix trie during next table prune */
- -  byte hcu_scheduled;                 /* Hostcache update is scheduled */
+ +  byte imports_flushing;              /* Some imports are being flushed right now */
     byte nhu_state;                     /* Next Hop Update state */
- -  struct fib_iterator prune_fit;      /* Rtable prune FIB iterator */
- -  struct fib_iterator nhu_fit;                /* Next Hop Update FIB iterator */
+ +  byte nhu_corked;                    /* Next Hop Update is corked with this state */
+ +  byte export_used;                   /* Pending Export pruning is scheduled */
+ +  byte cork_active;                   /* Cork has been activated */
+ +  struct rt_cork_threshold cork_threshold;    /* Threshold for table cork */
+ +  u32 prune_index;                    /* Rtable prune FIB iterator */
+ +  u32 nhu_index;                      /* Next Hop Update FIB iterator */
+ +  event *nhu_event;                   /* Nexthop updater */
     struct f_trie *trie_new;            /* New prefix trie defined during pruning */
- -  struct f_trie *trie_old;            /* Old prefix trie waiting to be freed */
+ +  const struct f_trie *trie_old;      /* Old prefix trie waiting to be freed */
     u32 trie_lock_count;                        /* Prefix trie locked by walks */
     u32 trie_old_lock_count;            /* Old prefix trie locked by walks */
+ +  struct tbf rl_pipe;                 /* Rate limiting token buffer for pipe collisions */
   
- -  list subscribers;                   /* Subscribers for notifications */
- -  struct timer *settle_timer;         /* Settle time for notifications */
- -  list flowspec_links;                        /* List of flowspec links, src for NET_IPx and dst for NET_FLOWx */
     struct f_trie *flowspec_trie;               /* Trie for evaluation of flowspec notifications */
     // struct mpls_domain *mpls_domain; /* Label allocator for MPLS */
- -} rtable;
+ +  u32 rte_free_deferred;              /* Counter of deferred rte_free calls */
   
- -struct rt_subscription {
- -  node n;
- -  rtable *tab;
- -  void (*hook)(struct rt_subscription *b);
- -  void *data;
+ +  struct rt_digestor *export_digest;  /* Route export journal for digest tries */
+ +  struct rt_stream *master;           /* Data source (this table is aux) */
   };
   
- -struct rt_flowspec_link {
- -  node n;
- -  rtable *src;
- -  rtable *dst;
- -  u32 uc;
+ +/* The final union private-public rtable structure */
+ +typedef union rtable {
+ +  struct {
+ +    RTABLE_PUBLIC;
+ +  };
+ +  struct rtable_private priv;
+ +} rtable;
+ +
+ +/* Define the lock cleanup function */
+ +LOBJ_UNLOCK_CLEANUP(rtable, rtable);
+ +
+ +#define RT_IS_LOCKED(tab)     LOBJ_IS_LOCKED((tab), rtable)
+ +#define RT_LOCKED(tab, tp)    LOBJ_LOCKED((tab), tp, rtable, rtable)
+ +#define RT_LOCK(tab, tp)      LOBJ_LOCK((tab), tp, rtable, rtable)
+ +
+ +#define RT_LOCK_SIMPLE(tab)   LOBJ_LOCK_SIMPLE((tab), rtable)
+ +#define RT_UNLOCK_SIMPLE(tab) LOBJ_UNLOCK_SIMPLE((tab), rtable)
+ +
+ +#define RT_UNLOCKED_TEMPORARILY(tab, tp)      LOBJ_UNLOCKED_TEMPORARILY((tab), tp, rtable, rtable)
+ +
+ +#define RT_PUB(tab)   SKIP_BACK(rtable, priv, tab)
+ +
+ +#define RT_UNCORKING  (1ULL << 44)
+ +
+ +extern struct rt_cork {
+ +  _Atomic u64 active;
+ +  DOMAIN(resource) dom;
+ +  event_list queue;
+ +} rt_cork;
+ +
+ +static inline void rt_cork_acquire(void)
+ +{
+ +  atomic_fetch_add_explicit(&rt_cork.active, 1, memory_order_acq_rel);
+ +}
+ +
+ +static inline void rt_cork_release(void)
+ +{
+ +  u64 upd = atomic_fetch_add_explicit(&rt_cork.active, RT_UNCORKING, memory_order_acq_rel) + RT_UNCORKING;
+ +
+ +  /* Actually released? */
+ +  if ((upd >> 44) == (upd & (RT_UNCORKING - 1)))
+ +  {
+ +    LOCK_DOMAIN(resource, rt_cork.dom);
+ +    synchronize_rcu();
+ +    ev_run_list(&rt_cork.queue);
+ +    UNLOCK_DOMAIN(resource, rt_cork.dom);
+ +  }
+ +
+ +  atomic_fetch_sub_explicit(&rt_cork.active, RT_UNCORKING + 1, memory_order_acq_rel);
+ +}
+ +
+ +void rt_cork_send_callback(void *_data);
+ +
- static inline _Bool rt_cork_check(struct rt_uncork_callback *rcc)
++static inline bool rt_cork_check(struct rt_uncork_callback *rcc)
+ +{
+ +  /* Wait until all uncorks have finished */
+ +  while (1)
+ +  {
+ +    rcu_read_lock();
+ +
+ +    /* Not corked */
+ +    u64 corked = atomic_load_explicit(&rt_cork.active, memory_order_acquire);
+ +    if (!corked)
+ +    {
+ +      rcu_read_unlock();
+ +      return 0;
+ +    }
+ +
+ +    /* Yes, corked */
+ +    if (corked < RT_UNCORKING)
+ +    {
+ +      if (!rcc->ev.hook)
+ +      {
+ +      rcc->ev.hook = rt_cork_send_callback;
+ +      rcc->ev.data = rcc;
+ +      }
+ +
+ +      ev_send(&rt_cork.queue, &rcc->ev);
+ +      rcu_read_unlock();
+ +      return 1;
+ +    }
+ +
+ +    /* In progress, retry */
+ +    rcu_read_unlock();
+ +    birdloop_yield();
+ +  }
+ +}
+ +
+ +struct rt_pending_export {
+ +  struct rt_export_item it;
+ +  struct rt_pending_export *_Atomic next;     /* Next export for the same net */
+ +  u64 seq_all;                                        /* Interlink from BEST to ALL */
   };
   
- -#define NHU_CLEAN     0
- -#define NHU_SCHEDULED 1
- -#define NHU_RUNNING   2
- -#define NHU_DIRTY     3
+ +struct rt_net_pending_export {
+ +  struct rt_pending_export * _Atomic first, * _Atomic last;
+ +};
   
   typedef struct network {
- -  struct rte *routes;                 /* Available routes for this network */
- -  struct fib_node n;                  /* FIB flags reserved for kernel syncer */
+ +  struct rte_storage * _Atomic routes;                /* Available routes for this network */
+ +
+ +  /* Uncleaned pending exports */
+ +  struct rt_net_pending_export all;
+ +  struct rt_net_pending_export best;
   } net;
   
- -struct hostcache {
- -  slab *slab;                         /* Slab holding all hostentries */
- -  struct hostentry **hash_table;      /* Hash table for hostentries */
- -  unsigned hash_order, hash_shift;
- -  unsigned hash_max, hash_min;
- -  unsigned hash_items;
- -  linpool *lp;                                /* Linpool for trie */
- -  struct f_trie *trie;                        /* Trie of prefixes that might affect hostentries */
- -  list hostentries;                   /* List of all hostentries */
- -  byte update_hostcache;
+ +struct rte_storage {
+ +  struct rte_storage * _Atomic next;          /* Next in chain */
+ +  union {
+ +    struct {
+ +      RTE_IN_TABLE_WRITABLE;
+ +    };
+ +    const struct rte rte;                     /* Route data */
+ +  };
   };
   
- -struct hostentry {
- -  node ln;
- -  ip_addr addr;                               /* IP address of host, part of key */
- -  ip_addr link;                               /* (link-local) IP address of host, used as gw
- -                                         if host is directly attached */
- -  rtable *tab;                                /* Dependent table, part of key */
- -  rtable *owner;                      /* Nexthop owner table */
- -  struct hostentry *next;             /* Next in hash chain */
- -  unsigned hash_key;                  /* Hash key */
- -  unsigned uc;                                /* Use count */
- -  struct rta *src;                    /* Source rta entry */
- -  byte dest;                          /* Chosen route destination type (RTD_...) */
- -  byte nexthop_linkable;              /* Nexthop list is completely non-device */
- -  u32 igp_metric;                     /* Chosen route IGP metric */
+ +#define RTE_COPY(r)           ((r) ? (r)->rte : (rte) {})
+ +#define RTE_COPY_VALID(r)     (((r) && (rte_is_valid((r)))) ? *(r) : (rte) {})
+ +#define RTE_OR_NULL(r)                ((r) ? &((r)->rte) : NULL)
+ +#define RTE_VALID_OR_NULL(r)  (((r) && (rte_is_valid((r)))) ? (r) : NULL)
+ +
+ +#define RTES_WRITE(r)         (((r) != ((struct rte_storage *) 0)) ? ((struct rte *) &(r)->rte) : NULL)
+ +
+ +#define RTE_GET_NETINDEX(e) NET_TO_INDEX((e)->net)
+ +
+ +/* Table import */
+ +
+ +struct rt_import_request {
+ +  struct rt_import_hook *hook;                /* The table part of importer */
+ +  char *name;
+ +  u8 trace_routes;
+ +
+ +  struct birdloop *loop;              /* Where to schedule cleanup event */
+ +
+ +  void (*dump_req)(struct rt_import_request *req);
+ +  void (*log_state_change)(struct rt_import_request *req, u8 state);
+ +  /* Preimport is called when the @new route is just-to-be inserted, replacing @old.
+ +   * Return a route (may be different or modified in-place) to continue or NULL to withdraw. */
+ +  int (*preimport)(struct rt_import_request *req, struct rte *new, const struct rte *old);
+ +};
+ +
+ +struct rt_import_hook {
+ +  node n;
+ +  rtable *table;                      /* The connected table */
+ +  struct rt_import_request *req;      /* The requestor */
+ +
+ +  struct rt_import_stats {
+ +    /* Import - from protocol to core */
+ +    u32 pref;                         /* Number of routes selected as best in the (adjacent) routing table */
+ +    u32 updates_ignored;              /* Number of route updates rejected as already in route table */
+ +    u32 updates_accepted;             /* Number of route updates accepted and imported */
+ +    u32 withdraws_ignored;            /* Number of route withdraws rejected as already not in route table */
+ +    u32 withdraws_accepted;           /* Number of route withdraws accepted and processed */
+ +  } stats;
+ +
+ +  u64 flush_seq;                      /* Table export seq when the channel announced flushing */
+ +  btime last_state_change;            /* Time of last state transition */
+ +
+ +  u8 import_state;                    /* IS_* */
+ +  u8 stale_set;                               /* Set this stale_cycle to imported routes */
+ +  u8 stale_valid;                     /* Routes with this stale_cycle and bigger are considered valid */
+ +  u8 stale_pruned;                    /* Last prune finished when this value was set at stale_valid */
+ +  u8 stale_pruning;                   /* Last prune started when this value was set at stale_valid */
+ +
+ +  void (*stopped)(struct rt_import_request *);        /* Stored callback when import is stopped */
+ +  event cleanup_event;                        /* Used to finally unhook the import from the table */
   };
   
- -typedef struct rte {
- -  struct rte *next;
- -  net *net;                           /* Network this RTE belongs to */
- -  struct rte_src *src;                        /* Route source that created the route */
- -  struct channel *sender;             /* Channel used to send the route to the routing table */
- -  struct rta *attrs;                  /* Attributes of this route */
- -  u32 id;                             /* Table specific route id */
- -  byte flags;                         /* Flags (REF_...) */
- -  byte pflags;                                /* Protocol-specific flags */
- -  btime lastmod;                      /* Last modified */
- -} rte;
   
- -#define REF_COW               1               /* Copy this rte on write */
- -#define REF_FILTERED  2               /* Route is rejected by import filter */
- -#define REF_STALE     4               /* Route is stale in a refresh cycle */
- -#define REF_DISCARD   8               /* Route is scheduled for discard */
- -#define REF_MODIFY    16              /* Route is scheduled for modify */
+ +#define TIS_DOWN      0
+ +#define TIS_UP                1
+ +#define TIS_STOP      2
+ +#define TIS_FLUSHING  3
+ +#define TIS_WAITING   4
+ +#define TIS_CLEARED   5
+ +#define TIS_MAX               6
   
- -/* Route is valid for propagation (may depend on other flags in the future), accepts NULL */
- -static inline int rte_is_valid(rte *r) { return r && !(r->flags & REF_FILTERED); }
   
- -/* Route just has REF_FILTERED flag */
- -static inline int rte_is_filtered(rte *r) { return !!(r->flags & REF_FILTERED); }
+ +void rt_request_import(rtable *tab, struct rt_import_request *req);
+ +void rt_stop_import(struct rt_import_request *, void (*stopped)(struct rt_import_request *));
+ +const char *rt_import_state_name(u8 state);
+ +static inline u8 rt_import_get_state(struct rt_import_hook *ih) { return ih ? ih->import_state : TIS_DOWN; }
+ +
+ +void rte_import(struct rt_import_request *req, const net_addr *net, rte *new, struct rte_src *src);
+ +
+ +/* When rtable is just a view / aggregate, this is the basis for its source */
+ +struct rt_stream {
+ +  struct rt_import_request dst;
+ +  rtable *dst_tab;
+ +};
+ +      
+ +
+ +#if 0
+ +/*
+ + * For table export processing
+ + */
+ +
+ +/* Get next rpe. If src is given, it must match. */
+ +struct rt_pending_export *rpe_next(struct rt_pending_export *rpe, struct rte_src *src);
+ +
+ +/* Walk all rpe's */
+ +#define RPE_WALK(first, it, src) \
+ +  for (struct rt_pending_export *it = (first); it; it = rpe_next(it, (src)))
+ +
+ +/* Mark the pending export processed */
+ +void rpe_mark_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe);
+ +
+ +#define rpe_mark_seen_all(hook, first, last, src) do { \
+ +  RPE_WALK((first), _rpe, (src)) { \
+ +    rpe_mark_seen((hook), _rpe); \
+ +    if (_rpe == last) break; \
+ +  }} while (0)
+ +
+ +/* Get pending export seen status */
+ +int rpe_get_seen(struct rt_export_hook *hook, struct rt_pending_export *rpe);
+ +
+ +#endif
+ +
+ +/*
+ + * Channel export hooks. To be refactored out.
+ + */
+ +
+ +int channel_preimport(struct rt_import_request *req, rte *new, const rte *old);
   
   
   /* Types of route announcement, also used as flags */
diff --cc nest/rt-attr.c

index 6e96bc5219fc9d3cb34658ec620be717a43728ef,c8ef8e081c69c27802c34a160a275515b67cd936..ff0d54fa6adc2bbdb0dcff22784a0924d0922558
--- 1/nest/rt-attr.c
--- 2/nest/rt-attr.c
+++ b/nest/rt-attr.c
@@@ -913,8 -615,6 +913,8 @@@ ea_do_sort(ea_list *e
     while (ss);
   }
   
- static _Bool eattr_same_value(const eattr *a, const eattr *b);
++static bool eattr_same_value(const eattr *a, const eattr *b);
+ +
   /**
    * In place discard duplicates and undefs in sorted ea_list. We use stable sort
    * for this reason.
@@@ -1083,63 -733,9 +1083,63 @@@ ea_merge(ea_list *e, ea_list *t, u32 up
         t->count += e->count;
         d += e->count;
         e = e->next;
+ +
+ +      if (e && BIT32_TEST(&upto, e->stored))
+ +      break;
       }
- static _Bool
+ +
+ +  t->next = e;
+ +}
+ +
+ +ea_list *
+ +ea_normalize(ea_list *e, u32 upto)
+ +{
+ +#if 0
+ +  debug("(normalize)");
+ +  ea_dump(e);
+ +  debug(" ----> ");
+ +#endif
+ +  ea_list *t = tmp_allocz(ea_scan(e, upto));
+ +  ea_merge(e, t, upto);
+ +  ea_sort(t);
+ +#if 0
+ +  ea_dump(t);
+ +  debug("\n");
+ +#endif
+ +
+ +  return t;
+ +}
+ +
++static bool
+ +eattr_same_value(const eattr *a, const eattr *b)
+ +{
+ +  if (
+ +      a->id != b->id ||
+ +      a->flags != b->flags ||
+ +      a->type != b->type ||
+ +      a->undef != b->undef
+ +    )
+ +    return 0;
+ +
+ +  if (a->undef)
+ +    return 1;
+ +
+ +  if (a->type & EAF_EMBEDDED)
+ +    return a->u.data == b->u.data;
+ +  else
+ +    return adata_same(a->u.ptr, b->u.ptr);
   }
   
- static _Bool
++static bool
+ +eattr_same(const eattr *a, const eattr *b)
+ +{
+ +  return
+ +    eattr_same_value(a, b) &&
+ +    a->originated == b->originated &&
+ +    a->fresh == b->fresh;
+ +}
+ +
+ +
   /**
    * ea_same - compare two &ea_list's
    * @x: attribute list
diff --cc nest/rt-export.c

index cc5cd353be80e2d060bc5ba9d8143dd6798efc82,0000000000000000000000000000000000000000..e7cfeab84376945a6f8504dc64874f1ee275a313

mode 100644,000000..100644
--- 1/nest/rt-export.c
--- /dev/null
+++ b/nest/rt-export.c
@@@ -1,576 -1,0 +1,576 @@@
-   _Bool done = 1;
+ +/*
+ + *    BIRD -- Route Export Mechanisms
+ + *
+ + *    (c) 2024       Maria Matejka <mq@jmq.cz>
+ + *
+ + *    Can be freely distributed and used under the terms of the GNU GPL.
+ + */
+ +
+ +#include "nest/bird.h"
+ +#include "nest/route.h"
+ +#include "nest/protocol.h"
+ +
+ +struct rt_export_feed rt_feed_index_out_of_range;
+ +
+ +#define rtex_trace(_req, _cat, msg, args...) do { \
+ +  if ((_req)->trace_routes & _cat) \
+ +    log(L_TRACE "%s: " msg, (_req)->name, ##args); \
+ +} while (0)
+ +
+ +static inline enum rt_export_state
+ +rt_export_change_state(struct rt_export_request *r, u32 expected_mask, enum rt_export_state state)
+ +{
+ +  r->last_state_change = current_time();
+ +  enum rt_export_state old = atomic_exchange_explicit(&r->export_state, state, memory_order_acq_rel);
+ +  if (!((1 << old) & expected_mask))
+ +    bug("Unexpected export state change from %s to %s, expected mask %02x",
+ +      rt_export_state_name(old),
+ +      rt_export_state_name(state),
+ +      expected_mask
+ +      );
+ +
+ +  rtex_trace(r, D_STATES, "Export state changed from %s to %s",
+ +      rt_export_state_name(old), rt_export_state_name(state));
+ +
+ +  return old;
+ +}
+ +
+ +const struct rt_export_union *
+ +rt_export_get(struct rt_export_request *r)
+ +{
+ +  ASSERT_DIE(!r->cur);
+ +
+ +#define EXPORT_FOUND(_kind) do { \
+ +  struct rt_export_union *reu = tmp_alloc(sizeof *reu); \
+ +  *reu = (struct rt_export_union) { \
+ +    .kind = _kind, \
+ +    .req = r, \
+ +    .update = update, \
+ +    .feed = feed, \
+ +  }; \
+ +  return (r->cur = reu); \
+ +} while (0)
+ +
+ +#define NOT_THIS_UPDATE       \
+ +  lfjour_release(&r->r, &update->li); \
+ +  continue;
+ +
+ +  while (1)
+ +  {
+ +    enum rt_export_state es = rt_export_get_state(r);
+ +    switch (es)
+ +    {
+ +      case TES_DOWN:
+ +      rtex_trace(r, (D_ROUTES|D_STATES), "Export is down");
+ +      return NULL;
+ +
+ +      case TES_STOP:
+ +      rtex_trace(r, (D_ROUTES|D_STATES), "Received stop event");
+ +      struct rt_export_union *reu = tmp_alloc(sizeof *reu);
+ +      *reu = (struct rt_export_union) {
+ +        .kind = RT_EXPORT_STOP,
+ +        .req = r,
+ +      };
+ +      return (r->cur = reu);
+ +
+ +      case TES_PARTIAL:
+ +      case TES_FEEDING:
+ +      case TES_READY:
+ +      break;
+ +
+ +      case TES_MAX:
+ +      bug("invalid export state");
+ +    }
+ +
+ +    /* Process sequence number reset event */
+ +    if (lfjour_reset_seqno(&r->r))
+ +      bmap_reset(&r->seq_map, 4);
+ +
+ +    /* Get a new update */
+ +    SKIP_BACK_DECLARE(struct rt_export_item, update, li, lfjour_get(&r->r));
+ +    SKIP_BACK_DECLARE(struct rt_exporter, e, journal, lfjour_of_recipient(&r->r));
+ +    struct rt_export_feed *feed = NULL;
+ +
+ +    /* No update, try feed */
+ +    if (!update)
+ +    {
+ +      if (es == TES_READY)
+ +      {
+ +      /* Fed up of feeding */
+ +      rtex_trace(r, D_ROUTES, "Export drained");
+ +      return NULL;
+ +      }
+ +      else if (feed = rt_export_next_feed(&r->feeder))
+ +      {
+ +      /* Feeding more */
+ +      bmap_set(&r->feed_map, feed->ni->index);
+ +      rtex_trace(r, D_ROUTES, "Feeding %N", feed->ni->addr);
+ +
+ +      EXPORT_FOUND(RT_EXPORT_FEED);
+ +      }
+ +      else if (rt_export_get_state(r) == TES_DOWN)
+ +      {
+ +      /* Torn down in between */
+ +      rtex_trace(r, D_STATES, "Export ended itself");
+ +      return NULL;
+ +      }
+ +      else
+ +      {
+ +      /* No more food */
+ +      rt_export_change_state(r, BIT32_ALL(TES_FEEDING, TES_PARTIAL), TES_READY);
+ +      rtex_trace(r, D_STATES, "Fed up");
+ +      CALL(r->fed, r);
+ +      return NULL;
+ +      }
+ +    }
+ +
+ +    /* There actually is an update */
+ +    if (bmap_test(&r->seq_map, update->seq))
+ +    {
+ +      /* But this update has been already processed, let's try another one */
+ +      rtex_trace(r, D_ROUTES, "Skipping an already processed update %lu", update->seq);
+ +      NOT_THIS_UPDATE;
+ +    }
+ +
+ +    /* Is this update allowed by prefilter? */
+ +    const net_addr *n = (update->new ?: update->old)->net;
+ +    struct netindex *ni = NET_TO_INDEX(n);
+ +
+ +    if (!rt_prefilter_net(&r->feeder.prefilter, n))
+ +    {
+ +      rtex_trace(r, D_ROUTES, "Not exporting %N due to prefilter", n);
+ +      NOT_THIS_UPDATE;
+ +    }
+ +
+ +    if ((es != TES_READY) && rt_net_is_feeding(r, n))
+ +    {
+ +      /* But this net shall get a feed first! */
+ +      rtex_trace(r, D_ROUTES, "Expediting %N feed due to pending update %lu", n, update->seq);
+ +      if (r->feeder.domain.rtable)
+ +      {
+ +      LOCK_DOMAIN(rtable, r->feeder.domain);
+ +      feed = e->feed_net(e, NULL, ni->index, NULL, NULL, update);
+ +      UNLOCK_DOMAIN(rtable, r->feeder.domain);
+ +      }
+ +      else
+ +      {
+ +      RCU_ANCHOR(u);
+ +      feed = e->feed_net(e, u, ni->index, NULL, NULL, update);
+ +      }
+ +
+ +      bmap_set(&r->feed_map, ni->index);
+ +      ASSERT_DIE(feed && (feed != &rt_feed_index_out_of_range));
+ +
+ +      EXPORT_FOUND(RT_EXPORT_FEED);
+ +    }
+ +
+ +    /* OK, now this actually is an update, thank you for your patience */
+ +    rtex_trace(r, D_ROUTES, "Updating %N, seq %lu", n, update->seq);
+ +
+ +    EXPORT_FOUND(RT_EXPORT_UPDATE);
+ +  }
+ +
+ +#undef NOT_THIS_UPDATE
+ +#undef EXPORT_FOUND
+ +}
+ +
+ +void
+ +rt_export_release(const struct rt_export_union *u)
+ +{
+ +  /* May be already released */
+ +  if (!u->req)
+ +    return;
+ +
+ +  struct rt_export_request *r = u->req;
+ +
+ +  /* Must be crosslinked */
+ +  ASSERT_DIE(r->cur == u);
+ +  r->cur = NULL;
+ +
+ +  switch (u->kind)
+ +  {
+ +    case RT_EXPORT_FEED:
+ +      for (uint i = 0; i < u->feed->count_exports; i++)
+ +      bmap_set(&r->seq_map, u->feed->exports[i]);
+ +
+ +      if (!u->update)
+ +      break;
+ +
+ +      /* fall through */
+ +
+ +    case RT_EXPORT_UPDATE:
+ +      rtex_trace(r, D_ROUTES, "Export %lu released", u->update->seq);
+ +      lfjour_release(&r->r, &u->update->li);
+ +
+ +      break;
+ +
+ +    case RT_EXPORT_STOP:
+ +      /* Checking that we have indeed stopped the exporter */
+ +      ASSERT_DIE(rt_export_get_state(r) == TES_DOWN);
+ +      rtex_trace(r, D_ROUTES, "Export stopped");
+ +      break;
+ +
+ +    default:
+ +      bug("strange export kind");
+ +  }
+ +}
+ +
+ +void
+ +rt_export_processed(struct rt_export_request *r, u64 seq)
+ +{
+ +  rtex_trace(r, D_ROUTES, "Marking export %lu as processed", seq);
+ +
+ +  /* Check sequence number reset event */
+ +  if (lfjour_reset_seqno(&r->r))
+ +    bmap_reset(&r->seq_map, 4);
+ +
+ +  ASSERT_DIE(!bmap_test(&r->seq_map, seq));
+ +  bmap_set(&r->seq_map, seq);
+ +}
+ +
+ +struct rt_export_feed *
+ +rt_alloc_feed(uint routes, uint exports)
+ +{
+ +  struct rt_export_feed *feed;
+ +  uint size = sizeof *feed
+ +    + routes * sizeof *feed->block + _Alignof(typeof(*feed->block))
+ +    + exports * sizeof *feed->exports + _Alignof(typeof(*feed->exports));
+ +
+ +  feed = tmp_alloc(size);
+ +
+ +  feed->count_routes = routes;
+ +  feed->count_exports = exports;
+ +  BIRD_SET_ALIGNED_POINTER(feed->block, feed->data);
+ +  BIRD_SET_ALIGNED_POINTER(feed->exports, &feed->block[routes]);
+ +
+ +  /* Consistency check */
+ +  ASSERT_DIE(((void *) &feed->exports[exports]) <= ((void *) feed) + size);
+ +
+ +  return feed;
+ +}
+ +
+ +static struct rt_export_feed *
+ +rt_export_get_next_feed(struct rt_export_feeder *f, struct rcu_unwinder *u)
+ +{
+ +  for (uint retry = 0; retry < (u ? 1024 : ~0U); retry++)
+ +  {
+ +    ASSERT_DIE(u || DOMAIN_IS_LOCKED(rtable, f->domain));
+ +
+ +    struct rt_exporter *e = atomic_load_explicit(&f->exporter, memory_order_acquire);
+ +    if (!e)
+ +    {
+ +      rtex_trace(f, (D_ROUTES|D_STATES), "Exporter kicked us away");
+ +      return NULL;
+ +    }
+ +
+ +    struct rt_export_feed *feed = e->feed_net(e, u, f->feed_index,
+ +      rt_net_is_feeding_feeder, f, NULL);
+ +    if (feed == &rt_feed_index_out_of_range)
+ +    {
+ +      rtex_trace(f, D_ROUTES, "Nothing more to feed", f->feed_index);
+ +      f->feed_index = ~0;
+ +      return NULL;
+ +    }
+ +
+ +#define NEXT_INDEX(f) f->feed_index = f->next_feed_index ? f->next_feed_index(f, f->feed_index + 1) : f->feed_index + 1
+ +
+ +#define NOT_THIS_FEED(...) {          \
+ +  rtex_trace(f, D_ROUTES, __VA_ARGS__);       \
+ +  NEXT_INDEX(f);                      \
+ +  continue;                           \
+ +}
+ +
+ +    if (!feed)
+ +      NOT_THIS_FEED("Nothing found for index %u", f->feed_index);
+ +
+ +    NEXT_INDEX(f);
+ +    return feed;
+ +  }
+ +
+ +  RCU_RETRY_FAST(u);
+ +}
+ +
+ +struct rt_export_feed *
+ +rt_export_next_feed(struct rt_export_feeder *f)
+ +{
+ +  ASSERT_DIE(f);
+ +
+ +  struct rt_export_feed *feed = NULL;
+ +  if (f->domain.rtable)
+ +  {
+ +    LOCK_DOMAIN(rtable, f->domain);
+ +    feed = rt_export_get_next_feed(f, NULL);
+ +    UNLOCK_DOMAIN(rtable, f->domain);
+ +  }
+ +  else
+ +  {
+ +    RCU_ANCHOR(u);
+ +    feed = rt_export_get_next_feed(f, u);
+ +  }
+ +
+ +  if (feed)
+ +    return feed;
+ +
+ +  /* Feeding done */
+ +  struct rt_feeding_request *reverse = NULL;
+ +  while (f->feeding)
+ +  {
+ +    struct rt_feeding_request *rfr = f->feeding;
+ +    f->feeding = rfr->next;
+ +    rfr->next = reverse;
+ +    reverse = rfr;
+ +  }
+ +
+ +  /* Call the done hook in the same order as requests came in */
+ +  while (reverse)
+ +  {
+ +    struct rt_feeding_request *rfr = reverse;
+ +    reverse = rfr->next;
+ +    CALL(rfr->done, rfr);
+ +  }
+ +
+ +  f->feed_index = 0;
+ +
+ +  uint count = 0;
+ +  for (struct rt_feeding_request *rfr = f->feed_pending; rfr; rfr = rfr->next)
+ +    count++;
+ +
+ +  rtex_trace(f, D_STATES, "Feeding done, %u refeed request%s pending",
+ +      count, (count == 1) ? "" : "s");
+ +
+ +  if (!f->feed_pending)
+ +    return NULL;
+ +
+ +  f->feeding = f->feed_pending;
+ +  f->feed_pending = NULL;
+ +  return rt_export_next_feed(f);
+ +}
+ +
+ +static void
+ +rt_feeding_request_default_done(struct rt_feeding_request *rfr)
+ +{
+ +  mb_free(rfr);
+ +}
+ +
+ +void
+ +rt_export_refeed_feeder(struct rt_export_feeder *f, struct rt_feeding_request *rfr)
+ +{
+ +  if (!rfr)
+ +    return;
+ +
+ +  rfr->next = f->feed_pending;
+ +  f->feed_pending = rfr;
+ +}
+ +
+ +void rt_export_refeed_request(struct rt_export_request *rer, struct rt_feeding_request *rfr)
+ +{
+ +  if (!rfr)
+ +  {
+ +    rfr = mb_allocz(rer->pool, sizeof *rfr);
+ +    rfr->done = rt_feeding_request_default_done;
+ +  }
+ +
+ +  bmap_reset(&rer->feed_map, 4);
+ +  rt_export_refeed_feeder(&rer->feeder, rfr);
+ +  rt_export_change_state(rer, BIT32_ALL(TES_FEEDING, TES_PARTIAL, TES_READY), TES_PARTIAL);
+ +  if (rer->r.event)
+ +    ev_send(rer->r.target, rer->r.event);
+ +}
+ +
+ +void
+ +rtex_export_subscribe(struct rt_exporter *e, struct rt_export_request *r)
+ +{
+ +  rt_export_change_state(r, BIT32_ALL(TES_DOWN), TES_FEEDING);
+ +
+ +  ASSERT_DIE(r->pool);
+ +
+ +  rt_feeder_subscribe(e, &r->feeder);
+ +
+ +  lfjour_register(&e->journal, &r->r);
+ +
+ +  r->stats = (struct rt_export_stats) {};
+ +  r->last_state_change = current_time();
+ +  bmap_init(&r->seq_map, r->pool, 4);
+ +  bmap_init(&r->feed_map, r->pool, 4);
+ +
+ +  rt_export_refeed_request(r, NULL);
+ +}
+ +
+ +void
+ +rtex_export_unsubscribe(struct rt_export_request *r)
+ +{
+ +  rt_feeder_unsubscribe(&r->feeder);
+ +
+ +  if (r->cur)
+ +    rt_export_release(r->cur);
+ +
+ +  switch (rt_export_change_state(r, BIT32_ALL(TES_FEEDING, TES_PARTIAL, TES_READY, TES_STOP), TES_DOWN))
+ +  {
+ +    case TES_FEEDING:
+ +    case TES_PARTIAL:
+ +    case TES_READY:
+ +    case TES_STOP:
+ +      lfjour_unregister(&r->r);
+ +      break;
+ +    default:
+ +      bug("not implemented");
+ +  }
+ +
+ +  bmap_free(&r->feed_map);
+ +  bmap_free(&r->seq_map);
+ +}
+ +
+ +static void
+ +rt_exporter_cleanup_done(struct lfjour *j, u64 begin_seq UNUSED, u64 end_seq)
+ +{
+ +  SKIP_BACK_DECLARE(struct rt_exporter, e, journal, j);
+ +
+ +  /* TODO: log the begin_seq / end_seq values */
+ +
+ +  CALL(e->cleanup_done, e, end_seq);
+ +  if (e->stopped && (lfjour_count_recipients(j) == 0))
+ +  {
+ +    settle_cancel(&j->announce_timer);
+ +    ev_postpone(&j->cleanup_event);
+ +    e->stopped(e);
+ +  }
+ +}
+ +
+ +void
+ +rt_exporter_init(struct rt_exporter *e, struct settle_config *scf)
+ +{
+ +  rtex_trace(e, D_STATES, "Exporter init");
+ +  e->journal.cleanup_done = rt_exporter_cleanup_done;
+ +  lfjour_init(&e->journal, scf);
+ +  ASSERT_DIE(e->feed_net);
+ +  ASSERT_DIE(e->netindex);
+ +}
+ +
+ +struct rt_export_item *
+ +rt_exporter_push(struct rt_exporter *e, const struct rt_export_item *uit)
+ +{
+ +  /* Get the object */
+ +  struct lfjour_item *lit = lfjour_push_prepare(&e->journal);
+ +  if (!lit)
+ +    return NULL;
+ +
+ +  SKIP_BACK_DECLARE(struct rt_export_item, it, li, lit);
+ +
+ +  /* Copy the data, keeping the header */
+ +  memcpy(&it->data, &uit->data, e->journal.item_size - OFFSETOF(struct rt_export_item, data));
+ +
+ +  /* Commit the update */
+ +  rtex_trace(e, D_ROUTES, "Announcing change %lu at %N: %p (%u) -> %p (%u)",
+ +      lit->seq, (uit->new ?: uit->old)->net,
+ +      uit->old, uit->old ? uit->old->id : 0,
+ +      uit->new, uit->new ? uit->new->id : 0);
+ +
+ +  lfjour_push_commit(&e->journal);
+ +
+ +  /* Return the update pointer */
+ +  return it;
+ +}
+ +
+ +#define RTEX_FEEDERS_LOCK(e)  \
+ +  while (atomic_exchange_explicit(&e->feeders_lock, 1, memory_order_acq_rel)) \
+ +    birdloop_yield(); \
+ +  CLEANUP(_rtex_feeders_unlock_) UNUSED struct rt_exporter *_rtex_feeders_locked_ = e;
+ +
+ +static inline void _rtex_feeders_unlock_(struct rt_exporter **e)
+ +{
+ +  ASSERT_DIE(atomic_exchange_explicit(&(*e)->feeders_lock, 0, memory_order_acq_rel));
+ +}
+ +
+ +void
+ +rt_feeder_subscribe(struct rt_exporter *e, struct rt_export_feeder *f)
+ +{
+ +  f->feed_index = 0;
+ +
+ +  atomic_store_explicit(&f->exporter, e, memory_order_relaxed);
+ +  f->domain = e->domain;
+ +
+ +  RTEX_FEEDERS_LOCK(e);
+ +  rt_export_feeder_add_tail(&e->feeders, f);
+ +
+ +  rtex_trace(f, D_STATES, "Subscribed to exporter %s", e->name);
+ +}
+ +
+ +static void
+ +rt_feeder_do_unsubscribe(struct rt_export_feeder *f)
+ +{
+ +  struct rt_exporter *e = atomic_exchange_explicit(&f->exporter, NULL, memory_order_acquire);
+ +  if (e)
+ +  {
+ +    RTEX_FEEDERS_LOCK(e);
+ +    rt_export_feeder_rem_node(&e->feeders, f);
+ +
+ +    rtex_trace(f, D_STATES, "Unsubscribed from exporter %s", e->name);
+ +  }
+ +  else
+ +    rtex_trace(f, D_STATES, "Already unsubscribed");
+ +}
+ +
+ +void
+ +rt_feeder_unsubscribe(struct rt_export_feeder *f)
+ +{
+ +  if (f->domain.rtable)
+ +  {
+ +    LOCK_DOMAIN(rtable, f->domain);
+ +    rt_feeder_do_unsubscribe(f);
+ +    UNLOCK_DOMAIN(rtable, f->domain);
+ +  }
+ +  else
+ +  {
+ +    RCU_ANCHOR(u);
+ +    rt_feeder_do_unsubscribe(f);
+ +  }
+ +}
+ +
+ +void
+ +rt_exporter_shutdown(struct rt_exporter *e, void (*stopped)(struct rt_exporter *))
+ +{
+ +  rtex_trace(e, D_STATES, "Exporter shutdown");
+ +
+ +  /* Last lock check before dropping the domain reference */
+ +  if (e->journal.domain)
+ +    ASSERT_DIE(DG_IS_LOCKED(e->journal.domain));
+ +
+ +  e->journal.domain = NULL;
+ +
+ +  /* We have to tell every receiver to stop */
++  bool done = 1;
+ +  WALK_TLIST(lfjour_recipient, r, &e->journal.recipients)
+ +  {
+ +    done = 0;
+ +    rt_export_change_state(
+ +      SKIP_BACK(struct rt_export_request, r, r),
+ +      BIT32_ALL(TES_FEEDING, TES_PARTIAL, TES_READY, TES_STOP),
+ +      TES_STOP);
+ +  }
+ +
+ +  /* We can drop feeders synchronously */
+ +  {
+ +    RTEX_FEEDERS_LOCK(e);
+ +    WALK_TLIST_DELSAFE(rt_export_feeder, f, &e->feeders)
+ +    {
+ +      ASSERT_DIE(atomic_exchange_explicit(&f->exporter, NULL, memory_order_acq_rel) == e);
+ +      rt_export_feeder_rem_node(&e->feeders, f);
+ +    }
+ +  }
+ +
+ +  /* Wait for feeders to finish */
+ +  synchronize_rcu();
+ +
+ +  /* The rest is done via the cleanup routine */
+ +  lfjour_do_cleanup_now(&e->journal);
+ +
+ +  if (done)
+ +  {
+ +    ev_postpone(&e->journal.cleanup_event);
+ +    settle_cancel(&e->journal.announce_timer);
+ +    CALL(stopped, e);
+ +  }
+ +  else
+ +//  e->stopped = stopped;
+ +    bug("not implemented yet");
+ +}
diff --cc nest/rt-table.c

index fbb0d985e63b3e57e4a9965430fceabf6e61f052,1b30e7dc794e84a4ea994e66ad94d774d10952cb..4f173815f3d4615eab24772e361bfe225ed1c0be
--- 1/nest/rt-table.c
--- 2/nest/rt-table.c
+++ b/nest/rt-table.c
@@@ -407,257 -275,6 +407,257 @@@ net_route(struct rtable_reading *tr, co
   #undef FVR_VPN
   }
   
-     _Bool withdraw = 0;
+ +/*
+ + * ROA aggregation subsystem
+ + */
+ +
+ +/* State of one ROA aggregator attached to a destination table */
+ +struct rt_roa_aggregator {
+ +  struct rt_stream stream;		/* Import side: request (dst) and destination table (dst_tab) */
+ +  struct rte_owner sources;		/* Owner of the route sources used by the aggregator */
+ +  struct rte_src *main_source;		/* Source set on every aggregated route imported */
+ +  struct rt_export_request src;	/* Export request subscribed to the source ROA table */
+ +  event event;				/* Event running rt_aggregate_roa() */
+ +};
+ +
+ +/*
+ + * Dump hook of the aggregator's import request.
+ + * Prints one line to the debug output; the line is newline-terminated
+ + * like the other request dump hooks in this file.
+ + */
+ +static void
+ +rt_dump_roa_aggregator_dst_req(struct rt_import_request *req)
+ +{
+ +  debug("  ROA aggregator import request req=%p\n", req);
+ +}
+ +
+ +/*
+ + * Dump hook of the aggregator's export request.
+ + * Prints one line to the debug output; the line is newline-terminated
+ + * like the other request dump hooks in this file.
+ + */
+ +static void
+ +rt_dump_roa_aggregator_src_req(struct rt_export_request *req)
+ +{
+ +  debug("  ROA aggregator export request req=%p\n", req);
+ +}
+ +
+ +/*
+ + * Import state change hook of the aggregator's import request.
+ + * Logs the new state name, but only if the request traces state changes.
+ + */
+ +static void
+ +rt_roa_aggregator_state_change(struct rt_import_request *req, u8 state)
+ +{
+ +  /* State tracing not requested, nothing to report */
+ +  if (!(req->trace_routes & D_STATES))
+ +    return;
+ +
+ +  log("%s: import state changed to %s", req->name, rt_import_state_name(state));
+ +}
+ +
+ +/*
+ + * Payload of the roa_aggregated attribute: an adata header followed by
+ + * an array of (asn, max_pxlen) pairs. The aggregator keeps the array sorted
+ + * by asn, then by max_pxlen. The number of pairs is not stored; it is
+ + * derived from ad.length by ROA_AGGR_COUNT().
+ + */
+ +struct rt_roa_aggregated_adata {
+ +  adata ad;
+ +  u32 padding;	/* NOTE(review): presumably aligns u[] and is counted in ad.length -- confirm against adata layout */
+ +  struct { u32 asn, max_pxlen; } u[0];
+ +};
+ +
+ +/* Number of (asn, max_pxlen) pairs stored in @rad, computed from the adata length.
+ + * Every use of the argument is parenthesized so that any pointer expression may be passed. */
+ +#define ROA_AGGR_COUNT(rad)   (((typeof (&(rad)->u[0])) ((rad)->ad.data + (rad)->ad.length)) - &(rad)->u[0])
+ +
+ +/*
+ + * Format hook of the roa_aggregated attribute.
+ + *
+ + * @a: attribute whose adata is a struct rt_roa_aggregated_adata
+ + * @buf: output buffer
+ + * @size: space available in @buf
+ + *
+ + * Writes "as A max M" entries separated by ", ". When the buffer is
+ + * running out (less than 30 bytes left), the list is cut and " ... "
+ + * is appended instead of the remaining entries. An empty aggregate is
+ + * formatted as an empty string.
+ + */
+ +static void
+ +ea_roa_aggregate_format(const eattr *a, byte *buf, uint size)
+ +{
+ +  SKIP_BACK_DECLARE(struct rt_roa_aggregated_adata, rad, ad, a->u.ptr);
+ +  uint cnt = ROA_AGGR_COUNT(rad);
+ +
+ +  /* No entries: there is no trailing separator to strip, and stripping it
+ +   * anyway would write in front of the buffer. */
+ +  if (!cnt)
+ +  {
+ +    if (size)
+ +      buf[0] = 0;
+ +    return;
+ +  }
+ +
+ +  for (uint upos = 0; upos < cnt; upos++)
+ +  {
+ +    int x = bsnprintf(buf, size, "as %u max %u, ", rad->u[upos].asn, rad->u[upos].max_pxlen);
+ +
+ +    /* A negative result means the entry did not fit; it must not be
+ +     * folded into the unsigned size and the buffer position. */
+ +    if (x < 0)
+ +    {
+ +      if (size)
+ +        buf[0] = 0;
+ +      return;
+ +    }
+ +
+ +    size -= x;
+ +    buf += x;
+ +
+ +    /* Not enough space for another entry, indicate the truncation */
+ +    if (size < 30)
+ +    {
+ +      bsnprintf(buf, size, " ... ");
+ +      return;
+ +    }
+ +  }
+ +
+ +  /* Strip the ", " after the last entry */
+ +  buf[-2] = 0;
+ +}
+ +
+ +/* Attribute class of the aggregated ROA list; formatted by ea_roa_aggregate_format() */
+ +static struct ea_class ea_roa_aggregated = {
+ +  .name = "roa_aggregated",
+ +  .type = T_ROA_AGGREGATED,
+ +  .format = ea_roa_aggregate_format,
+ +};
+ +
+ +
+ +static void
+ +rt_aggregate_roa(void *_rag)
+ +{
+ +  struct rt_roa_aggregator *rag = _rag;
+ +
+ +  RT_EXPORT_WALK(&rag->src, u) TMP_SAVED
+ +  {
++    bool withdraw = 0;
+ +    const net_addr *nroa = NULL;
+ +    switch (u->kind)
+ +    {
+ +      case RT_EXPORT_STOP:
+ +      bug("Main table export stopped");
+ +      break;
+ +
+ +      case RT_EXPORT_FEED:
+ +      nroa = u->feed->ni->addr;
+ +      withdraw = (u->feed->count_routes == 0);
+ +      break;
+ +
+ +      case RT_EXPORT_UPDATE:
+ +      nroa = u->update->new ? u->update->new->net : u->update->old->net;
+ +      withdraw = !u->update->new;
+ +      break;
+ +    }
+ +
+ +    net_addr_union nip;
+ +    net_copy(&nip.n, nroa);
+ +
+ +    uint asn, max_pxlen;
+ +
+ +    switch (nip.n.type)
+ +    {
+ +      case NET_ROA6: nip.n.type = NET_IP6;
+ +                   nip.n.length = net_addr_length[NET_IP6];
+ +                   asn = nip.roa6.asn;
+ +                   max_pxlen = nip.roa6.max_pxlen;
+ +                   break;
+ +      case NET_ROA4: nip.n.type = NET_IP4;
+ +                   nip.n.length = net_addr_length[NET_IP4];
+ +                   asn = nip.roa4.asn;
+ +                   max_pxlen = nip.roa4.max_pxlen;
+ +                   break;
+ +      default: bug("exported garbage from ROA table");
+ +    }
+ +
+ +    rte prev = rt_net_best(rag->stream.dst_tab, &nip.n);
+ +
+ +    struct rt_roa_aggregated_adata *rad_new;
+ +    uint count;
+ +
+ +    if (prev.attrs)
+ +    {
+ +      eattr *ea = ea_find(prev.attrs, &ea_roa_aggregated);
+ +      SKIP_BACK_DECLARE(struct rt_roa_aggregated_adata, rad, ad, ea->u.ptr);
+ +
+ +      count = ROA_AGGR_COUNT(rad);
+ +      rad_new = tmp_alloc(sizeof *rad_new + (count + 1) * sizeof rad_new->u[0]);
+ +
+ +      /* Insertion into a sorted list */
+ +      uint p = 0;
+ +      for (p = 0; p < count; p++)
+ +      if ((rad->u[p].asn < asn) || (rad->u[p].asn == asn) && (rad->u[p].max_pxlen < max_pxlen))
+ +        rad_new->u[p] = rad->u[p];
+ +      else
+ +        break;
+ +
+ +      if ((rad->u[p].asn == asn) && (rad->u[p].max_pxlen))
+ +      /* Found */
+ +      if (withdraw)
+ +        memcpy(&rad_new->u[p], &rad->u[p+1], (--count - p) * sizeof rad->u[p]);
+ +      else
+ +        continue;
+ +      else
+ +      /* Not found */
+ +      if (withdraw)
+ +        continue;
+ +      else
+ +      {
+ +        rad_new->u[p].asn = asn;
+ +        rad_new->u[p].max_pxlen = max_pxlen;
+ +        memcpy(&rad_new->u[p+1], &rad->u[p], (count++ - p) * sizeof rad->u[p]);
+ +      }
+ +    }
+ +    else if (!withdraw)
+ +    {
+ +      count = 1;
+ +      rad_new = tmp_alloc(sizeof *rad_new + sizeof rad_new->u[0]);
+ +      rad_new->u[0].asn = asn;
+ +      rad_new->u[0].max_pxlen = max_pxlen;
+ +    }
+ +    else
+ +      continue;
+ +
+ +    rad_new->ad.length = (byte *) &rad_new->u[count] - rad_new->ad.data;
+ +
+ +    rte r = {
+ +      .src = rag->main_source,
+ +    };
+ +
+ +    ea_set_attr(&r.attrs, EA_LITERAL_DIRECT_ADATA(&ea_roa_aggregated, 0, &rad_new->ad));
+ +
+ +    rte_import(&rag->stream.dst, &nip.n, &r, rag->main_source);
+ +
+ +#if 0
+ +    /* Do not split ROA aggregator, we want this to be finished asap */
+ +    MAYBE_DEFER_TASK(rag->src.r.target, rag->src.r.event,
+ +      "export to %s", rag->src.name);
+ +#endif
+ +  }
+ +}
+ +
+ +/*
+ + * Set up ROA aggregation into table @t.
+ + *
+ + * The source table is taken from the table's configuration (master.src).
+ + * The aggregator is allocated from the table's pool, recorded as the
+ + * table's master stream, and finally connected: an import request towards
+ + * @t and a subscription to the best-route export of the source table.
+ + */
+ +static void
+ +rt_setup_roa_aggregator(rtable *t)
+ +{
+ +  rtable *src = t->config->master.src->table;
+ +  struct rt_roa_aggregator *rag;
+ +  {
+ +    /* Allocation and initialization is done inside the RT_LOCK(t, tab) block */
+ +    RT_LOCK(t, tab);
+ +    char *ragname = mb_sprintf(tab->rp, "%s.roa-aggregator", src->name);
+ +    rag = mb_alloc(tab->rp, sizeof *rag);
+ +    *rag = (struct rt_roa_aggregator) {
+ +      /* Import side: routes go to this table, processed in its loop */
+ +      .stream = {
+ +      .dst = {
+ +        .name = ragname,
+ +        .trace_routes = tab->debug,
+ +        .loop = t->loop,
+ +        .dump_req = rt_dump_roa_aggregator_dst_req,
+ +        .log_state_change = rt_roa_aggregator_state_change,
+ +      },
+ +      .dst_tab = t,
+ +      },
+ +      /* Export side: pending exports trigger the aggregator's own event */
+ +      .src = {
+ +      .name = ragname,
+ +      .r = {
+ +        .target = birdloop_event_list(t->loop),
+ +        .event = &rag->event,
+ +      },
+ +      .pool = birdloop_pool(t->loop),
+ +      .dump = rt_dump_roa_aggregator_src_req,
+ +      .trace_routes = tab->debug,
+ +      },
+ +      /* The event runs the aggregation itself */
+ +      .event = {
+ +      .hook = rt_aggregate_roa,
+ +      .data = rag,
+ +      },
+ +    };
+ +
+ +    /* All aggregated routes use the single source with ID 0 */
+ +    rt_init_sources(&rag->sources, ragname, birdloop_event_list(t->loop));
+ +    rag->main_source = rt_get_source_o(&rag->sources, 0);
+ +
+ +    tab->master = &rag->stream;
+ +  }
+ +
+ +  /* Connect both ends after the locked block above has been left */
+ +  rt_request_import(t, &rag->stream.dst);
+ +  rt_export_subscribe(src, best, &rag->src);
+ +}
+ +
+ +/*
+ + * Event hook run after the aggregator's route sources have been destroyed.
+ + * Releases the table lock taken in rt_stop_roa_aggregator().
+ + */
+ +static void
+ +rt_roa_aggregator_sources_gone(void *t)
+ +{
+ +  rtable *tab = (rtable *) t;
+ +  rt_unlock_table(tab);
+ +}
+ +
+ +/*
+ + * Stop ROA aggregation into table @t.
+ + *
+ + * Starts destruction of the aggregator's route sources, then stops the
+ + * import request and unsubscribes from the source table export.
+ + */
+ +static void
+ +rt_stop_roa_aggregator(rtable *t)
+ +{
+ +  struct rt_roa_aggregator *rag;
+ +  RT_LOCKED(t, tab)
+ +  {
+ +    /* The table's master stream is embedded in the aggregator structure */
+ +    rag = SKIP_BACK(struct rt_roa_aggregator, stream, tab->master);
+ +
+ +    /* Lock the table; rt_roa_aggregator_sources_gone() unlocks it
+ +     * from the event passed to rt_destroy_sources() */
+ +    rt_lock_table(tab);
+ +    rt_destroy_sources(&rag->sources, ev_new_init(tab->rp,
+ +        rt_roa_aggregator_sources_gone, tab));
+ +    /* Drop the reference obtained by rt_get_source_o() at setup */
+ +    rt_unlock_source(rag->main_source);
+ +  }
+ +
+ +  /* Stopping both import and export.
+ +   * All memory will be freed with table shutdown,
+ +   * no need to do anything from import done callback */
+ +  rt_stop_import(&rag->stream.dst, NULL);
+ +  rt_export_unsubscribe(best, &rag->src);
+ +}
   
   /**
    * roa_check - check validity of route origination in a ROA table
@@@ -1079,50 -647,51 +1079,50 @@@ rt_notify_basic(struct channel *c, cons
   }
   
   static void
- -rt_notify_accepted(struct channel *c, net *net, rte *new_changed, rte *old_changed, int refeed)
+ +rt_notify_accepted(struct channel *c, const struct rt_export_feed *feed)
   {
- -  // struct proto *p = c->proto;
- -  rte *new_best = NULL;
- -  rte *old_best = NULL;
- -  rte *new_free = NULL;
- -  int new_first = 0;
+ +  rte *old_best, *new_best;
-   _Bool feeding = rt_net_is_feeding(&c->out_req, feed->ni->addr);
-   _Bool idempotent = 0;
++  bool feeding = rt_net_is_feeding(&c->out_req, feed->ni->addr);
++  bool idempotent = 0;
   
- -  /*
- -   * We assume that there are no changes in net route order except (added)
- -   * new_changed and (removed) old_changed. Therefore, the function is not
- -   * compatible with deterministic_med (where nontrivial reordering can happen
- -   * as a result of a route change) and with recomputation of recursive routes
- -   * due to next hop update (where many routes can be changed in one step).
- -   *
- -   * Note that we need this assumption just for optimizations, we could just
- -   * run full new_best recomputation otherwise.
- -   *
- -   * There are three cases:
- -   * feed or old_best is old_changed -> we need to recompute new_best
- -   * old_best is before new_changed -> new_best is old_best, ignore
- -   * old_best is after new_changed -> try new_changed, otherwise old_best
- -   */
- -
- -  if (net->routes)
- -    c->stats.exp_updates_received++;
- -  else
- -    c->stats.exp_withdraws_received++;
- -
- -  /* Find old_best - either old_changed, or route for net->routes */
- -  if (old_changed && bmap_test(&c->export_map, old_changed->id))
- -    old_best = old_changed;
- -  else
+ +  for (uint i = 0; i < feed->count_routes; i++)
     {
- -    for (rte *r = net->routes; rte_is_valid(r); r = r->next)
+ +    rte *r = &feed->block[i];
+ +
+ +    /* Previously exported */
+ +    if (!old_best && bmap_test(&c->export_accepted_map, r->id))
       {
- -      if (bmap_test(&c->export_map, r->id))
+ +      old_best = r;
+ +
+ +      /* Is still the best and need not be refed anyway */
+ +      if (!new_best && !feeding)
         {
- -      old_best = r;
- -      break;
+ +      idempotent = 1;
+ +      new_best = r;
         }
+ +    }
+ +
+ +    /* Unflag obsolete routes */
+ +    if (r->flags & REF_OBSOLETE)
+ +      bmap_clear(&c->export_rejected_map, r->id);
+ +
+ +    /* Mark invalid as rejected */
+ +    else if (!rte_is_valid(r))
+ +      bmap_set(&c->export_rejected_map, r->id);
   
- -      /* Note if new_changed found before old_best */
- -      if (r == new_changed)
- -      new_first = 1;
+ +    /* Already rejected */
+ +    else if (!feeding && bmap_test(&c->export_rejected_map, r->id))
+ +      ;
+ +
+ +    /* No new best route yet and this is a valid candidate */
+ +    else if (!new_best)
+ +    {
+ +      /* This branch should not be executed if this route is old best */
+ +      ASSERT_DIE(r != old_best);
+ +
+ +      /* Have no new best route yet, try this route not seen before */
+ +      new_best = export_filter(c, r, 0);
+ +      DBG("rt_notify_accepted: checking route id %u: %s\n", r->id, new_best ? "ok" : "no");
       }
     }
   
@@@ -1166,20 -730,15 +1166,20 @@@ channel_notify_accepted(void *_channel
   }
   
   rte *
- -rt_export_merged(struct channel *c, net *net, rte **rt_free, linpool *pool, int silent)
+ +rt_export_merged(struct channel *c, const struct rt_export_feed *feed, linpool *pool, int silent)
   {
-   _Bool feeding = !silent && rt_net_is_feeding(&c->out_req, feed->ni->addr);
++  bool feeding = !silent && rt_net_is_feeding(&c->out_req, feed->ni->addr);
+ +
     // struct proto *p = c->proto;
- -  struct nexthop *nhs = NULL;
- -  rte *best0, *best, *rt0, *rt, *tmp;
+ +  struct nexthop_adata *nhs = NULL;
+ +  rte *best0 = &feed->block[0];
+ +  rte *best = NULL;
   
- -  best0 = net->routes;
- -  *rt_free = NULL;
+ +  /* First route is obsolete */
+ +  if (best0->flags & REF_OBSOLETE)
+ +    return NULL;
   
+ +  /* First route is invalid */
     if (!rte_is_valid(best0))
       return NULL;
   
@@@ -1246,549 -782,382 +1246,549 @@@
   }
   
   static void
- -rt_notify_merged(struct channel *c, net *net, rte *new_changed, rte *old_changed,
- -               rte *new_best, rte *old_best, int refeed)
+ +rt_notify_merged(struct channel *c, const struct rt_export_feed *f)
   {
- -  // struct proto *p = c->proto;
- -  rte *new_free = NULL;
- -
- -  /* We assume that all rte arguments are either NULL or rte_is_valid() */
+ +  const rte *old_best = NULL;
+ +  /* Find old best route */
+ +  for (uint i = 0; i < f->count_routes; i++)
+ +    if (bmap_test(&c->export_accepted_map, f->block[i].id))
+ +    {
+ +      old_best = &f->block[i];
+ +      break;
+ +    }
   
- -  /* This check should be done by the caller */
- -  if (!new_best && !old_best)
- -    return;
+ +  /* Prepare new merged route */
+ +  rte *new_merged = f->count_routes ? rt_export_merged(c, f, tmp_linpool, 0) : NULL;
   
- -  /* Check whether the change is relevant to the merged route */
- -  if ((new_best == old_best) &&
- -      (new_changed != old_changed) &&
- -      !rte_mergable(new_best, new_changed) &&
- -      !rte_mergable(old_best, old_changed))
- -    return;
+ +  /* And notify the protocol */
+ +  if (new_merged || old_best)
+ +    do_rt_notify(c, f->ni->addr, new_merged, old_best);
+ +}
   
- -  if (new_best)
- -    c->stats.exp_updates_received++;
- -  else
- -    c->stats.exp_withdraws_received++;
   
- -  /* Prepare new merged route */
- -  if (new_best)
- -    new_best = rt_export_merged(c, net, &new_free, rte_update_pool, 0);
+ +void
+ +channel_notify_merged(void *_channel)
+ +{
+ +  struct channel *c = _channel;
   
- -  /* Check old merged route */
- -  if (old_best && !bmap_test(&c->export_map, old_best->id))
- -    old_best = NULL;
+ +  RT_EXPORT_WALK(&c->out_req, u)
+ +  {
+ +    switch (u->kind)
+ +    {
+ +      case RT_EXPORT_STOP:
+ +      bug("Main table export stopped");
   
- -  if (!new_best && !old_best)
- -    return;
+ +      case RT_EXPORT_FEED:
+ +      if (u->feed->count_routes)
+ +        rt_notify_merged(c, u->feed);
+ +      break;
   
- -  do_rt_notify(c, net, new_best, old_best, refeed);
+ +      case RT_EXPORT_UPDATE:
+ +      {
+ +        struct rt_export_feed *f = rt_net_feed(c->table, u->update->new ? u->update->new->net : u->update->old->net, SKIP_BACK(struct rt_pending_export, it, u->update));
+ +        rt_notify_merged(c, f);
+ +        for (uint i=0; i<f->count_exports; i++)
+ +          rt_export_processed(&c->out_req, f->exports[i]);
+ +        break;
+ +      }
+ +    }
   
- -  /* Discard temporary rte */
- -  if (new_free)
- -    rte_free(new_free);
+ +    MAYBE_DEFER_TASK(c->out_req.r.target, c->out_req.r.event,
+ +      "export to %s.%s (merged)", c->proto->name, c->name);
+ +  }
   }
   
- -
- -/**
- - * rte_announce - announce a routing table change
- - * @tab: table the route has been added to
- - * @type: type of route announcement (RA_UNDEF or RA_ANY)
- - * @net: network in question
- - * @new: the new or changed route
- - * @old: the previous route replaced by the new one
- - * @new_best: the new best route for the same network
- - * @old_best: the previous best route for the same network
- - *
- - * This function gets a routing table update and announces it to all protocols
- - * that are connected to the same table by their channels.
- - *
- - * There are two ways of how routing table changes are announced. First, there
- - * is a change of just one route in @net (which may caused a change of the best
- - * route of the network). In this case @new and @old describes the changed route
- - * and @new_best and @old_best describes best routes. Other routes are not
- - * affected, but in sorted table the order of other routes might change.
- - *
- - * Second, There is a bulk change of multiple routes in @net, with shared best
- - * route selection. In such case separate route changes are described using
- - * @type of %RA_ANY, with @new and @old specifying the changed route, while
- - * @new_best and @old_best are NULL. After that, another notification is done
- - * where @new_best and @old_best are filled (may be the same), but @new and @old
- - * are NULL.
- - *
- - * The function announces the change to all associated channels. For each
- - * channel, an appropriate preprocessing is done according to channel &ra_mode.
- - * For example, %RA_OPTIMAL channels receive just changes of best routes.
- - *
- - * In general, we first call preexport() hook of a protocol, which performs
- - * basic checks on the route (each protocol has a right to veto or force accept
- - * of the route before any filter is asked). Then we consult an export filter
- - * of the channel and verify the old route in an export map of the channel.
- - * Finally, the rt_notify() hook of the protocol gets called.
- - *
- - * Note that there are also calls of rt_notify() hooks due to feed, but that is
- - * done outside of scope of rte_announce().
- - */
- -static void
- -rte_announce(rtable *tab, uint type, net *net, rte *new, rte *old,
- -           rte *new_best, rte *old_best)
+ +void
+ +channel_notify_basic(void *_channel)
   {
- -  if (!rte_is_valid(new))
- -    new = NULL;
+ +  struct channel *c = _channel;
   
- -  if (!rte_is_valid(old))
- -    old = NULL;
+ +  RT_EXPORT_WALK(&c->out_req, u)
+ +  {
+ +    switch (u->kind)
+ +    {
+ +      case RT_EXPORT_STOP:
+ +      bug("Main table export stopped");
+ +
+ +      case RT_EXPORT_FEED:
+ +      {
+ +        /* Find where the old route block begins */
+ +        uint oldpos = 0;
+ +        while ((oldpos < u->feed->count_routes) && !(u->feed->block[oldpos].flags & REF_OBSOLETE))
+ +          oldpos++;
+ +
+ +        /* Send updates one after another */
+ +        for (uint i = 0; i < oldpos; i++)
+ +        {
+ +          rte *new = &u->feed->block[i];
+ +          rte *old = NULL;
+ +          for (uint o = oldpos; o < u->feed->count_routes; o++)
+ +            if (new->src == u->feed->block[o].src)
+ +            {
+ +              old = &u->feed->block[o];
+ +              break;
+ +            }
   
- -  if (!rte_is_valid(new_best))
- -    new_best = NULL;
+ +          rt_notify_basic(c, new, old);
   
- -  if (!rte_is_valid(old_best))
- -    old_best = NULL;
+ +          /* Mark old processed */
+ +          if (old)
+ +            old->src = NULL;
+ +        }
   
- -  if (!new && !old && !new_best && !old_best)
- -    return;
+ +        /* Send withdraws */
+ +        for (uint o = oldpos; o < u->feed->count_routes; o++)
+ +          if (u->feed->block[o].src)
+ +            rt_notify_basic(c, NULL, &u->feed->block[o]);
+ +      }
+ +      break;
   
- -  if (new_best != old_best)
- -  {
- -    if (new_best)
- -      new_best->sender->stats.pref_routes++;
- -    if (old_best)
- -      old_best->sender->stats.pref_routes--;
+ +      case RT_EXPORT_UPDATE:
+ +      {
+ +        const rte *new = u->update->new;
+ +        const rte *old = u->update->old;
+ +        struct rte_src *src = (c->ra_mode == RA_ANY) ? (new ? new->src : old->src) : NULL;
+ +
+ +        /* Squashing subsequent updates */
+ +        for (SKIP_BACK_DECLARE(const struct rt_pending_export, rpe, it, u->update);
+ +            rpe = atomic_load_explicit(&rpe->next, memory_order_acquire) ;)
+ +          /* Either new is the same as this update's "old". Then the squash
+ +           * is obvious.
+ +           *
+ +           * Or we're squashing an update-from-nothing with a withdrawal,
+ +           * and then either src is set because it must match (RA_ANY)
+ +           * or it doesn't matter at all (RA_OPTIMAL).
+ +           */
+ +          if ((rpe->it.old == new) && (new || src && (src == rpe->it.new->src)))
+ +          {
+ +            new = rpe->it.new;
+ +            rt_export_processed(&c->out_req, rpe->it.seq);
+ +          }
   
- -    if (tab->hostcache)
- -      rt_notify_hostcache(tab, net);
+ +        if (new && old && rte_same(new, old))
+ +        {
+ +          channel_rte_trace_out(D_ROUTES, c, new, "already exported");
   
- -    if (!EMPTY_LIST(tab->flowspec_links))
- -      rt_flowspec_notify(tab, net);
- -  }
+ +          if ((new->id != old->id) && bmap_test(&c->export_accepted_map, old->id))
+ +          {
+ +            bmap_set(&c->export_accepted_map, new->id);
+ +            bmap_clear(&c->export_accepted_map, old->id);
+ +          }
+ +        }
+ +        else if (!new && !old)
+ +          channel_rte_trace_out(D_ROUTES, c, u->update->new, "idempotent withdraw (squash)");
+ +        else
+ +          rt_notify_basic(c, new, old);
+ +
+ +        break;
+ +      }
+ +    }
   
- -  rt_schedule_notify(tab);
+ +    MAYBE_DEFER_TASK(c->out_req.r.target, c->out_req.r.event,
+ +      "export to %s.%s (regular)", c->proto->name, c->name);
+ +  }
+ +}
   
- -  struct channel *c; node *n;
- -  WALK_LIST2(c, n, tab->channels, table_node)
+ +static void
+ +rt_flush_best(struct rtable_private *tab, u64 upto)
+ +{
+ +  u64 last_seq = 0;
+ +  RT_EXPORT_WALK(&tab->best_req, u)
     {
- -    if (c->export_state == ES_DOWN)
- -      continue;
+ +    ASSERT_DIE(u->kind == RT_EXPORT_UPDATE);
+ +    ASSERT_DIE(u->update->seq <= upto);
+ +    last_seq = u->update->seq;
+ +    if (last_seq == upto)
+ +      return;
+ +  }
   
- -    if (type && (type != c->ra_mode))
- -      continue;
+ +  rt_trace(tab, D_STATES, "Export best full flushed regular up to %lu", last_seq);
+ +}
   
- -    switch (c->ra_mode)
- -    {
- -    case RA_OPTIMAL:
- -      if (new_best != old_best)
- -      rt_notify_basic(c, net, new_best, old_best, 0);
- -      break;
+ +static struct rt_pending_export *
+ +rte_announce_to(struct rt_exporter *e, struct rt_net_pending_export *npe, const rte *new, const rte *old)
+ +{
+ +  if (new == old)
+ +    return NULL;
   
- -    case RA_ANY:
- -      if (new != old)
- -      rt_notify_basic(c, net, new, old, 0);
- -      break;
+ +  struct rt_pending_export rpe = {
+ +    .it = {
+ +      .new = new,
+ +      .old = old,
+ +    },
+ +  };
   
- -    case RA_ACCEPTED:
- -      /*
- -       * The (new != old) condition is problematic here, as it would break
- -       * the second usage pattern (announcement after bulk change, used in
- -       * rt_next_hop_update_net(), which sends both new and old as NULL).
- -       *
- -       * But recursive next hops do not work with sorted tables anyways,
- -       * such configuration is forbidden in BGP and not supported in
- -       * rt_notify_accepted().
- -       *
- -       * The condition is needed to eliminate spurious announcements where
- -       * both old and new routes are not valid (so they are NULL).
- -       */
- -      if (new != old)
- -      rt_notify_accepted(c, net, new, old, 0);
- -      break;
+ +  struct rt_export_item *rei = rt_exporter_push(e, &rpe.it);
+ +  if (!rei)
+ +    return NULL;
   
- -    case RA_MERGED:
- -      rt_notify_merged(c, net, new, old, new_best, old_best, 0);
- -      break;
- -    }
- -  }
+ +  SKIP_BACK_DECLARE(struct rt_pending_export, pushed, it, rei);
+ +
+ +  struct rt_pending_export *last = atomic_load_explicit(&npe->last, memory_order_relaxed);
+ +  if (last)
+ +    ASSERT_DIE(atomic_exchange_explicit(&last->next, pushed, memory_order_acq_rel) == NULL);
+ +
+ +  atomic_store_explicit(&npe->last, pushed, memory_order_release);
+ +  if (!atomic_load_explicit(&npe->first, memory_order_relaxed))
+ +    atomic_store_explicit(&npe->first, pushed, memory_order_release);
+ +
+ +  return pushed;
   }
   
- -static inline int
- -rte_validate(rte *e)
+ +static void
+ +rte_announce(struct rtable_private *tab, const struct netindex *i UNUSED, net *net, const rte *new, const rte *old,
+ +           const rte *new_best, const rte *old_best)
   {
- -  int c;
- -  net *n = e->net;
+ +  /* Update network count */
+ +  tab->net_count += (!!new_best - !!old_best);
+ +
+ +  int new_best_valid = rte_is_valid(new_best);
+ +  int old_best_valid = rte_is_valid(old_best);
+ +
+ +  if ((new == old) && (new_best == old_best))
+ +    return;
   
- -  if (!net_validate(n->n.addr))
+ +  if (new_best_valid)
+ +    new_best->sender->stats.pref++;
+ +  if (old_best_valid)
+ +    old_best->sender->stats.pref--;
+ +
+ +  /* Try to push */
+ +  struct rt_pending_export *best_rpe = NULL;
+ +  struct rt_pending_export *all_rpe = rte_announce_to(&tab->export_all, &net->all, new, old);
+ +  if (all_rpe)
     {
- -    log(L_WARN "Ignoring bogus prefix %N received via %s",
- -      n->n.addr, e->sender->proto->name);
- -    return 0;
+ +    /* Also best may have changed */
+ +    best_rpe = rte_announce_to(&tab->export_best, &net->best, new_best, old_best);
+ +    if (best_rpe)
+ +      /* Announced best, need an anchor to all */
+ +      best_rpe->seq_all = all_rpe->it.seq;
+ +    else if (!lfjour_pending_items(&tab->export_best.journal))
+ +      /* Best is idle, flush its recipient immediately */
+ +      rt_flush_best(tab, all_rpe->it.seq);
+ +
+ +    rt_check_cork_high(tab);
     }
- -
- -  /* FIXME: better handling different nettypes */
- -  c = !net_is_flow(n->n.addr) ?
- -    net_classify(n->n.addr): (IADDR_HOST | SCOPE_UNIVERSE);
- -  if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
+ +  else
     {
- -    log(L_WARN "Ignoring bogus route %N received via %s",
- -      n->n.addr, e->sender->proto->name);
- -    return 0;
+ +    /* Not announced anything, cleanup now */
+ +    ASSERT_DIE(new_best == old_best);
+ +    hmap_clear(&tab->id_map, old->id);
+ +    rte_free(SKIP_BACK(struct rte_storage, rte, old), tab);
     }
+ +}
   
- -  if (net_type_match(n->n.addr, NB_DEST) == !e->attrs->dest)
- -  {
- -    /* Exception for flowspec that failed validation */
- -    if (net_is_flow(n->n.addr) && (e->attrs->dest == RTD_UNREACHABLE))
- -      return 1;
+ +/*
+ + * Find the table's network structure a pending export belongs to.
+ + * The network address is taken from the new route if there is one,
+ + * from the old route otherwise; at least one of them has to be set.
+ + */
+ +static net *
+ +rt_cleanup_find_net(struct rtable_private *tab, struct rt_pending_export *rpe)
+ +{
+ +  ASSERT_DIE(rpe->it.new || rpe->it.old);
+ +
+ +  /* Pick the route carrying the address */
+ +  const rte *r = rpe->it.new ? rpe->it.new : rpe->it.old;
+ +  struct netindex *ni = NET_TO_INDEX(r->net);
+ +
+ +  /* The index must lie inside the currently allocated routes block */
+ +  ASSERT_DIE(ni->index < atomic_load_explicit(&tab->routes_block_size, memory_order_relaxed));
+ +
+ +  net *block = atomic_load_explicit(&tab->routes, memory_order_relaxed);
+ +  return block + ni->index;
+ +}
   
- static _Bool
- -    log(L_WARN "Ignoring route %N with invalid dest %d received via %s",
- -      n->n.addr, e->attrs->dest, e->sender->proto->name);
- -    return 0;
- -  }
++static bool
+ +rt_cleanup_update_pointers(struct rt_net_pending_export *npe, struct rt_pending_export *rpe)
+ +{
+ +  struct rt_pending_export *first = atomic_load_explicit(&npe->first, memory_order_relaxed);
+ +  struct rt_pending_export *last = atomic_load_explicit(&npe->last, memory_order_relaxed);
+ +  ASSERT_DIE(rpe == first);
   
- -  if ((e->attrs->dest == RTD_UNICAST) && !nexthop_is_sorted(&(e->attrs->nh)))
- -  {
- -    log(L_WARN "Ignoring unsorted multipath route %N received via %s",
- -      n->n.addr, e->sender->proto->name);
+ +  atomic_store_explicit(
+ +      &npe->first,
+ +      atomic_load_explicit(&rpe->next, memory_order_relaxed),
+ +      memory_order_release
+ +      );
+ +
+ +  if (rpe != last)
       return 0;
- -  }
   
+ +  atomic_store_explicit(&npe->last, NULL, memory_order_release);
     return 1;
   }
   
- -/**
- - * rte_free - delete a &rte
- - * @e: &rte to be deleted
- - *
- - * rte_free() deletes the given &rte from the routing table it's linked to.
- - */
- -void
- -rte_free(rte *e)
+ +static void
+ +rt_cleanup_export_best(struct lfjour *j, struct lfjour_item *i)
   {
- -  rt_unlock_source(e->src);
- -  if (rta_is_cached(e->attrs))
- -    rta_free(e->attrs);
- -  sl_free(e);
+ +  SKIP_BACK_DECLARE(struct rt_pending_export, rpe, it.li, i);
+ +  SKIP_BACK_DECLARE(struct rtable_private, tab, export_best.journal, j);
+ +  rt_flush_best(tab, rpe->seq_all);
+ +
+ +  /* Find the appropriate struct network */
+ +  net *net = rt_cleanup_find_net(tab, rpe);
+ +
+ +  /* Update the first and last pointers */
+ +  rt_cleanup_update_pointers(&net->best, rpe);
   }
   
- -static inline void
- -rte_free_quick(rte *e)
+ +static void
+ +rt_cleanup_export_all(struct lfjour *j, struct lfjour_item *i)
   {
- -  rt_unlock_source(e->src);
- -  rta_free(e->attrs);
- -  sl_free(e);
+ +  SKIP_BACK_DECLARE(struct rt_pending_export, rpe, it.li, i);
+ +  SKIP_BACK_DECLARE(struct rtable_private, tab, export_all.journal, j);
+ +
+ +  /* Find the appropriate struct network */
+ +  net *net = rt_cleanup_find_net(tab, rpe);
+ +
+ +  /* Update the first and last pointers */
-   _Bool is_last = rt_cleanup_update_pointers(&net->all, rpe);
++  bool is_last = rt_cleanup_update_pointers(&net->all, rpe);
+ +
+ +  /* Free the old route */
+ +  if (rpe->it.old)
+ +  {
+ +    ASSERT_DIE(rpe->it.old->flags & REF_OBSOLETE);
+ +    hmap_clear(&tab->id_map, rpe->it.old->id);
+ +    rte_free(SKIP_BACK(struct rte_storage, rte, rpe->it.old), tab);
+ +  }
+ +
+ +  if (is_last)
+ +    tab->gc_counter++;
   }
   
- -int
- -rte_same(rte *x, rte *y)
+ +static void
+ +rt_dump_best_req(struct rt_export_request *req)
   {
- -  /* rte.flags / rte.pflags are not checked, as they are internal to rtable */
- -  return
- -    x->attrs == y->attrs &&
- -    x->src == y->src &&
- -    rte_is_filtered(x) == rte_is_filtered(y);
+ +  SKIP_BACK_DECLARE(struct rtable_private, tab, best_req, req);
+ +  debug("  Table %s best cleanup request (%p)\n", tab->name, req);
   }
   
- -static inline int rte_is_ok(rte *e) { return e && !rte_is_filtered(e); }
- -
   static void
- -rte_recalculate(struct channel *c, net *net, rte *new, struct rte_src *src)
+ +rt_import_cleared(void *_ih)
   {
- -  struct proto *p = c->proto;
- -  struct rtable *table = c->table;
- -  struct proto_stats *stats = &c->stats;
- -  static struct tbf rl_pipe = TBF_DEFAULT_LOG_LIMITS;
- -  rte *before_old = NULL;
- -  rte *old_best = net->routes;
- -  rte *old = NULL;
- -  rte **k;
- -
- -  k = &net->routes;                   /* Find and remove original route from the same protocol */
- -  while (old = *k)
- -    {
- -      if (old->src == src)
- -      {
- -        /* If there is the same route in the routing table but from
- -         * a different sender, then there are two paths from the
- -         * source protocol to this routing table through transparent
- -         * pipes, which is not allowed.
- -         *
- -         * We log that and ignore the route. If it is withdraw, we
- -         * ignore it completely (there might be 'spurious withdraws',
- -         * see FIXME in do_rte_announce())
- -         */
- -        if (old->sender->proto != p)
- -          {
- -            if (new)
- -              {
- -                log_rl(&rl_pipe, L_ERR "Pipe collision detected when sending %N to table %s",
- -                    net->n.addr, table->name);
- -                rte_free_quick(new);
- -              }
- -            return;
- -          }
- -
- -        if (new && rte_same(old, new))
- -          {
- -            /* No changes, ignore the new route and refresh the old one */
+ +  struct rt_import_hook *hook = _ih;
   
- -            old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
- -
- -            if (!rte_is_filtered(new))
- -              {
- -                stats->imp_updates_ignored++;
- -                rte_trace_in(D_ROUTES, c, new, "ignored");
- -              }
+ +  ASSERT_DIE(hook->import_state == TIS_CLEARED);
   
- -            rte_free_quick(new);
- -            return;
- -          }
- -        *k = old->next;
- -        table->rt_count--;
- -        break;
- -      }
- -      k = &old->next;
- -      before_old = old;
- -    }
+ +  /* Local copy of the otherwise freed callback data */
+ +  void (*stopped)(struct rt_import_request *) = hook->stopped;
+ +  struct rt_import_request *req = hook->req;
   
- -  /* Save the last accessed position */
- -  rte **pos = k;
+ +  /* Finally uncouple from the table */
+ +  RT_LOCKED(hook->table, tab)
+ +  {
+ +    req->hook = NULL;
   
- -  if (!old)
- -    before_old = NULL;
+ +    rt_trace(tab, D_EVENTS, "Hook %s stopped", req->name);
+ +    rem_node(&hook->n);
+ +    mb_free(hook);
+ +    rt_unlock_table(tab);
+ +  }
   
- -  if (!old && !new)
- -    {
- -      stats->imp_withdraws_ignored++;
- -      return;
- -    }
+ +  /* And call the callback */
+ +  CALL(stopped, req);
+ +}
   
- -  int new_ok = rte_is_ok(new);
- -  int old_ok = rte_is_ok(old);
+ +static void
+ +rt_cleanup_done_all(struct rt_exporter *e, u64 end_seq)
+ +{
+ +  SKIP_BACK_DECLARE(struct rtable_private, tab, export_all, e);
+ +  ASSERT_DIE(DG_IS_LOCKED(tab->lock.rtable));
   
- -  struct channel_limit *l = &c->rx_limit;
- -  if (l->action && !old && new && !c->in_table)
- -    {
- -      u32 all_routes = stats->imp_routes + stats->filt_routes;
+ +  if (~end_seq)
+ +    rt_trace(tab, D_STATES, "Export all cleanup done up to seq %lu", end_seq);
+ +  else
+ +    rt_trace(tab, D_STATES, "Export all cleanup complete");
   
- -      if (all_routes >= l->limit)
- -      channel_notify_limit(c, l, PLD_RX, all_routes);
+ +  rt_check_cork_low(tab);
   
- -      if (l->state == PLS_BLOCKED)
+ +  struct rt_import_hook *ih; node *x, *n;
+ +  uint cleared_counter = 0;
+ +  if (tab->wait_counter)
+ +    WALK_LIST2_DELSAFE(ih, n, x, tab->imports, n)
+ +      if (ih->import_state == TIS_WAITING)
+ +      {
+ +      if (end_seq >= ih->flush_seq)
         {
- -        /* In receive limit the situation is simple, old is NULL so
- -           we just free new and exit like nothing happened */
- -
- -        stats->imp_updates_ignored++;
- -        rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
- -        rte_free_quick(new);
- -        return;
+ +        ih->import_state = TIS_CLEARED;
+ +        tab->wait_counter--;
+ +        cleared_counter++;
+ +
+ +        ih->cleanup_event = (event) {
+ +          .hook = rt_import_cleared,
+ +          .data = ih,
+ +        };
+ +        ev_send_loop(ih->req->loop, &ih->cleanup_event);
         }
- -    }
- -
- -  l = &c->in_limit;
- -  if (l->action && !old_ok && new_ok)
- -    {
- -      if (stats->imp_routes >= l->limit)
- -      channel_notify_limit(c, l, PLD_IN, stats->imp_routes);
+ +      }
   
- -      if (l->state == PLS_BLOCKED)
- -      {
- -        /* In import limit the situation is more complicated. We
- -           shouldn't just drop the route, we should handle it like
- -           it was filtered. We also have to continue the route
- -           processing if old or new is non-NULL, but we should exit
- -           if both are NULL as this case is probably assumed to be
- -           already handled. */
+ +  if (!EMPTY_LIST(tab->imports) &&
+ +      (tab->gc_counter >= tab->config->gc_threshold))
+ +    rt_kick_prune_timer(tab);
+ +}
   
- -        stats->imp_updates_ignored++;
- -        rte_trace_in(D_FILTERS, c, new, "ignored [limit]");
+ +static void
+ +rt_cleanup_done_best(struct rt_exporter *e, u64 end_seq)
+ +{
+ +  SKIP_BACK_DECLARE(struct rtable_private, tab, export_best, e);
   
- -        if (c->in_keep_filtered)
- -          new->flags |= REF_FILTERED;
- -        else
- -          { rte_free_quick(new); new = NULL; }
+ +  if (~end_seq)
+ +    rt_trace(tab, D_STATES, "Export best cleanup done up to seq %lu", end_seq);
+ +  else
+ +  {
+ +    rt_trace(tab, D_STATES, "Export best cleanup complete, flushing regular");
+ +    rt_flush_best(tab, ~0ULL);
+ +  }
+ +}
   
- -        /* Note that old && !new could be possible when
- -           c->in_keep_filtered changed in the recent past. */
+ +#define RT_EXPORT_BULK        1024
   
- -        if (!old && !new)
- -          return;
+ +static inline int
+ +rte_validate(struct channel *ch, rte *e)
+ +{
+ +  int c;
+ +  const net_addr *n = e->net;
+ +
+ +#define IGNORING(pre, post) do { \
+ +    log(L_WARN "%s.%s: Ignoring " pre " %N " post, ch->proto->name, ch->name, n); \
+ +    return 0; \
+ +  } while (0)
+ +
+ +  if (!net_validate(n))
+ +    IGNORING("bogus prefix", "");
+ +
+ +  /* FIXME: better handling different nettypes */
+ +  c = !net_is_flow(n) ?
+ +    net_classify(n): (IADDR_HOST | SCOPE_UNIVERSE);
+ +  if ((c < 0) || !(c & IADDR_HOST) || ((c & IADDR_SCOPE_MASK) <= SCOPE_LINK))
+ +    IGNORING("bogus route", "");
+ +
+ +  if (net_type_match(n, NB_DEST))
+ +  {
+ +    eattr *nhea = ea_find(e->attrs, &ea_gen_nexthop);
+ +    int dest = nhea_dest(nhea);
+ +
+ +    if (dest == RTD_NONE)
+ +      IGNORING("route", "with no destination");
+ +
+ +    if ((dest == RTD_UNICAST) &&
+ +      !nexthop_is_sorted((struct nexthop_adata *) nhea->u.ptr))
+ +      IGNORING("unsorted multipath route", "");
+ +  }
+ +  else if (ea_find(e->attrs, &ea_gen_nexthop))
+ +    IGNORING("route", "having a superfluous nexthop attribute");
+ +
+ +  return 1;
+ +}
+ +
+ +int
+ +rte_same(const rte *x, const rte *y)
+ +{
+ +  /* rte.flags / rte.pflags are not checked, as they are internal to rtable */
+ +  return
+ +    (x == y) || (
+ +     (x->attrs == y->attrs) ||
+ +     ((!x->attrs->stored || !y->attrs->stored) && ea_same(x->attrs, y->attrs))
+ +    ) &&
+ +    x->src == y->src &&
+ +    rte_is_filtered(x) == rte_is_filtered(y);
+ +}
+ +
+ +static inline int rte_is_ok(const rte *e) { return e && !rte_is_filtered(e); }
+ +
+ +static void
+ +rte_recalculate(struct rtable_private *table, struct rt_import_hook *c, struct netindex *i, net *net, rte *new, struct rte_src *src)
+ +{
+ +  struct rt_import_request *req = c->req;
+ +  struct rt_import_stats *stats = &c->stats;
+ +  struct rte_storage *old_best_stored = NET_BEST_ROUTE(table, net);
+ +  const rte *old_best = old_best_stored ? &old_best_stored->rte : NULL;
+ +
+ +  /* If the new route is identical to the old one, we find the attributes in
+ +   * cache and clone these with no performance drop. OTOH, if we were to lookup
+ +   * the attributes, such a route definitely hasn't been anywhere yet,
+ +   * therefore it's definitely worth the time. */
+ +  struct rte_storage *new_stored = NULL;
+ +  if (new)
+ +  {
+ +    new_stored = rte_store(new, i, table);
+ +    new = RTES_WRITE(new_stored);
+ +  }
+ +
+ +  struct rte_storage * _Atomic *last_ptr = NULL;
+ +  struct rte_storage *old_stored = NULL;
+ +  const rte *old = NULL;
   
- -        new_ok = 0;
- -        goto skip_stats1;
+ +  /* Find the original route from the same protocol */
+ +  NET_WALK_ROUTES(table, net, ep, e)
+ +  {
+ +    last_ptr = &e->next;
+ +    if (e->rte.src == src)
+ +      if (old_stored)
+ +      bug("multiple routes in table with the same src");
+ +      else
+ +      old_stored = e;
+ +  }
+ +
+ +  if (old_stored)
+ +    {
+ +      old = &old_stored->rte;
+ +
+ +      /* If there is the same route in the routing table but from
+ +       * a different sender, then there are two paths from the
+ +       * source protocol to this routing table through transparent
+ +       * pipes, which is not allowed.
+ +       * We log that and ignore the route. */
+ +      if (old->sender != c)
+ +      {
+ +        if (!old->generation && !new->generation)
+ +          bug("Two protocols claim to author a route with the same rte_src in table %s: %N %s/%u:%u",
+ +              c->table->name, i->addr, old->src->owner->name, old->src->private_id, old->src->global_id);
+ +
+ +        log_rl(&table->rl_pipe, L_ERR "Route source collision in table %s: %N %s/%u:%u",
+ +              c->table->name, i->addr, old->src->owner->name, old->src->private_id, old->src->global_id);
         }
+ +
+ +        if (new && rte_same(old, &new_stored->rte))
+ +          {
+ +            /* No changes, ignore the new route and refresh the old one */
+ +            old_stored->stale_cycle = new->stale_cycle;
+ +
+ +            if (!rte_is_filtered(new))
+ +              {
+ +                stats->updates_ignored++;
+ +                rt_rte_trace_in(D_ROUTES, req, new, "ignored");
+ +              }
+ +
+ +            /* We need to free the already stored route here before returning */
+ +            rte_free(new_stored, table);
+ +            return;
+ +        }
+ +    }
+ +
+ +  if (!old && !new)
+ +    {
+ +      stats->withdraws_ignored++;
+ +      return;
       }
   
+ +  /* If rejected by import limit, we need to pretend there is no route */
+ +  if (req->preimport && (req->preimport(req, new, old) == 0))
+ +  {
+ +    rte_free(new_stored, table);
+ +    new_stored = NULL;
+ +    new = NULL;
+ +  }
+ +
+ +  if (!new && !old)
+ +  {
+ +    stats->withdraws_ignored++;
+ +    return;
+ +  }
+ +
+ +  int new_ok = rte_is_ok(new);
+ +  int old_ok = rte_is_ok(old);
+ +
     if (new_ok)
- -    stats->imp_updates_accepted++;
+ +    stats->updates_accepted++;
     else if (old_ok)
- -    stats->imp_withdraws_accepted++;
+ +    stats->withdraws_accepted++;
     else
- -    stats->imp_withdraws_ignored++;
+ +    stats->withdraws_ignored++;
   
     if (old_ok || new_ok)
       table->last_rt_change = current_time();
@@@ -2026,478 -1373,179 +2026,478 @@@ rte_update(struct channel *c, const net
   
     ASSERT(c->channel_state == CS_UP);
   
- -  rte_update_lock();
+ +  /* Storing prefilter routes as an explicit layer */
+ +  if (new && (c->in_keep & RIK_PREFILTER))
+ +    new->attrs = ea_lookup_tmp(new->attrs, 0, EALS_PREIMPORT);
+ +
+ +#if 0
+ +  debug("%s.%s -(prefilter)-> %s: %N ", c->proto->name, c->name, c->table->name, n);
+ +  if (new) ea_dump(new->attrs);
+ +  else debug("withdraw");
+ +  debug("\n");
+ +#endif
+ +
+ +  const struct filter *filter = c->in_filter;
+ +  struct channel_import_stats *stats = &c->import_stats;
+ +  struct mpls_fec *fec = NULL;
+ +
     if (new)
       {
- -      /* Create a temporary table node */
- -      nn = alloca(sizeof(net) + n->length);
- -      memset(nn, 0, sizeof(net) + n->length);
- -      net_copy(nn->n.addr, n);
+ +      new->net = n;
+ +      new->sender = c->in_req.hook;
   
- -      new->net = nn;
- -      new->sender = c;
+ +      int fr;
   
- -      stats->imp_updates_received++;
- -      if (!rte_validate(new))
+ +      stats->updates_received++;
+ +      if ((filter == FILTER_REJECT) ||
+ +      ((fr = f_run(filter, new, 0)) > F_ACCEPT))
         {
- -        rte_trace_in(D_FILTERS, c, new, "invalid");
- -        stats->imp_updates_invalid++;
- -        goto drop;
+ +        stats->updates_filtered++;
+ +        channel_rte_trace_in(D_FILTERS, c, new, "filtered out");
+ +
+ +        if (c->in_keep & RIK_REJECTED)
+ +          new->flags |= REF_FILTERED;
+ +        else
+ +          new = NULL;
         }
   
- -      if (filter == FILTER_REJECT)
- -      {
- -        stats->imp_updates_filtered++;
- -        rte_trace_in(D_FILTERS, c, new, "filtered out");
+ +      if (new && c->proto->mpls_channel)
+ +      if (mpls_handle_rte(c->proto->mpls_channel, n, new, &fec) < 0)
+ +        {
+ +          channel_rte_trace_in(D_FILTERS, c, new, "invalid");
+ +          stats->updates_invalid++;
+ +          new = NULL;
+ +        }
+ +
+ +      if (new)
+ +      {
+ +      new->attrs = ea_lookup_tmp(new->attrs,
+ +          (c->in_keep & RIK_PREFILTER) ? BIT32_ALL(EALS_PREIMPORT) : 0, EALS_FILTERED);
   
- -        if (! c->in_keep_filtered)
- -          goto drop;
+ +      if (net_is_flow(n))
+ +        rt_flowspec_resolve_rte(new, c);
+ +      else
+ +        rt_next_hop_resolve_rte(new);
+ +      }
   
- -        /* new is a private copy, i could modify it */
- -        new->flags |= REF_FILTERED;
- -      }
- -      else if (filter)
+ +      if (new && !rte_validate(c, new))
         {
- -        int fr = f_run(filter, &new, rte_update_pool, 0);
- -        if (fr > F_ACCEPT)
- -        {
- -          stats->imp_updates_filtered++;
- -          rte_trace_in(D_FILTERS, c, new, "filtered out");
+ +        channel_rte_trace_in(D_FILTERS, c, new, "invalid");
+ +        stats->updates_invalid++;
+ +        new = NULL;
+ +      }
+ +    }
+ +  else
+ +    stats->withdraws_received++;
   
- -          if (! c->in_keep_filtered)
- -            goto drop;
+ +  rte_import(&c->in_req, n, new, src);
   
- -          new->flags |= REF_FILTERED;
- -        }
- -      }
+ +  if (fec)
+ +  {
+ +    mpls_unlock_fec(fec);
+ +    DBGL( "Unlock FEC %p (rte_update %N)", fec, n);
+ +  }
+ +}
   
- -      if (p->mpls_map)
- -        {
- -        if (mpls_handle_rte(p->mpls_map, n, new, rte_update_pool, &fec) < 0)
- -          {
- -            rte_trace_in(D_FILTERS, c, new, "invalid");
- -            stats->imp_updates_invalid++;
- -            goto drop;
- -          }
- -      }
+ +void
+ +rte_import(struct rt_import_request *req, const net_addr *n, rte *new, struct rte_src *src)
+ +{
+ +  struct rt_import_hook *hook = req->hook;
+ +  if (!hook)
+ +  {
+ +    log(L_WARN "%s: Called rte_import without import hook", req->name);
+ +    return;
+ +  }
   
- -      if (!rta_is_cached(new->attrs)) /* Need to copy attributes */
- -      new->attrs = rta_lookup(new->attrs);
- -      new->flags |= REF_COW;
+ +  RT_LOCKED(hook->table, tab)
+ +  {
+ +    u32 bs = atomic_load_explicit(&tab->routes_block_size, memory_order_acquire);
   
- -      /* Use the actual struct network, not the dummy one */
- -      nn = net_get(c->table, n);
- -      new->net = nn;
- -    }
- -  else
+ +    struct netindex *i;
+ +    net *routes = atomic_load_explicit(&tab->routes, memory_order_acquire);
+ +    net *nn;
+ +    if (new)
       {
- -      stats->imp_withdraws_received++;
+ +      /* An update */
+ +      /* Set auxiliary values */
+ +      new->stale_cycle = hook->stale_set;
+ +      new->sender = hook;
   
- -      if (!(nn = net_find(c->table, n)) || !src)
- -      {
- -        stats->imp_withdraws_ignored++;
- -        rte_update_unlock();
- -        return;
- -      }
- -    }
+ +      /* Allocate the key structure */
+ +      i = net_get_index(tab->netindex, n);
+ +      new->net = i->addr;
   
- - recalc:
- -  /* And recalculate the best route */
- -  rte_recalculate(c, nn, new, src);
+ +      /* Block size update */
+ +      u32 nbs = bs;
+ +      while (i->index >= nbs)
+ +      nbs *= 2;
   
- -  if (p->mpls_map)
- -    mpls_handle_rte_cleanup(p->mpls_map, &fec);
+ +      if (nbs > bs)
+ +      {
+ +      net *nb = mb_alloc(tab->rp, nbs * sizeof *nb);
+ +      memcpy(&nb[0], routes, bs * sizeof *nb);
+ +      memset(&nb[bs], 0, (nbs - bs) * sizeof *nb);
+ +      ASSERT_DIE(atomic_compare_exchange_strong_explicit(
+ +            &tab->routes, &routes, nb,
+ +            memory_order_acq_rel, memory_order_relaxed));
+ +      ASSERT_DIE(atomic_compare_exchange_strong_explicit(
+ +            &tab->routes_block_size, &bs, nbs,
+ +            memory_order_acq_rel, memory_order_relaxed));
+ +      ASSERT_DIE(atomic_compare_exchange_strong_explicit(
+ +            &tab->export_all.max_feed_index, &bs, nbs,
+ +            memory_order_acq_rel, memory_order_relaxed));
+ +      ASSERT_DIE(atomic_compare_exchange_strong_explicit(
+ +            &tab->export_best.max_feed_index, &bs, nbs,
+ +            memory_order_acq_rel, memory_order_relaxed));
+ +
+ +      synchronize_rcu();
+ +      mb_free(routes);
+ +
+ +      routes = nb;
+ +      }
   
- -  rte_update_unlock();
- -  return;
+ +      /* Update table tries */
+ +      struct f_trie *trie = atomic_load_explicit(&tab->trie, memory_order_relaxed);
+ +      if (trie)
+ +      trie_add_prefix(trie, i->addr, i->addr->pxlen, i->addr->pxlen);
+ +
+ +      if (tab->trie_new)
+ +      trie_add_prefix(tab->trie_new, i->addr, i->addr->pxlen, i->addr->pxlen);
+ +    }
+ +    else if ((i = net_find_index(tab->netindex, n)) && (i->index < bs))
+ +      /* Found an block where we can withdraw from */
+ +      ;
+ +    else
+ +    {
+ +      /* No route for this net is present at all. Ignore right now. */
+ +      req->hook->stats.withdraws_ignored++;
+ +      if (req->trace_routes & D_ROUTES)
+ +      log(L_TRACE "%s > ignored %N withdraw", req->name, n);
+ +      return;
+ +    }
   
- - drop:
- -  rte_free(new);
- -  new = NULL;
- -  if (nn = net_find(c->table, n))
- -    goto recalc;
+ +    /* Resolve the net structure */
+ +    nn = &routes[i->index];
   
- -  rte_update_unlock();
+ +    /* Recalculate the best route. */
+ +    rte_recalculate(tab, hook, i, nn, new, src);
+ +  }
   }
   
- -/* Independent call to rte_announce(), used from next hop
- -   recalculation, outside of rte_update(). new must be non-NULL */
- -static inline void
- -rte_announce_i(rtable *tab, uint type, net *net, rte *new, rte *old,
- -             rte *new_best, rte *old_best)
+ +/*
+ + *    Feeding
+ + */
+ +
+ +static net *
+ +rt_net_feed_get_net(struct rtable_reading *tr, uint index)
   {
- -  rte_update_lock();
- -  rte_announce(tab, type, net, new, old, new_best, old_best);
- -  rte_update_unlock();
+ +  /* Get the route block from the table */
+ +  net *routes = atomic_load_explicit(&tr->t->routes, memory_order_acquire);
+ +  u32 bs = atomic_load_explicit(&tr->t->routes_block_size, memory_order_acquire);
+ +
+ +  /* Nothing to actually feed */
+ +  if (index >= bs)
+ +    return NULL;
+ +
+ +  /* We have a net to feed! */
+ +  return &routes[index];
   }
   
- -static inline void
- -rte_discard(rte *old) /* Non-filtered route deletion, used during garbage collection */
+ +static const struct rt_pending_export *
+ +rt_net_feed_validate_first(
+ +    struct rtable_reading *tr,
+ +    const struct rt_pending_export *first_in_net,
+ +    const struct rt_pending_export *last_in_net,
+ +    const struct rt_pending_export *first)
   {
- -  rte_update_lock();
- -  rte_recalculate(old->sender, old->net, NULL, old->src);
- -  rte_update_unlock();
+ +  /* Inconsistent input */
+ +  if (!first_in_net != !last_in_net)
+ +    RT_READ_RETRY(tr);
+ +
+ +  if (!first)
+ +    return first_in_net;
+ +
+ +  /* Export item validity check: we must find it between first_in_net and last_in_net */
+ +  const struct rt_pending_export *rpe = first_in_net;
+ +  while (rpe)
+ +    if (rpe == first)
+ +      return first;
+ +    else if (rpe == last_in_net)
+ +      /* Got to the end without finding the beginning */
+ +      break;
+ +    else
+ +      rpe = atomic_load_explicit(&rpe->next, memory_order_acquire);
+ +
+ +  /* Not found, inconsistent export, retry */
+ +  RT_READ_RETRY(tr);
   }
   
- -/* Modify existing route by protocol hook, used for long-lived graceful restart */
- -static inline void
- -rte_modify(rte *old)
+ +static struct rt_export_feed *
- rt_net_feed_index(struct rtable_reading *tr, net *n, _Bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_pending_export *first)
++rt_net_feed_index(struct rtable_reading *tr, net *n, bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_pending_export *first)
   {
- -  rte_update_lock();
+ +  /* Get the feed itself. It may change under our hands tho. */
+ +  struct rt_pending_export *first_in_net, *last_in_net;
+ +  first_in_net = atomic_load_explicit(&n->all.first, memory_order_acquire);
+ +  last_in_net = atomic_load_explicit(&n->all.last, memory_order_acquire);
+ +
+ +  first = rt_net_feed_validate_first(tr, first_in_net, last_in_net, first);
+ +
+ +  /* Count the elements */
+ +  uint rcnt = rte_feed_count(tr, n);
+ +  uint ecnt = 0;
+ +  uint ocnt = 0;
+ +  for (const struct rt_pending_export *rpe = first; rpe;
+ +      rpe = atomic_load_explicit(&rpe->next, memory_order_acquire))
+ +  {
+ +    ecnt++;
+ +    if (rpe->it.old)
+ +      ocnt++;
+ +  }
+ +
+ +  if (ecnt) {
+ +    const net_addr *a = (first->it.new ?: first->it.old)->net;
+ +    if (prefilter && !prefilter(f, a))
+ +      return NULL;
+ +  }
+ +
+ +  struct rt_export_feed *feed = NULL;
   
- -  rte *new = old->sender->proto->rte_modify(old, rte_update_pool);
- -  if (new != old)
+ +  if (rcnt || ocnt || ecnt)
     {
- -    if (new)
+ +    if (!ecnt && prefilter && !prefilter(f, NET_READ_BEST_ROUTE(tr, n)->rte.net))
+ +      return NULL;
+ +
+ +    feed = rt_alloc_feed(rcnt+ocnt, ecnt);
+ +
+ +    if (rcnt)
+ +      rte_feed_obtain_copy(tr, n, feed->block, rcnt);
+ +
+ +    if (ecnt)
       {
- -      if (!rta_is_cached(new->attrs))
- -      new->attrs = rta_lookup(new->attrs);
- -      new->flags = (old->flags & ~REF_MODIFY) | REF_COW;
+ +      uint e = 0;
+ +      uint rpos = rcnt;
+ +      for (const struct rt_pending_export *rpe = first; rpe;
+ +        rpe = atomic_load_explicit(&rpe->next, memory_order_acquire))
+ +      if (e >= ecnt)
+ +        RT_READ_RETRY(tr);
+ +      else
+ +      {
+ +        feed->exports[e++] = rpe->it.seq;
+ +
+ +        /* Copy also obsolete routes */
+ +        if (rpe->it.old)
+ +        {
+ +          ASSERT_DIE(rpos < rcnt + ocnt);
+ +          feed->block[rpos++] = *rpe->it.old;
+ +          ea_free_later(ea_ref(rpe->it.old->attrs));
+ +        }
+ +      }
+ +
+ +      ASSERT_DIE(e == ecnt);
       }
   
- -    rte_recalculate(old->sender, old->net, new, old->src);
+ +    feed->ni = NET_TO_INDEX(feed->block[0].net);
     }
   
- -  rte_update_unlock();
+ +  /* Check that it indeed didn't change and the last export is still the same. */
+ +  if (
+ +      (first_in_net != atomic_load_explicit(&n->all.first, memory_order_acquire))
+ +   || (last_in_net != atomic_load_explicit(&n->all.last, memory_order_acquire)))
+ +    RT_READ_RETRY(tr);
+ +
+ +  return feed;
   }
   
- -/* Check rtable for best route to given net whether it would be exported do p */
- -int
- -rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter)
+ +static struct rt_export_feed *
- rt_net_feed_internal(struct rtable_reading *tr, u32 index, _Bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_pending_export *first)
++rt_net_feed_internal(struct rtable_reading *tr, u32 index, bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_pending_export *first)
   {
- -  struct proto *p = c->proto;
- -  net *n = net_find(t, a);
- -  rte *rt = n ? n->routes : NULL;
+ +  net *n = rt_net_feed_get_net(tr, index);
+ +  if (!n)
+ +    return &rt_feed_index_out_of_range;
   
- -  if (!rte_is_valid(rt))
- -    return 0;
+ +  return rt_net_feed_index(tr, n, prefilter, f, first);
+ +}
   
- -  rte_update_lock();
+ +struct rt_export_feed *
+ +rt_net_feed(rtable *t, const net_addr *a, const struct rt_pending_export *first)
+ +{
+ +  RT_READ(t, tr);
+ +  const struct netindex *ni = net_find_index(tr->t->netindex, a);
+ +  return ni ? rt_net_feed_internal(tr, ni->index, NULL, NULL, first) : NULL;
+ +}
   
- -  /* Rest is stripped down export_filter() */
- -  int v = p->preexport ? p->preexport(c, rt) : 0;
- -  if (v == RIC_PROCESS)
- -    v = (f_run(filter, &rt, rte_update_pool, FF_SILENT) <= F_ACCEPT);
+ +static struct rt_export_feed *
- rt_feed_net_all(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, _Bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first)
++rt_feed_net_all(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first)
+ +{
+ +  RT_READ_ANCHORED(SKIP_BACK(rtable, export_all, e), tr, u);
+ +  return rt_net_feed_internal(tr, index, prefilter, f, SKIP_BACK(const struct rt_pending_export, it, _first));
+ +}
   
- -  /* Discard temporary rte */
- -  if (rt != n->routes)
- -    rte_free(rt);
+ +rte
+ +rt_net_best(rtable *t, const net_addr *a)
+ +{
+ +  rte rt = {};
   
- -  rte_update_unlock();
+ +  RT_READ(t, tr);
   
- -  return v > 0;
+ +  struct netindex *i = net_find_index(t->netindex, a);
+ +  net *n = i ? net_find(tr, i) : NULL;
+ +  if (!n)
+ +    return rt;
+ +
+ +  struct rte_storage *e = NET_READ_BEST_ROUTE(tr, n);
+ +  if (!e || !rte_is_valid(&e->rte))
+ +    return rt;
+ +
+ +  ASSERT_DIE(e->rte.net == i->addr);
+ +  ea_free_later(ea_ref(e->rte.attrs));
+ +  return RTE_COPY(e);
   }
   
- rt_feed_net_best(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, _Bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first)
+ +static struct rt_export_feed *
++rt_feed_net_best(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first)
+ +{
+ +  SKIP_BACK_DECLARE(rtable, t, export_best, e);
+ +  SKIP_BACK_DECLARE(const struct rt_pending_export, first, it, _first);
+ +
+ +  RT_READ_ANCHORED(t, tr, u);
+ +
+ +  net *n = rt_net_feed_get_net(tr, index);
+ +  if (!n)
+ +    return &rt_feed_index_out_of_range;
+ +    /* No more to feed, we are fed up! */
+ +
+ +  const struct rt_pending_export *first_in_net, *last_in_net;
+ +  first_in_net = atomic_load_explicit(&n->best.first, memory_order_acquire);
+ +  last_in_net = atomic_load_explicit(&n->best.last, memory_order_acquire);
+ +  first = rt_net_feed_validate_first(tr, first_in_net, last_in_net, first);
+ +
+ +  uint ecnt = 0;
+ +  for (const struct rt_pending_export *rpe = first; rpe;
+ +      rpe = atomic_load_explicit(&rpe->next, memory_order_acquire))
+ +    ecnt++;
+ +
+ +  if (ecnt) {
+ +    const net_addr *a = (first->it.new ?: first->it.old)->net;
+ +    if (prefilter && !prefilter(f, a))
+ +      return NULL;
+ +  }
   
- -/**
- - * rt_refresh_begin - start a refresh cycle
- - * @t: related routing table
- - * @c related channel
- - *
- - * This function starts a refresh cycle for given routing table and announce
- - * hook. The refresh cycle is a sequence where the protocol sends all its valid
- - * routes to the routing table (by rte_update()). After that, all protocol
- - * routes (more precisely routes with @c as @sender) not sent during the
- - * refresh cycle but still in the table from the past are pruned. This is
+ +  struct rte_storage *best = NET_READ_BEST_ROUTE(tr, n);
+ +
+ +  if (!ecnt && (!best || prefilter && !prefilter(f, best->rte.net)))
+ +    return NULL;
+ +
+ +  struct rt_export_feed *feed = rt_alloc_feed(!!best, ecnt);
+ +  if (best)
+ +  {
+ +    feed->block[0] = best->rte;
+ +    feed->ni = NET_TO_INDEX(best->rte.net);
+ +  }
+ +  else
+ +    feed->ni = NET_TO_INDEX((first->it.new ?: first->it.old)->net);
+ +
+ +  if (ecnt)
+ +  {
+ +    uint e = 0;
+ +    for (const struct rt_pending_export *rpe = first; rpe;
+ +      rpe = atomic_load_explicit(&rpe->next, memory_order_acquire))
+ +      if (e >= ecnt)
+ +      RT_READ_RETRY(tr);
+ +      else
+ +      feed->exports[e++] = rpe->it.seq;
+ +
+ +    ASSERT_DIE(e == ecnt);
+ +  }
+ +
+ +  /* Check that it indeed didn't change and the last export is still the same. */
+ +  if (
+ +      (first_in_net != atomic_load_explicit(&n->best.first, memory_order_acquire))
+ +      || (last_in_net != atomic_load_explicit(&n->best.last, memory_order_acquire)))
+ +    RT_READ_RETRY(tr);
+ +
+ +  /* And we're finally done */
+ +  return feed;
+ +}
+ +
+ +
+ +/* Check rtable for best route to given net whether it would be exported do p */
+ +int
+ +rt_examine(rtable *t, net_addr *a, struct channel *c, const struct filter *filter)
+ +{
+ +  rte rt = rt_net_best(t, a);
+ +
+ +  int v = c->proto->preexport ? c->proto->preexport(c, &rt) : 0;
+ +  if (v == RIC_PROCESS)
+ +    v = (f_run(filter, &rt, FF_SILENT) <= F_ACCEPT);
+ +
+ +  return v > 0;
+ +}
+ +
+ +static inline void
+ +rt_set_import_state(struct rt_import_hook *hook, u8 state)
+ +{
+ +  hook->last_state_change = current_time();
+ +  hook->import_state = state;
+ +
+ +  CALL(hook->req->log_state_change, hook->req, state);
+ +}
+ +
+ +void
+ +rt_request_import(rtable *t, struct rt_import_request *req)
+ +{
+ +  RT_LOCKED(t, tab)
+ +  {
+ +    rt_lock_table(tab);
+ +
+ +    struct rt_import_hook *hook = req->hook = mb_allocz(tab->rp, sizeof(struct rt_import_hook));
+ +
+ +    DBG("Lock table %s for import %p req=%p uc=%u\n", tab->name, hook, req, tab->use_count);
+ +
+ +    hook->req = req;
+ +    hook->table = t;
+ +
+ +    rt_set_import_state(hook, TIS_UP);
+ +    add_tail(&tab->imports, &hook->n);
+ +  }
+ +}
+ +
+ +void
+ +rt_stop_import(struct rt_import_request *req, void (*stopped)(struct rt_import_request *))
+ +{
+ +  ASSERT_DIE(req->hook);
+ +  struct rt_import_hook *hook = req->hook;
+ +
+ +  RT_LOCKED(hook->table, tab)
+ +  {
+ +    rt_set_import_state(hook, TIS_STOP);
+ +    hook->stopped = stopped;
+ +
+ +    rt_refresh_trace(tab, hook, "stop import");
+ +
+ +    /* Cancel table rr_counter */
+ +    if (hook->stale_set != hook->stale_pruned)
+ +      tab->rr_counter -= ((int) hook->stale_set - (int) hook->stale_pruned);
+ +
+ +    tab->rr_counter++;
+ +
+ +    hook->stale_set = hook->stale_pruned = hook->stale_pruning = hook->stale_valid = 0;
+ +
+ +    rt_schedule_prune(tab);
+ +  }
+ +}
+ +
+ +
+ +/**
+ + * rt_refresh_begin - start a refresh cycle
+ + * @t: related routing table
+ + * @c related channel
+ + *
+ + * This function starts a refresh cycle for given routing table and announce
+ + * hook. The refresh cycle is a sequence where the protocol sends all its valid
+ + * routes to the routing table (by rte_update()). After that, all protocol
+ + * routes (more precisely routes with @c as @sender) not sent during the
+ + * refresh cycle but still in the table from the past are pruned. This is
    * implemented by marking all related routes as stale by REF_STALE flag in
    * rt_refresh_begin(), then marking all related stale routes with REF_DISCARD
    * flag in rt_refresh_end() and then removing such routes in the prune loop.
@@@ -2907,31 -1857,20 +2907,31 @@@ rt_flowspec_link(rtable *src_pub, rtabl
   void
   rt_flowspec_unlink(rtable *src, rtable *dst)
   {
- -  struct rt_flowspec_link *ln = rt_flowspec_find_link(src, dst);
+ +  birdloop_enter(dst->loop);
   
-   _Bool unlock_dst = 0;
- -  ASSERT(ln && (ln->uc > 0));
++  bool unlock_dst = 0;
   
- -  ln->uc--;
- -
- -  if (!ln->uc)
+ +  struct rt_flowspec_link *ln;
+ +  RT_LOCKED(src, t)
     {
- -    rem_node(&ln->n);
- -    mb_free(ln);
+ +    ln = rt_flowspec_find_link(t, dst);
   
- -    rt_unlock_table(src);
- -    rt_unlock_table(dst);
+ +    ASSERT(ln && (ln->uc > 0));
+ +
+ +    if (!--ln->uc)
+ +    {
+ +      rt_flowspec_link_rem_node(&t->flowspec_links, ln);
+ +      rtex_export_unsubscribe(&ln->req);
+ +      ev_postpone(&ln->event);
+ +      mb_free(ln);
+ +      unlock_dst = 1;
+ +    }
     }
+ +
+ +  if (unlock_dst)
+ +    rt_unlock_table(dst);
+ +
+ +  birdloop_leave(dst->loop);
   }
   
   static void
@@@ -3252,86 -2004,10 +3252,86 @@@ voi
   rt_init(void)
   {
     rta_init();
- -  rt_table_pool = rp_new(&root_pool, "Routing tables");
- -  rte_update_pool = lp_new_default(rt_table_pool);
- -  rte_slab = sl_new(rt_table_pool, sizeof(rte));
+ +  rt_table_pool = rp_new(&root_pool, the_bird_domain.the_bird, "Routing tables");
     init_list(&routing_tables);
- static _Bool
+ +  init_list(&deleted_routing_tables);
+ +  ev_init_list(&rt_cork.queue, &main_birdloop, "Route cork release");
+ +  rt_cork.dom = DOMAIN_NEW_RCU_SYNC(resource);
+ +  idm_init(&rtable_idm, rt_table_pool, 256);
+ +
+ +  ea_register_init(&ea_roa_aggregated);
+ +}
+ +
-     _Bool stale = (s->import_state == TIS_FLUSHING);
++static bool
+ +rt_prune_net(struct rtable_private *tab, struct network *n)
+ +{
+ +  NET_WALK_ROUTES(tab, n, ep, e)
+ +  {
+ +    ASSERT_DIE(!(e->flags & REF_OBSOLETE));
+ +    struct rt_import_hook *s = e->rte.sender;
+ +
++    bool stale = (s->import_state == TIS_FLUSHING);
+ +
+ +    if (!stale)
+ +    {
+ +
+ +    /*
+ +     * The range of 0..256 is split by s->stale_* like this:
+ +     *
+ +     *     pruned    pruning     valid      set
+ +     *       |          |          |         |
+ +     * 0     v          v          v         v       256
+ +     * |...........................+++++++++++........|
+ +     *
+ +     * We want to drop everything outside the marked range, thus
+ +     *            (e->rte.stale_cycle < s->stale_valid) ||
+ +     *            (e->rte.stale_cycle > s->stale_set))
+ +     *          looks right.
+ +     *
+ +     * But the pointers may wrap around, and in the following situation, all the routes get pruned:
+ +     *
+ +     *      set         pruned    pruning     valid
+ +     *       |            |          |          |
+ +     * 0     v            v          v          v    256
+ +     * |++++++..................................++++++|
+ +     *
+ +     * In that case, we want
+ +     *            (e->rte.stale_cycle > s->stale_valid) ||
+ +     *            (e->rte.stale_cycle < s->stale_set))
+ +     *
+ +     * Full logic table:
+ +     *
+ +     *           permutation   |  result  |  (S < V) + (S < SC) + (SC < V)
+ +     *        -----------------+----------+---------------------------------
+ +     *   SC <   V <=  S  |   prune  |     0    +    0     +     1    =  1
+ +     *    S <  SC <   V  |   prune  |     1    +    1     +     1    =  3
+ +     *    V <=  S <  SC  |   prune  |     0    +    1     +     0    =  1
+ +     *   SC <=  S <   V  |    keep  |     1    +    0     +     1    =  2
+ +     *    V <= SC <=  S  |    keep  |     0    +    0     +     0    =  0
+ +     *    S <   V <= SC  |    keep  |     1    +    1     +     0    =  2
+ +     *
+ +     * Now the following code hopefully makes sense.
+ +     */
+ +
+ +      int sv = (s->stale_set < s->stale_valid);
+ +      int ssc = (s->stale_set < e->rte.stale_cycle);
+ +      int scv = (e->rte.stale_cycle < s->stale_valid);
+ +      stale = (sv + ssc + scv) & 1;
+ +    }
+ +
+ +    /* By the C standard, either the importer is flushing and stale_perm is 1,
+ +     * or by the table above, stale_perm is between 0 and 3, where even values
+ +     * say "keep" and odd values say "prune". */
+ +
+ +    if (stale)
+ +    {
+ +      /* Announce withdrawal */
+ +      struct netindex *i = RTE_GET_NETINDEX(&e->rte);
+ +      rte_recalculate(tab, e->rte.sender, i, n, NULL, e->rte.src);
+ +      return 1;
+ +    }
+ +  }
+ +  return 0;
   }
   
   
@@@ -3925,234 -2539,78 +3925,234 @@@ rt_flowspec_update_rte(struct rtable_pr
   #endif
   }
   
+ +static inline void
+ +rt_flowspec_resolve_rte(rte *r, struct channel *c)
+ +{
+ +#ifdef CONFIG_BGP
+ +  enum flowspec_valid valid, old = rt_get_flowspec_valid(r);
+ +  struct bgp_channel *bc = (struct bgp_channel *) c;
   
- -static inline int
- -rt_next_hop_update_net(rtable *tab, net *n)
+ +  if (        (rt_get_source_attr(r) == RTS_BGP)
+ +     && (c->class == &channel_bgp)
+ +     && (bc->base_table))
+ +  {
+ +    SKIP_BACK_DECLARE(struct bgp_proto, p, p, bc->c.proto);
+ +    RT_LOCKED(c->in_req.hook->table, tab)
+ +      valid = rt_flowspec_check(
+ +        bc->base_table, tab,
+ +        r->net, r->attrs, p->is_interior);
+ +  }
+ +  else
+ +    valid = FLOWSPEC_UNKNOWN;
+ +
+ +  if (valid == old)
+ +    return;
+ +
+ +  if (valid == FLOWSPEC_UNKNOWN)
+ +    ea_unset_attr(&r->attrs, 0, &ea_gen_flowspec_valid);
+ +  else
+ +    ea_set_attr_u32(&r->attrs, &ea_gen_flowspec_valid, 0, valid);
+ +#endif
+ +}
+ +
+ +static inline void
+ +rt_next_hop_update_net(struct rtable_private *tab, struct netindex *ni, net *n)
   {
- -  rte **k, *e, *new, *old_best, **new_best;
- -  int count = 0;
- -  int free_old_best = 0;
+ +  uint count = 0;
+ +  int is_flow = net_val_match(tab->addr_type, NB_FLOW);
   
- -  old_best = n->routes;
+ +  struct rte_storage *old_best = NET_BEST_ROUTE(tab, n);
     if (!old_best)
- -    return 0;
+ +    return;
+ +
+ +  NET_WALK_ROUTES(tab, n, ep, e)
+ +    count++;
+ +
+ +  if (!count)
+ +    return;
+ +
+ +  struct rte_multiupdate {
+ +    struct rte_storage *old, *new_stored;
+ +    rte new;
+ +  } *updates = tmp_allocz(sizeof(struct rte_multiupdate) * (count+1));
+ +
+ +  uint pos = 0;
+ +  NET_WALK_ROUTES(tab, n, ep, e)
+ +    updates[pos++].old = e;
+ +
+ +  uint mod = 0;
+ +  if (is_flow)
+ +    for (uint i = 0; i < pos; i++)
+ +      mod += rt_flowspec_update_rte(tab, &updates[i].old->rte, &updates[i].new);
+ +
+ +  else
+ +    for (uint i = 0; i < pos; i++)
+ +      mod += rt_next_hop_update_rte(&updates[i].old->rte, &updates[i].new);
+ +
+ +  if (!mod)
+ +    return;
+ +
+ +  /* We add a spinlock sentinel to the beginning */
+ +  struct rte_storage local_sentinel = {
+ +    .flags = REF_OBSOLETE,
+ +    .next = old_best,
+ +  };
+ +  atomic_store_explicit(&n->routes, &local_sentinel, memory_order_release);
   
- -  for (k = &n->routes; e = *k; k = &e->next)
+ +  /* Now we mark all the old routes obsolete */
+ +  for (uint i = 0; i < pos; i++)
+ +    if (updates[i].new.attrs)
+ +      updates[i].old->flags |= REF_OBSOLETE;
+ +
+ +  /* Wait for readers */
+ +  synchronize_rcu();
+ +
+ +  /* And now we go backwards to keep the list properly linked */
+ +  struct rte_storage *next = NULL;
+ +  for (int i = pos - 1; i >= 0; i--)
     {
- -    if (!net_is_flow(n->n.addr))
- -      new = rt_next_hop_update_rte(tab, e);
+ +    struct rte_storage *this;
+ +    if (updates[i].new.attrs)
+ +    {
+ +      rte *new = &updates[i].new;
+ +      new->lastmod = current_time();
+ +      new->id = hmap_first_zero(&tab->id_map);
+ +      hmap_set(&tab->id_map, new->id);
+ +      this = updates[i].new_stored = rte_store(new, ni, tab);
+ +    }
       else
- -      new = rt_flowspec_update_rte(tab, e);
+ +      this = updates[i].old;
   
- -    if (new)
- -      {
- -      *k = new;
+ +    atomic_store_explicit(&this->next, next, memory_order_release);
+ +    next = this;
+ +  }
   
- -      rte_trace_in(D_ROUTES, new->sender, new, "updated");
- -      rte_announce_i(tab, RA_ANY, n, new, e, NULL, NULL);
+ +  /* Add behind the sentinel */
+ +  atomic_store_explicit(&local_sentinel.next, next, memory_order_release);
   
- -      /* Call a pre-comparison hook */
+ +  /* Call the pre-comparison hooks */
+ +  for (uint i = 0; i < pos; i++)
+ +    if (updates[i].new_stored)
+ +      {
         /* Not really an efficient way to compute this */
- -      if (e->src->proto->rte_recalculate)
- -        e->src->proto->rte_recalculate(tab, n, new, e, NULL);
+ +      if (updates[i].old->rte.src->owner->rte_recalculate)
+ +        updates[i].old->rte.src->owner->rte_recalculate(tab, n, updates[i].new_stored, updates[i].old, old_best);
+ +      }
   
- -      if (e != old_best)
- -        rte_free_quick(e);
- -      else /* Freeing of the old best rte is postponed */
- -        free_old_best = 1;
+ +  /* Find the new best route */
+ +  uint best_pos = 0;
+ +  struct rte_storage *new_best = updates[0].new_stored ?: updates[0].old;
   
- -      e = new;
- -      count++;
- -      }
+ +  for (uint i = 1; i < pos; i++)
+ +  {
+ +    struct rte_storage *s = updates[i].new_stored ?: updates[i].old;
+ +    if (rte_better(&s->rte, &new_best->rte))
+ +    {
+ +      best_pos = i;
+ +      new_best = s;
+ +    }
     }
   
- -  if (!count)
- -    return 0;
+ +  /* Relink the new best route to the first position */
+ +  struct rte_storage * _Atomic *best_prev;
+ +  if (best_pos)
+ +    best_prev = &(updates[best_pos-1].new_stored ?: updates[best_pos-1].old)->next;
+ +  else
+ +    best_prev = &local_sentinel.next;
   
- -  /* Find the new best route */
- -  new_best = NULL;
- -  for (k = &n->routes; e = *k; k = &e->next)
+ +  /* Unlink from the original place */
+ +  atomic_store_explicit(best_prev,
+ +      atomic_load_explicit(&new_best->next, memory_order_relaxed),
+ +      memory_order_release);
+ +
+ +  /* Link the new best route in front of the remaining routes */
+ +  atomic_store_explicit(&new_best->next,
+ +      atomic_load_explicit(&local_sentinel.next, memory_order_relaxed),
+ +      memory_order_release);
+ +
+ +  /* Now we have to announce the routes the right way, to not cause any
+ +   * strange problems with consistency. */
+ +
+ +  ASSERT_DIE(updates[0].old == old_best);
+ +
+ +  /* Find new best route original position */
+ +  uint nbpos = ~0;
+ +  for (uint i=0; i<count; i++)
+ +    if ((updates[i].new_stored == new_best) || (updates[i].old == new_best))
       {
- -      if (!new_best || rte_better(e, *new_best))
- -      new_best = k;
+ +      nbpos = i;
+ +      break;
       }
+ +  ASSERT_DIE(~nbpos);
   
- -  /* Relink the new best route to the first position */
- -  new = *new_best;
- -  if (new != n->routes)
+ +  struct rt_pending_export *best_rpe =
+ +    (new_best != old_best) ?
+ +    rte_announce_to(&tab->export_best, &n->best, &new_best->rte, &old_best->rte)
+ +    : NULL;
+ +
+ +  uint total = 0;
+ +  u64 last_seq = 0;
+ +
+ +  /* Announce the changes */
+ +  for (uint i=0; i<count; i++)
+ +  {
+ +    /* Not changed at all */
+ +    if (!updates[i].new_stored)
+ +      continue;
+ +
+ +    struct rt_pending_export *this_rpe =
+ +      rte_announce_to(&tab->export_all, &n->all,
+ +        &updates[i].new_stored->rte, &updates[i].old->rte);
+ +
+ +    ASSERT_DIE(this_rpe);
-     _Bool nb = (new_best->rte.src == updates[i].new.src), ob = (i == 0);
++    bool nb = (new_best->rte.src == updates[i].new.src), ob = (i == 0);
+ +    char info[96];
+ +    char best_indicator[2][2] = { { ' ', '+' }, { '-', '=' } };
+ +    bsnprintf(info, sizeof info, "autoupdated [%cbest]", best_indicator[ob][nb]);
+ +
+ +    rt_rte_trace_in(D_ROUTES, updates[i].new.sender->req, &updates[i].new, info);
+ +
+ +    /* Double announcement of this specific route */
+ +    if (ob && best_rpe)
       {
- -      *new_best = new->next;
- -      new->next = n->routes;
- -      n->routes = new;
+ +      ASSERT_DIE(best_rpe->it.old == &updates[i].old->rte);
+ +      ASSERT_DIE(!best_rpe->seq_all);
+ +      best_rpe->seq_all = this_rpe->it.seq;
       }
+ +    else
+ +      last_seq = this_rpe->it.seq;
   
- -  /* Announce the new best route */
- -  if (new != old_best)
- -    rte_trace_in(D_ROUTES, new->sender, new, "updated [best]");
+ +    total++;
+ +  }
   
- -  /* Propagate changes */
- -  rte_announce_i(tab, RA_UNDEF, n, NULL, NULL, n->routes, old_best);
+ +  if (best_rpe && !best_rpe->seq_all)
+ +  {
+ +    ASSERT_DIE(!updates[0].new_stored);
+ +    best_rpe->seq_all = last_seq;
+ +  }
   
- -  if (free_old_best)
- -    rte_free_quick(old_best);
+ +  /* Now we can finally release the changes back into the table */
+ +  atomic_store_explicit(&n->routes, new_best, memory_order_release);
   
- -  return count;
+ +  return;
+ +}
+ +
+ +/*
+ + * Callback resuming a corked next hop update.
+ + *
+ + * The table is recovered from the callback embedded in it
+ + * (priv.nhu_uncork.cb) and locked for the duration of the body.
+ + * The value parked in nhu_corked is moved back to nhu_state,
+ + * the table's nhu_event is sent to its loop and rt_unlock_table()
+ + * is called (presumably dropping the reference taken when the update
+ + * was corked -- the corking side is not visible here).
+ + */
+ +static void
+ +rt_nhu_uncork(callback *cb)
+ +{
+ +  RT_LOCKED(SKIP_BACK(rtable, priv.nhu_uncork.cb, cb), tab)
+ +  {
+ +    /* We must be corked and no update may be running or scheduled */
+ +    ASSERT_DIE(tab->nhu_corked);
+ +    ASSERT_DIE(tab->nhu_state == 0);
+ +
+ +    /* Reset the state */
+ +    tab->nhu_state = tab->nhu_corked;
+ +    tab->nhu_corked = 0;
+ +    rt_trace(tab, D_STATES, "Next hop updater uncorked");
+ +
+ +    ev_send_loop(tab->loop, tab->nhu_event);
+ +    rt_unlock_table(tab);
+ +  }
   }
   
   static void
@@@ -4454,110 -3070,123 +4454,110 @@@ rt_check_cork_high(struct rtable_privat
     }
   }
   
- -void
- -rt_prune_sync(rtable *t, int all)
+ +
+ +static int
+ +rt_reconfigure(struct rtable_private *tab, struct rtable_config *new, struct rtable_config *old)
   {
- -  struct fib_iterator fit;
+ +  if ((new->addr_type != old->addr_type) ||
+ +      (new->sorted != old->sorted) ||
+ +      (new->trie_used != old->trie_used))
+ +    return 0;
   
- -  FIB_ITERATE_INIT(&fit, &t->fib);
+ +  ASSERT_DIE(new->master.setup == old->master.setup);
+ +  ASSERT_DIE(new->master.stop == old->master.stop);
   
- -again:
- -  FIB_ITERATE_START(&t->fib, &fit, net, n)
- -  {
- -    rte *e, **ee = &n->routes;
+ +  DBG("\t%s: same\n", new->name);
+ +  new->table = RT_PUB(tab);
+ +  tab->name = new->name;
+ +  tab->config = new;
+ +  tab->debug = new->debug;
+ +  tab->export_all.trace_routes = tab->export_best.trace_routes = new->debug;
   
- -    while (e = *ee)
- -    {
- -      if (all || (e->flags & (REF_STALE | REF_DISCARD)))
- -      {
- -      *ee = e->next;
- -      rte_free_quick(e);
- -      t->rt_count--;
- -      }
- -      else
- -      ee = &e->next;
- -    }
+ +  if (tab->hostcache)
+ +    tab->hostcache->req.trace_routes = new->debug;
   
- -    if (all || !n->routes)
- -    {
- -      FIB_ITERATE_PUT(&fit);
- -      fib_delete(&t->fib, n);
- -      goto again;
- -    }
- -  }
- -  FIB_ITERATE_END;
- -}
+ +  WALK_TLIST(rt_flowspec_link, ln, &tab->flowspec_links)
+ +    ln->req.trace_routes = new->debug;
   
+ +  tab->cork_threshold = new->cork_threshold;
   
- -/*
- - *    Export table
- - */
+ +  if (new->cork_threshold.high != old->cork_threshold.high)
+ +    rt_check_cork_high(tab);
   
- -int
- -rte_update_out(struct channel *c, const net_addr *n, rte *new, rte *old0, int refeed)
- -{
- -  struct rtable *tab = c->out_table;
- -  struct rte_src *src;
- -  rte *old, **pos;
- -  net *net;
+ +  if (new->cork_threshold.low != old->cork_threshold.low)
+ +    rt_check_cork_low(tab);
   
- -  if (new)
- -  {
- -    net = net_get(tab, n);
- -    src = new->src;
+ +  if (tab->export_digest && (
+ +      (new->digest_settle.min != tab->export_digest->settle.cf.min)
+ +    ||  (new->digest_settle.max != tab->export_digest->settle.cf.max)))
+ +    tab->export_digest->settle.cf = new->digest_settle;
   
- -    if (!rta_is_cached(new->attrs))
- -      new->attrs = rta_lookup(new->attrs);
- -  }
- -  else
- -  {
- -    net = net_find(tab, n);
- -    src = old0->src;
+ +  return 1;
+ +}
   
- -    if (!net)
- -      goto drop_withdraw;
- -  }
+ +/*
+ + * Look up @name among the symbols of config @cf.
+ + * Returns the table config attached to the symbol when the symbol exists
+ + * and is of class SYM_TABLE; returns NULL otherwise.
+ + */
+ +static struct rtable_config *
+ +rt_find_table_config(struct config *cf, char *name)
+ +{
+ +  struct symbol *sym = cf_find_symbol(cf, name);
+ +  return (sym && (sym->class == SYM_TABLE)) ? sym->table : NULL;
+ +}
+ +
+ +/**
+ + * rt_commit - commit new routing table configuration
+ + * @new: new configuration
+ + * @old: original configuration or %NULL if it's boot time config
+ + *
+ + * Scan differences between @old and @new configuration and modify
+ + * the routing tables according to these changes. If @new defines a
+ + * previously unknown table, create it, if it omits a table existing
+ + * in @old, schedule it for deletion (it gets deleted when all protocols
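+ + * disconnect from it by calling rt_unlock_table()). If it exists in both
+ + * configurations, it is reconfigured in place by rt_reconfigure(); when
+ + * that is not possible (or @new is a shutdown config), the old table is
+ + * scheduled for deletion as well.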
+ + */
+ +void
+ +rt_commit(struct config *new, struct config *old)
+ +{
+ +  struct rtable_config *o, *r;
   
- -  /* Find the old rte */
- -  for (pos = &net->routes; old = *pos; pos = &old->next)
- -    if ((c->ra_mode != RA_ANY) || (old->src == src))
+ +  DBG("rt_commit:\n");
+ +
+ +  if (old)
       {
- -      if (new && rte_same(old, new))
+ +      WALK_LIST(o, old->tables)
         {
-       _Bool ok;
- -      /* REF_STALE / REF_DISCARD not used in export table */
- -      /*
- -      if (old->flags & (REF_STALE | REF_DISCARD | REF_MODIFY))
++      bool ok;
+ +      RT_LOCKED(o->table, tab)
         {
- -        old->flags &= ~(REF_STALE | REF_DISCARD | REF_MODIFY);
- -        return 1;
+ +        r = OBSREF_GET(tab->deleted) ? NULL : rt_find_table_config(new, o->name);
+ +        ok = r && !new->shutdown && rt_reconfigure(tab, r, o);
         }
- -      */
   
- -      goto drop_update;
- -      }
+ +      if (ok)
+ +        continue;
   
- -      /* Remove the old rte */
- -      *pos = old->next;
- -      rte_free_quick(old);
- -      tab->rt_count--;
+ +      birdloop_enter(o->table->loop);
+ +      RT_LOCKED(o->table, tab)
+ +      {
+ +        DBG("\t%s: deleted\n", o->name);
+ +        OBSREF_SET(tab->deleted, old);
+ +        rt_check_cork_low(tab);
+ +        rt_lock_table(tab);
+ +        rt_unlock_table(tab);
+ +      }
   
- -      break;
+ +      CALL(o->table->config->master.stop, o->table);
+ +      birdloop_leave(o->table->loop);
+ +      }
       }
   
- -  if (!new)
- -  {
- -    if (!old)
- -      goto drop_withdraw;
- -
- -    if (!net->routes)
- -      fib_delete(&tab->fib, net);
- -
- -    return 1;
- -  }
- -
- -  /* Insert the new rte */
- -  rte *e = rte_do_cow(new);
- -  e->flags |= REF_COW;
- -  e->net = net;
- -  e->sender = c;
- -  e->lastmod = current_time();
- -  e->next = *pos;
- -  *pos = e;
- -  tab->rt_count++;
- -  return 1;
- -
- -drop_update:
- -  return refeed;
- -
- -drop_withdraw:
- -  return 0;
+ +  WALK_LIST(r, new->tables)
+ +    if (!r->table)
+ +      {
+ +      r->table = rt_setup(rt_table_pool, r);
+ +      DBG("\t%s: created\n", r->name);
+ +      add_tail(&routing_tables, &r->table->n);
+ +      }
+ +  DBG("\tdone\n");
   }
   
   
diff --cc proto/bgp/attrs.c

index 7ed898d238521b2cd17a01c673429d68923b0251,85646647f4b98a183e798dec548b75d8f790a4be..d8d9d3cf35585253e99d417e5dc9129b17d085fe
--- 1/proto/bgp/attrs.c
--- 2/proto/bgp/attrs.c
+++ b/proto/bgp/attrs.c
@@@ -1803,242 -1716,15 +1803,242 @@@ bgp_free_prefix(struct bgp_ptx_private 
   }
   
   void
- -bgp_free_prefix(struct bgp_channel *c, struct bgp_prefix *px)
+ +bgp_done_prefix(struct bgp_ptx_private *c, struct bgp_prefix *px, struct bgp_bucket *buck)
   {
+ +  /* BMP hack */
+ +  if (buck->bmp)
+ +    return;
+ +
+ +  /* Cleanup: We're called from bucket senders. */
+ +  ASSERT_DIE(px->cur == buck);
     rem_node(&px->buck_node);
- -  HASH_REMOVE2(c->prefix_hash, PXH, c->pool, px);
   
- -  if (c->prefix_slab)
- -    sl_free(px);
- -  else
- -    mb_free(px);
+ +  /* We may want to store the updates */
+ +  if (c->c->tx_keep)
+ +  {
+ +    /* Nothing to be sent right now */
+ +    px->cur = NULL;
+ +
+ +    /* Unref the previous sent version */
+ +    if (px->last)
+ +      if (!--px->last->px_uc)
+ +      bgp_done_bucket(c, px->last);
+ +
+ +    /* Ref the current sent version */
+ +    if (!IS_WITHDRAW_BUCKET(buck))
+ +    {
+ +      px->last = buck;
+ +      px->last->px_uc++;
+ +      return;
+ +    }
+ +
+ +    /* Prefixes belonging to the withdraw bucket are freed always */
+ +  }
+ +
+ +  bgp_free_prefix(c, px);
+ +}
+ +
+ +/*
+ + * Resend all routes kept in the TX structures of channel @bc.
+ + *
+ + * Requires tx_keep to be set on the channel (asserted). Every prefix with
+ + * nothing pending (px->cur == NULL) gives up its reference to the last
+ + * sent bucket and is passed to bgp_update_prefix() with that very bucket
+ + * again. The return values of bgp_update_prefix() are summed up in @seen;
+ + * if the sum is nonzero, an UPDATE packet is scheduled on @p's connection.
+ + */
+ +void
+ +bgp_tx_resend(struct bgp_proto *p, struct bgp_channel *bc)
+ +{
+ +  BGP_PTX_LOCK(bc->tx, c);
+ +
+ +  ASSERT_DIE(bc->tx_keep);
+ +  uint seen = 0;
+ +
+ +  HASH_WALK(c->prefix_hash, next, px)
+ +  {
+ +    /* Prefixes with a pending bucket are going to be sent anyway */
+ +    if (!px->cur)
+ +    {
+ +      ASSERT_DIE(px->last);
+ +      struct bgp_bucket *last = px->last;
+ +
+ +      /* Remove the last reference, we wanna resend the route */
+ +      px->last->px_uc--;
+ +      px->last = NULL;
+ +
+ +      /* And send it once again */
+ +      seen += bgp_update_prefix(c, px, last);
+ +    }
+ +  }
+ +  HASH_WALK_END;
+ +
+ +  if (bc->c.debug & D_EVENTS)
+ +    log(L_TRACE "%s.%s: TX resending %u routes",
+ +      bc->c.proto->name, bc->c.name, seen);
+ +
+ +  if (seen)
+ +    bgp_schedule_packet(p->conn, bc, PKT_UPDATE);
+ +}
+ +
+ +/*
+ + *    Prefix hash table exporter
+ + */
+ +
+ +/* Journal item_done hook of the TX exporter; intentionally does nothing. */
+ +static void
+ +bgp_out_item_done(struct lfjour *j, struct lfjour_item *i)
+ +{}
+ +
+ +/*
+ + * Build an export feed for one net out of the BGP prefix hash.
+ + *
+ + * @e is the exporter embedded in struct bgp_ptx_private, @index is the
+ + * netindex number of the requested net. @u must be NULL and the TX domain
+ + * must be locked by the caller (both asserted). @prefilter, if given,
+ + * is called with @f and the net address and may reject the net.
+ + * @_first is not used here.
+ + *
+ + * Returns &rt_feed_index_out_of_range when @index resolves to
+ + * net_index_out_of_range, NULL when the index resolves to no net, when
+ + * the prefilter rejects it or when no prefix of that net has anything to
+ + * feed. Otherwise returns a feed allocated by rt_alloc_feed() holding,
+ + * for every prefix of the net, the pending version (px->cur, flagged
+ + * REF_PENDING) and/or the last sent version (px->last). A version
+ + * belonging to the withdraw bucket is represented by NULL attrs.
+ + */
+ +static struct rt_export_feed *
- bgp_out_feed_net(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, _Bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first)
++bgp_out_feed_net(struct rt_exporter *e, struct rcu_unwinder *u, u32 index, bool (*prefilter)(struct rt_export_feeder *, const net_addr *), struct rt_export_feeder *f, const struct rt_export_item *_first)
+ +{
+ +  ASSERT_DIE(u == NULL);
+ +  SKIP_BACK_DECLARE(struct bgp_ptx_private, c, exporter, e);
+ +  ASSERT_DIE(DOMAIN_IS_LOCKED(rtable, c->lock));
+ +
+ +  struct netindex *ni = net_resolve_index(c->exporter.netindex, index);
+ +  if (ni == &net_index_out_of_range)
+ +    return &rt_feed_index_out_of_range;
+ +
+ +  if (ni == NULL)
+ +    return NULL;
+ +
+ +  if (prefilter && !prefilter(f, ni->addr))
+ +    return NULL;
+ +
+ +  struct rt_export_feed *feed = NULL;
+ +
+ +  uint count = 0;
+ +
+ +  struct bgp_prefix *chain = HASH_FIND_CHAIN(c->prefix_hash, PXH, ni, NULL);
+ +
+ +  /* First pass: count the entries to allocate the feed at once */
+ +  for (struct bgp_prefix *px = chain; px; px = px->next)
+ +    if (px->ni == ni)
+ +      count += !!px->last + !!px->cur;
+ +
+ +  if (count)
+ +  {
+ +    feed = rt_alloc_feed(count, 0);
+ +    feed->ni = ni;
+ +
+ +    uint pos = 0;
+ +
+ +    /* Second pass: fill the feed in, pending version first */
+ +    for (struct bgp_prefix *px = chain; px; px = px->next)
+ +      if (px->ni == ni)
+ +      {
+ +      if (px->cur)
+ +        feed->block[pos++] = (rte) {
+ +          .attrs = (px->cur == c->withdraw_bucket) ? NULL : ea_free_later(ea_lookup_slow(px->cur->eattrs, 0, EALS_CUSTOM)),
+ +          .net = ni->addr,
+ +          .src = px->src,
+ +          .lastmod = px->lastmod,
+ +          .flags = REF_PENDING,
+ +        };
+ +
+ +      if (px->last)
+ +        feed->block[pos++] = (rte) {
+ +          .attrs = (px->last == c->withdraw_bucket) ? NULL : ea_free_later(ea_lookup_slow(px->last->eattrs, 0, EALS_CUSTOM)),
+ +          .net = ni->addr,
+ +          .src = px->src,
+ +          .lastmod = px->lastmod,
+ +        };
+ +      }
+ +
+ +    /* Both passes must agree on the number of entries */
+ +    ASSERT_DIE(pos == count);
+ +  }
+ +
+ +  return feed;
+ +}
+ +
+ +/* TX structures Init and Free */
+ +
+ +/*
+ + * Set up the TX structures of BGP channel @c.
+ + *
+ + * The channel must have neither an out_table nor TX structures yet
+ + * (asserted). A new rtable-order domain is created and held locked while
+ + * a pool bound to that domain, the private TX structure, the bucket and
+ + * prefix tables and the exporter are initialized. The exporter is then
+ + * published as the channel's out_table and the TX structure as c->tx.
+ + */
+ +void
+ +bgp_init_pending_tx(struct bgp_channel *c)
+ +{
+ +  ASSERT_DIE(c->c.out_table == NULL);
+ +  ASSERT_DIE(c->tx == NULL);
+ +
+ +  DOMAIN(rtable) dom = DOMAIN_NEW_RCU_SYNC(rtable);
+ +  LOCK_DOMAIN(rtable, dom);
+ +  pool *p = rp_newf(c->pool, dom.rtable, "%s.%s TX", c->c.proto->name, c->c.name);
+ +
+ +  struct bgp_ptx_private *bpp = mb_allocz(p, sizeof *bpp);
+ +
+ +  bpp->lock = dom;
+ +  bpp->pool = p;
+ +  bpp->c = c;
+ +
+ +  bgp_init_bucket_table(bpp);
+ +  bgp_init_prefix_table(bpp);
+ +
+ +  /* The exporter feeds nets directly from the prefix hash */
+ +  bpp->exporter = (struct rt_exporter) {
+ +    .journal = {
+ +      .loop = c->c.proto->loop,
+ +      .item_size = sizeof(struct rt_export_item),
+ +      .item_done = bgp_out_item_done,
+ +    },
+ +    .name = mb_sprintf(c->c.proto->pool, "%s.%s.export", c->c.proto->name, c->c.name),
+ +    .net_type = c->c.net_type,
+ +    .max_feed_index = 0,
+ +    .netindex = c->c.table->netindex,
+ +    .trace_routes = c->c.debug,
+ +    .feed_net = bgp_out_feed_net,
+ +    .domain = dom,
+ +  };
+ +
+ +  rt_exporter_init(&bpp->exporter, &c->cf->ptx_exporter_settle);
+ +  c->c.out_table = &bpp->exporter;
+ +
+ +  c->tx = BGP_PTX_PUB(bpp);
+ +
+ +  UNLOCK_DOMAIN(rtable, dom);
+ +}
+ +
+ +/*
+ + * Tear down the TX structures of BGP channel @bc; no-op if there are none.
+ + *
+ + * With the TX domain locked, the exporter is shut down, all prefixes are
+ + * moved to the withdraw bucket and flushed, the pending bucket queue is
+ + * drained, and both hashes are checked to be empty (anything left is
+ + * reported by bug()). Then the hashes, slabs and the pool are freed, the
+ + * domain is unlocked and freed and bc->tx is reset to NULL.
+ + */
+ +void
+ +bgp_free_pending_tx(struct bgp_channel *bc)
+ +{
+ +  if (!bc->tx)
+ +    return;
+ +
+ +  DOMAIN(rtable) dom = bc->tx->lock;
+ +  LOCK_DOMAIN(rtable, dom);
+ +  struct bgp_ptx_private *c = &bc->tx->priv;
+ +
+ +  bc->c.out_table = NULL;
+ +  rt_exporter_shutdown(&c->exporter, NULL); /* TODO: actually implement exports */
+ +
+ +  /* Move all prefixes to the withdraw bucket to unref the "last" prefixes */
+ +  struct bgp_bucket *b = bgp_get_withdraw_bucket(c);
+ +  HASH_WALK(c->prefix_hash, next, px)
+ +    bgp_update_prefix(c, px, b);
+ +  HASH_WALK_END;
+ +
+ +  /* Flush withdrawals */
+ +  struct bgp_prefix *px;
+ +  WALK_LIST_FIRST(px, b->prefixes)
+ +    bgp_done_prefix(c, px, b);
+ +
+ +  /* Flush pending TX */
+ +  WALK_LIST_FIRST(b, c->bucket_queue)
+ +  {
+ +    WALK_LIST_FIRST(px, b->prefixes)
+ +      bgp_done_prefix(c, px, b);
+ +    bgp_done_bucket(c, b);
+ +  }
+ +
+ +  /* Consistency and resource leak checks */
+ +  HASH_WALK(c->prefix_hash, next, n)
+ +    bug("Stray prefix after cleanup");
+ +  HASH_WALK_END;
+ +
+ +  HASH_FREE(c->prefix_hash);
+ +  sl_delete(c->prefix_slab);
+ +  c->prefix_slab = NULL;
+ +
+ +  HASH_WALK(c->bucket_hash, next, n)
+ +    bug("Stray bucket after cleanup");
+ +  HASH_WALK_END;
+ +
+ +  HASH_FREE(c->bucket_hash);
+ +  sl_delete(c->bucket_slab);
+ +  c->bucket_slab = NULL;
+ +
+ +  rp_free(c->pool);
+ +
+ +  /* The domain was copied to a local variable, so it outlives the pool */
+ +  UNLOCK_DOMAIN(rtable, dom);
+ +  DOMAIN_FREE(rtable, dom);
+ +
+ +  bc->tx = NULL;
+ +}
   
   
diff --cc sysdep/config.h
Simple merge
diff --cc sysdep/unix/alloc.c

index 6d9bcde0457c5fd903d89a8fc6f7e7f3a283bdb3,08fc99801b3c4760380e40041b1d20a45a233d97..ef383f3653810db9e7972e6d4d516bf752ebd1a2
--- 1/sysdep/unix/alloc.c
--- 2/sysdep/unix/alloc.c
+++ b/sysdep/unix/alloc.c
@@@ -30,134 -29,67 +30,134 @@@
   long page_size = 0;
   
   #ifdef HAVE_MMAP
- -#define KEEP_PAGES_MAIN_MAX   256
- -#define KEEP_PAGES_MAIN_MIN   8
- -#define CLEANUP_PAGES_BULK    256
+ +# define KEEP_PAGES_MAX       16384
+ +# define KEEP_PAGES_MIN       32
+ +# define KEEP_PAGES_MAX_LOCAL 128
+ +# define ALLOC_PAGES_AT_ONCE  32
+ +
+ +  STATIC_ASSERT(KEEP_PAGES_MIN * 4 < KEEP_PAGES_MAX);
+ +  STATIC_ASSERT(ALLOC_PAGES_AT_ONCE < KEEP_PAGES_MAX_LOCAL);
+ +
-   static _Bool use_fake = 0;
-   static _Bool initialized = 0;
++  static bool use_fake = 0;
++  static bool initialized = 0;
+ +
+ +# define PROTECT_PAGE(pg)
+ +# define UNPROTECT_PAGE(pg)
+ +
+ +# if DEBUGGING
+ +#   ifdef ENABLE_EXPENSIVE_CHECKS
+ +#     undef PROTECT_PAGE
+ +#     undef UNPROTECT_PAGE
+ +#     define PROTECT_PAGE(pg) mprotect((pg), page_size, PROT_READ)
+ +#     define UNPROTECT_PAGE(pg)       mprotect((pg), page_size, PROT_READ | PROT_WRITE)
+ +#   endif
+ +
+ +#   define AJSIZE     16384
+ +
+ +    static struct alloc_journal {
+ +      void *fp;
+ +      void *next;
+ +      u16 pos;
+ +      u16 type;
+ +      uint thread_id;
+ +    } alloc_journal[AJSIZE];
+ +
+ +    _Thread_local int alloc_journal_local_pos = -1;
+ +    _Atomic int alloc_journal_pos = 0;
+ +
+ +#   define AJT_ALLOC_LOCAL_HOT                1
+ +#   define AJT_ALLOC_GLOBAL_HOT               2
+ +#   define AJT_ALLOC_COLD_STD         3
+ +#   define AJT_ALLOC_COLD_KEEPER      4
+ +#   define AJT_ALLOC_MMAP             5
+ +
+ +#   define AJT_FREE_LOCAL_HOT         0x11
+ +#   define AJT_FREE_GLOBAL_HOT                0x12
+ +
+ +#   define AJT_CLEANUP_NOTHING                0xc0
+ +#   define AJT_CLEANUP_COLD_STD               0xc3
+ +#   define AJT_CLEANUP_COLD_KEEPER    0xc4
+ +#   define AJT_CLEANUP_BEGIN          0xcb
+ +#   define AJT_CLEANUP_END            0xce
+ +
+ +#   define AJT_FLUSH_LOCAL_BEGIN      0xfb
+ +#   define AJT_FLUSH_LOCAL_END                0xfe
+ +#   define AJT_SCHEDULE_CLEANUP               0xff
+ +
+ +    /*
+ +     * Store one record into the alloc_journal array.
+ +     *
+ +     * The slot is picked by atomically incrementing alloc_journal_pos
+ +     * (relaxed ordering) and reducing the previous value modulo AJSIZE;
+ +     * the unreduced value is also remembered in the thread-local
+ +     * alloc_journal_local_pos. The record itself is written by a plain,
+ +     * non-atomic struct assignment.
+ +     *
+ +     * NOTE(review): alloc_journal_pos is a signed int. Should it ever wrap
+ +     * to a negative value, the % AJSIZE result is negative as well and the
+ +     * array index gets out of bounds -- confirm whether this is reachable.
+ +     */
+ +    static void
+ +    ajlog(void *fp, void *next, u16 pos, u16 type)
+ +    {
+ +      alloc_journal[(alloc_journal_local_pos = atomic_fetch_add_explicit(&alloc_journal_pos, 1, memory_order_relaxed)) % AJSIZE] = (struct alloc_journal) {
+ +      .fp = fp,
+ +      .next = next,
+ +      .pos = pos,
+ +      .type = type,
+ +      .thread_id = THIS_THREAD_ID,
+ +      };
+ +    }
   
- -STATIC_ASSERT(KEEP_PAGES_MAIN_MIN * 4 < KEEP_PAGES_MAIN_MAX);
+ +    struct free_page {
+ +      node unused[42];
+ +      struct free_page * _Atomic next;
+ +    };
+ +# else /* ! DEBUGGING */
   
- -static bool use_fake = 0;
+ +#   define ajlog(...)
   
- -#if DEBUGGING
- -struct free_page {
- -  node unused[42];
- -  node n;
- -};
- -#else
- -struct free_page {
- -  node n;
- -};
- -#endif
+ +    struct free_page {
+ +      struct free_page * _Atomic next;
+ +    };
   
- -#define EP_POS_MAX    ((page_size - OFFSETOF(struct empty_pages, pages)) / sizeof (void *))
+ +# endif
   
- -struct empty_pages {
- -  node n;
- -  uint pos;
- -  void *pages[0];
- -};
+ +# define WRITE_NEXT(pg, val)  do { UNPROTECT_PAGE((pg)); (pg)->next = (val); PROTECT_PAGE((pg)); } while (0)
   
- -struct free_pages {
- -  list pages;         /* List of (struct free_page) keeping free pages without releasing them (hot) */
- -  list empty;         /* List of (struct empty_pages) keeping invalidated pages mapped for us (cold) */
- -  u16 min, max;               /* Minimal and maximal number of free pages kept */
- -  uint cnt;           /* Number of free pages in list */
- -  event cleanup;
- -};
+ +# define EP_POS_MAX   ((page_size - OFFSETOF(struct empty_pages, pages)) / sizeof (void *))
   
- -static void global_free_pages_cleanup_event(void *);
- -static void *alloc_cold_page(void);
+ +  struct empty_pages {
+ +    struct empty_pages *next;
+ +    uint pos;
+ +    void *pages[0];
+ +  };
   
- -static struct free_pages global_free_pages = {
- -  .min = KEEP_PAGES_MAIN_MIN,
- -  .max = KEEP_PAGES_MAIN_MAX,
- -  .cleanup = { .hook = global_free_pages_cleanup_event },
- -};
+ +  static DOMAIN(resource) empty_pages_domain;
+ +  static struct empty_pages *empty_pages = NULL;
   
- -uint *pages_kept = &global_free_pages.cnt;
+ +  static struct free_page * _Atomic page_stack = NULL;
+ +  static _Thread_local struct free_page * local_page_stack = NULL;
+ +  static struct free_page page_stack_blocked;
   
- -static void *
- -alloc_sys_page(void)
- -{
- -  void *ptr = mmap(NULL, page_size, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ +  /* Try to replace the page stack head with a cork, until it succeeds. */
+ +# define PAGE_STACK_GET       ({ \
+ +    struct free_page *fp; \
+ +    while ((fp = atomic_exchange_explicit(&page_stack, &page_stack_blocked, memory_order_acq_rel)) == &page_stack_blocked) birdloop_yield(); \
+ +    fp; })
+ +  /* Reinstate the stack with another value */
+ +# define PAGE_STACK_PUT(val)  ASSERT_DIE(atomic_exchange_explicit(&page_stack, (val), memory_order_acq_rel) == &page_stack_blocked)
   
- -  if (ptr == MAP_FAILED)
- -    die("mmap(%ld) failed: %m", (s64) page_size);
+ +  static void page_cleanup(void *);
+ +  static event page_cleanup_event = { .hook = page_cleanup, };
+ +# define SCHEDULE_CLEANUP  do if (initialized && !shutting_down) ev_send(&global_event_list, &page_cleanup_event); while (0)
   
- -  return ptr;
- -}
+ +  _Atomic int pages_kept = 0;
+ +  _Atomic int pages_kept_locally = 0;
+ +  static _Thread_local int pages_kept_here = 0;
   
- -extern int shutting_down; /* Shutdown requested. */
+ +  /*
+ +   * Map ALLOC_PAGES_AT_ONCE anonymous private read-write pages at once
+ +   * and return the address of the mapping. Never returns on failure,
+ +   * die() is called instead.
+ +   *
+ +   * NOTE(review): the die() message prints page_size while the request
+ +   * was for page_size * ALLOC_PAGES_AT_ONCE bytes -- confirm which size
+ +   * is meant to be reported.
+ +   */
+ +  static void *
+ +  alloc_sys_page(void)
+ +  {
+ +    void *ptr = mmap(NULL, page_size * ALLOC_PAGES_AT_ONCE, PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ +
+ +    if (ptr == MAP_FAILED)
+ +      die("mmap(%ld) failed: %m", (s64) page_size);
+ +
+ +    return ptr;
+ +  }
+ +
+ +  extern int shutting_down; /* Shutdown requested. */
   
   #else // ! HAVE_MMAP
- -#define use_fake  1
+ +# define use_fake  1
   #endif
   
   void *
diff --cc sysdep/unix/domain.c

index 7dace7e124666f30580e4e0ee20618882c29d0a6,0000000000000000000000000000000000000000..a3104b89883d3664b338892fdfb888bf2f79da8d

mode 100644,000000..100644
--- 1/sysdep/unix/domain.c
--- /dev/null
+++ b/sysdep/unix/domain.c
@@@ -1,159 -1,0 +1,159 @@@
-   _Bool forbidden_when_reading_rcu;
+ +/*
+ + *    BIRD Locking
+ + *
+ + *    (c) 2020 Maria Matejka <mq@jmq.cz>
+ + *
+ + *    Can be freely distributed and used under the terms of the GNU GPL.
+ + */
+ +
+ +#ifndef _GNU_SOURCE
+ +#define _GNU_SOURCE
+ +#endif
+ +
+ +#undef LOCAL_DEBUG
+ +
+ +#undef DEBUG_LOCKING
+ +
+ +#include "lib/birdlib.h"
+ +#include "lib/locking.h"
+ +#include "lib/resource.h"
+ +#include "lib/timer.h"
+ +
+ +#include "conf/conf.h"
+ +
+ +#include <errno.h>
+ +#include <fcntl.h>
+ +#include <poll.h>
+ +#include <pthread.h>
+ +#include <semaphore.h>
+ +#include <stdatomic.h>
+ +#include <stdlib.h>
+ +#include <string.h>
+ +#include <unistd.h>
+ +
+ +/*
+ + *    Locking subsystem
+ + */
+ +
+ +#ifdef DEBUGGING
+ +_Thread_local rw_spinlock *rw_spinlocks_taken[MAX_RWS_AT_ONCE];
+ +_Thread_local btime rw_spinlocks_time[MAX_RWS_AT_ONCE];
+ +_Thread_local u32 rw_spinlocks_taken_cnt;
+ +_Thread_local u32 rw_spinlocks_taken_write;
+ +#endif
+ +
+ +_Thread_local struct lock_order locking_stack = {};
+ +_Thread_local struct domain_generic **last_locked = NULL;
+ +
+ +#define ASSERT_NO_LOCK        ASSERT_DIE(last_locked == NULL)
+ +
+ +struct domain_generic {
+ +  pthread_mutex_t mutex;
+ +  uint order;
- domain_new(uint order, _Bool allow_rcu)
++  bool forbidden_when_reading_rcu;
+ +  struct domain_generic **prev;
+ +  struct lock_order *locked_by;
+ +  const char *name;
+ +  pool *pool;
+ +};
+ +
+ +#define DOMAIN_INIT(_order, _allow_rcu) { \
+ +  .mutex = PTHREAD_MUTEX_INITIALIZER, \
+ +  .order = _order, \
+ +  .forbidden_when_reading_rcu = !_allow_rcu, \
+ +}
+ +
+ +static struct domain_generic the_bird_domain_gen = DOMAIN_INIT(OFFSETOF(struct lock_order, the_bird), 1);
+ +
+ +DOMAIN(the_bird) the_bird_domain = { .the_bird = &the_bird_domain_gen };
+ +
+ +/*
+ + * Allocate and initialize a new locking domain.
+ + *
+ + * @order is the byte offset of the domain's slot inside struct lock_order
+ + * (asserted to be smaller than the size of that structure; do_lock()
+ + * compares it with the offset of the slot it is given). When @allow_rcu
+ + * is false, the domain is marked as forbidden to lock while reading RCU.
+ + * The structure is obtained by xmalloc(); all fields not set by
+ + * DOMAIN_INIT() are zeroed by the compound literal assignment.
+ + */
+ +struct domain_generic *
++domain_new(uint order, bool allow_rcu)
+ +{
+ +  ASSERT_DIE(order < sizeof(struct lock_order));
+ +  struct domain_generic *dg = xmalloc(sizeof(struct domain_generic));
+ +  *dg = (struct domain_generic) DOMAIN_INIT(order, allow_rcu);
+ +  return dg;
+ +}
+ +
+ +/* Destroy the mutex of domain @dg and release its memory by xfree(). */
+ +void
+ +domain_free(struct domain_generic *dg)
+ +{
+ +  pthread_mutex_destroy(&dg->mutex);
+ +  xfree(dg);
+ +}
+ +
+ +/* Return the name stored in @dg; it is NULL until domain_setup() sets it. */
+ +const char *
+ +domain_name(struct domain_generic *dg)
+ +{
+ +  return dg->name;
+ +}
+ +
+ +/* Return the order (offset inside struct lock_order) of domain @dg. */
+ +uint dg_order(struct domain_generic *dg)
+ +{
+ +  return dg->order;
+ +}
+ +
+ +/*
+ + * Attach a name and a pool to domain @dg. May be done only while no pool
+ + * is attached yet (asserted). The @name pointer is stored as is, the
+ + * string is not copied.
+ + */
+ +void
+ +domain_setup(struct domain_generic *dg, const char *name, pool *p)
+ +{
+ +  ASSERT_DIE(dg->pool == NULL);
+ +  dg->pool = p;
+ +  dg->name = name;
+ +}
+ +
+ +/*
+ + * Lock domain @dg and record it in slot @lsp of this thread's locking stack.
+ + *
+ + * Before the mutex is taken, several invariants are checked and bug() is
+ + * called on any violation: a domain forbidden under RCU must not be locked
+ + * while an RCU reader is active; @lsp must lie exactly dg->order bytes
+ + * into locking_stack; @lsp must be above the last locked slot (lock
+ + * ordering); and the slot must be empty. A warning is logged when waiting
+ + * for the mutex took longer than the configured watchdog_warning time.
+ + */
+ +void do_lock(struct domain_generic *dg, struct domain_generic **lsp)
+ +{
+ +  /* Snapshot of the locking state, referenced by the bad-order bug message */
+ +  struct lock_order stack_copy;
+ +  memcpy(&stack_copy, &locking_stack, sizeof(stack_copy));
+ +  struct domain_generic **lll = last_locked;
+ +
+ +  /* The else branch belongs to the inner if: count the RCU-blocking lock */
+ +  if (dg->forbidden_when_reading_rcu)
+ +    if (rcu_read_active())
+ +      bug("Locking of this lock forbidden while RCU reader is active");
+ +    else
+ +      rcu_blocked++;
+ +
+ +  if ((char *) lsp - (char *) &locking_stack != dg->order)
+ +    bug("Trying to lock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack);
+ +
+ +  if (lsp <= last_locked)
+ +    bug("Trying to lock in a bad order: %p %p", &stack_copy, lll);
+ +  if (*lsp)
+ +    bug("Inconsistent locking stack state on lock");
+ +
+ +  /* Take the mutex and measure how long we have been waiting for it */
+ +  btime lock_begin = current_time();
+ +  pthread_mutex_lock(&dg->mutex);
+ +  btime duration = current_time() - lock_begin;
+ +  btime wdw = atomic_load_explicit(&global_runtime, memory_order_relaxed)->watchdog_warning;
+ +  if (wdw && (duration > wdw))
+ +    log(L_WARN "Locking of %s took %d ms", dg->name, (int) (duration TO_MS));
+ +
+ +  /* Push the domain onto the locking stack of this thread */
+ +  if (dg->prev || dg->locked_by)
+ +    bug("Previous unlock not finished correctly");
+ +  dg->prev = last_locked;
+ +  *lsp = dg;
+ +  last_locked = lsp;
+ +  dg->locked_by = &locking_stack;
+ +}
+ +
+ +/*
+ + * Unlock domain @dg recorded in slot @lsp of this thread's locking stack.
+ + *
+ + * The domain must be the last one locked by this thread; the slot
+ + * position, the owner and the stack state are verified and bug() is
+ + * called on any mismatch. The bookkeeping is reset before the mutex is
+ + * released.
+ + */
+ +void do_unlock(struct domain_generic *dg, struct domain_generic **lsp)
+ +{
+ +  /* Undo the rcu_blocked++ done in do_lock(); it must have been nonzero */
+ +  if (dg->forbidden_when_reading_rcu)
+ +    ASSERT_DIE(rcu_blocked--);
+ +
+ +  if ((char *) lsp - (char *) &locking_stack != dg->order)
+ +    bug("Trying to unlock on bad position: order=%u, lsp=%p, base=%p", dg->order, lsp, &locking_stack);
+ +
+ +  if (dg->locked_by != &locking_stack)
+ +    bug("Inconsistent domain state on unlock");
+ +  if ((last_locked != lsp) || (*lsp != dg))
+ +    bug("Inconsistent locking stack state on unlock");
+ +  /* Pop the domain from the locking stack while still holding the mutex */
+ +  dg->locked_by = NULL;
+ +  last_locked = dg->prev;
+ +  *lsp = NULL;
+ +  dg->prev = NULL;
+ +  pthread_mutex_unlock(&dg->mutex);
+ +
+ +  /* From here on, the dg pointer is invalid! */
+ +}
diff --cc sysdep/unix/io-loop.c

index be18752a76239709901dc7083c5625e9bf507900,0000000000000000000000000000000000000000..32530826cb6a62fed6037f075d350f5f2eb2a944

mode 100644,000000..100644
--- 1/sysdep/unix/io-loop.c
--- /dev/null
+++ b/sysdep/unix/io-loop.c
@@@ -1,1758 -1,0 +1,1758 @@@
- _Bool
+ +/*
+ + *    BIRD -- I/O and event loop
+ + *
+ + *    Can be freely distributed and used under the terms of the GNU GPL.
+ + */
+ +
+ +#include <stdio.h>
+ +#include <stdlib.h>
+ +#include <unistd.h>
+ +#include <errno.h>
+ +#include <fcntl.h>
+ +#include <poll.h>
+ +#include <pthread.h>
+ +#include <time.h>
+ +#include <sys/time.h>
+ +
+ +#include "nest/bird.h"
+ +
+ +#include "lib/buffer.h"
+ +#include "lib/defer.h"
+ +#include "lib/lists.h"
+ +#include "lib/locking.h"
+ +#include "lib/resource.h"
+ +#include "lib/event.h"
+ +#include "lib/timer.h"
+ +#include "lib/socket.h"
+ +
+ +#include "lib/io-loop.h"
+ +#include "sysdep/unix/io-loop.h"
+ +#include "conf/conf.h"
+ +#include "nest/cli.h"
+ +
+ +#define THREAD_STACK_SIZE     65536   /* To be lowered in near future */
+ +
+ +static struct birdloop *birdloop_new_no_pickup(pool *pp, uint order, const char *name, ...);
+ +
+ +/*
+ + *    Nanosecond time for accounting purposes
+ + *
+ + *    A fixed point on startup is set as zero, all other values are relative to that.
+ + *    Caution: this overflows after like 500 years or so. If you plan to run
+ + *    BIRD for such a long time, please implement some means of overflow prevention.
+ + */
+ +
+ +#if ! HAVE_CLOCK_MONOTONIC_COARSE
+ +#define CLOCK_MONOTONIC_COARSE CLOCK_MONOTONIC
+ +#endif
+ +
+ +static struct timespec ns_begin;
+ +
+ +static void ns_init(void)
+ +{
+ +  if (clock_gettime(CLOCK_MONOTONIC_COARSE, &ns_begin))
+ +    bug("clock_gettime: %m");
+ +}
+ +
+ +#define NSEC_IN_SEC   ((u64) (1000 * 1000 * 1000))
+ +
+ +u64 ns_now(void)
+ +{
+ +  struct timespec ts;
+ +  if (clock_gettime(CLOCK_MONOTONIC_COARSE, &ts))
+ +    bug("clock_gettime: %m");
+ +
+ +  return (u64) (ts.tv_sec - ns_begin.tv_sec) * NSEC_IN_SEC + ts.tv_nsec - ns_begin.tv_nsec;
+ +}
+ +
+ +#define NSEC_TO_SEC(x)        ((x) / NSEC_IN_SEC)
+ +#define CURRENT_SEC   NSEC_TO_SEC(ns_now())
+ +
+ +static _Thread_local struct spent_time *account_target_spent_time;
+ +static _Thread_local u64 *account_target_total;
+ +static _Thread_local u64 account_last;
+ +
+ +static u64 account_finish(void)
+ +{
+ +  /* Get current time */
+ +  u64 now = ns_now();
+ +  u64 dif = now - account_last;
+ +
+ +  /* Update second by second */
+ +  if (account_target_spent_time)
+ +  {
+ +    /* Drop old time information if difference is too large */
+ +    if (NSEC_TO_SEC(account_last) + TIME_BY_SEC_SIZE - 1 < NSEC_TO_SEC(now))
+ +      account_last = (NSEC_TO_SEC(now) - TIME_BY_SEC_SIZE + 1) * NSEC_IN_SEC;
+ +
+ +    /* Zero new records */
+ +    if (NSEC_TO_SEC(account_target_spent_time->last_written_ns) + TIME_BY_SEC_SIZE < NSEC_TO_SEC(account_last))
+ +      memset(account_target_spent_time->by_sec_ns, 0, sizeof(account_target_spent_time->by_sec_ns));
+ +    else
+ +      for (u64 fclr = NSEC_TO_SEC(account_target_spent_time->last_written_ns) + 1;
+ +        fclr <= NSEC_TO_SEC(now);
+ +        fclr++)
+ +      account_target_spent_time->by_sec_ns[fclr % TIME_BY_SEC_SIZE] = 0;
+ +
+ +    /* Add times second by second */
+ +    while (NSEC_TO_SEC(account_last) != NSEC_TO_SEC(now))
+ +    {
+ +      u64 part = (NSEC_TO_SEC(account_last) + 1) * NSEC_IN_SEC - account_last;
+ +      account_target_spent_time->by_sec_ns[NSEC_TO_SEC(account_last) % TIME_BY_SEC_SIZE] += part;
+ +      account_last += part;
+ +    }
+ +
+ +    /* Update the last second */
+ +    account_target_spent_time->by_sec_ns[NSEC_TO_SEC(account_last) % TIME_BY_SEC_SIZE] += now - account_last;
+ +
+ +    /* Store the current time */
+ +    account_target_spent_time->last_written_ns = now;
+ +  }
+ +
+ +  /* Update the total */
+ +  if (account_target_total)
+ +    *account_target_total += dif;
+ +
+ +  /* Store current time */
+ +  account_last = now;
+ +
+ +  return dif;
+ +}
+ +
+ +static u64 account_to_spent_time(struct spent_time *st)
+ +{
+ +  u64 elapsed = account_finish();
+ +
+ +  account_target_spent_time = st;
+ +  account_target_total = &st->total_ns;
+ +
+ +  return elapsed;
+ +}
+ +
+ +static u64 account_to_total(u64 *total)
+ +{
+ +  u64 elapsed = account_finish();
+ +
+ +  account_target_spent_time = NULL;
+ +  account_target_total = total;
+ +
+ +  return elapsed;
+ +}
+ +
+ +#define account_to(_arg)      _Generic((_arg), \
+ +    struct spent_time *: account_to_spent_time, \
+ +    u64 *: account_to_total)(_arg)
+ +
+ +/*
+ + *    Current thread context
+ + */
+ +
+ +_Thread_local struct birdloop *birdloop_current;
+ +static _Thread_local struct birdloop *birdloop_wakeup_masked;
+ +static _Thread_local uint birdloop_wakeup_masked_count;
+ +
+ +#define LOOP_NAME(loop)                       domain_name((loop)->time.domain)
+ +#define LATENCY_DEBUG(flags)          (atomic_load_explicit(&global_runtime, memory_order_relaxed)->latency_debug & (flags))
+ +
+ +#define LOOP_TRACE(loop, flags, fmt, args...) do { if (LATENCY_DEBUG(flags)) log(L_TRACE "%s (%p): " fmt, LOOP_NAME(loop), (loop), ##args); } while (0)
+ +#define THREAD_TRACE(flags, ...)              do { if (LATENCY_DEBUG(flags)) log(L_TRACE "Thread: " __VA_ARGS__); } while (0)
+ +
+ +#define LOOP_WARN(loop, fmt, args...) log(L_WARN "%s (%p): " fmt, LOOP_NAME(loop), (loop), ##args)
+ +
+ +
+ +event_list *
+ +birdloop_event_list(struct birdloop *loop)
+ +{
+ +  return &loop->event_list;
+ +}
+ +
+ +struct timeloop *
+ +birdloop_time_loop(struct birdloop *loop)
+ +{
+ +  return &loop->time;
+ +}
+ +
+ +pool *
+ +birdloop_pool(struct birdloop *loop)
+ +{
+ +  return loop->pool;
+ +}
+ +
- _Bool
++bool
+ +birdloop_inside(struct birdloop *loop)
+ +{
+ +  for (struct birdloop *c = birdloop_current; c; c = c->prev_loop)
+ +    if (loop == c)
+ +      return 1;
+ +
+ +  return 0;
+ +}
+ +
- _Bool
++bool
+ +birdloop_in_this_thread(struct birdloop *loop)
+ +{
+ +  return pthread_equal(pthread_self(), loop->thread->thread_id);
+ +}
+ +
+ +/*
+ + *    Wakeup code for birdloop
+ + */
+ +
+ +void
+ +pipe_new(struct pipe *p)
+ +{
+ +  int rv = pipe(p->fd);
+ +  if (rv < 0)
+ +    die("pipe: %m");
+ +
+ +  if (fcntl(p->fd[0], F_SETFL, O_NONBLOCK) < 0)
+ +    die("fcntl(O_NONBLOCK): %m");
+ +
+ +  if (fcntl(p->fd[1], F_SETFL, O_NONBLOCK) < 0)
+ +    die("fcntl(O_NONBLOCK): %m");
+ +}
+ +
+ +void
+ +pipe_drain(struct pipe *p)
+ +{
+ +  while (1) {
+ +    char buf[64];
+ +    int rv = read(p->fd[0], buf, sizeof(buf));
+ +    if ((rv < 0) && (errno == EAGAIN))
+ +      return;
+ +
+ +    if (rv == 0)
+ +      bug("wakeup read eof");
+ +    if ((rv < 0) && (errno != EINTR))
+ +      bug("wakeup read: %m");
+ +  }
+ +}
+ +
+ +int
+ +pipe_read_one(struct pipe *p)
+ +{
+ +  while (1) {
+ +    char v;
+ +    int rv = read(p->fd[0], &v, sizeof(v));
+ +    if (rv == 1)
+ +      return 1;
+ +    if ((rv < 0) && (errno == EAGAIN))
+ +      return 0;
+ +    if (rv > 1)
+ +      bug("wakeup read more bytes than expected: %d", rv);
+ +    if (rv == 0)
+ +      bug("wakeup read eof");
+ +    if (errno != EINTR)
+ +      bug("wakeup read: %m");
+ +  }
+ +}
+ +
+ +void
+ +pipe_kick(struct pipe *p)
+ +{
+ +  char v = 1;
+ +  int rv;
+ +
+ +  while (1) {
+ +    rv = write(p->fd[1], &v, sizeof(v));
+ +    if ((rv >= 0) || (errno == EAGAIN))
+ +      return;
+ +    if (errno != EINTR)
+ +      bug("wakeup write: %m");
+ +  }
+ +}
+ +
+ +void
+ +pipe_pollin(struct pipe *p, struct pfd *pfd)
+ +{
+ +  BUFFER_PUSH(pfd->pfd) = (struct pollfd) {
+ +      .fd = p->fd[0],
+ +      .events = POLLIN,
+ +      };
+ +  BUFFER_PUSH(pfd->loop) = NULL;
+ +}
+ +
+ +void
+ +pipe_free(struct pipe *p)
+ +{
+ +  close(p->fd[0]);
+ +  close(p->fd[1]);
+ +}
+ +
+ +static inline void
+ +wakeup_init(struct bird_thread *loop)
+ +{
+ +  pipe_new(&loop->wakeup);
+ +}
+ +
+ +static inline void
+ +wakeup_drain(struct bird_thread *loop)
+ +{
+ +  pipe_drain(&loop->wakeup);
+ +}
+ +
+ +static inline void
+ +wakeup_do_kick(struct bird_thread *loop)
+ +{
+ +  pipe_kick(&loop->wakeup);
+ +}
+ +
+ +static inline void
+ +wakeup_free(struct bird_thread *loop)
+ +{
+ +  pipe_free(&loop->wakeup);
+ +}
+ +
- static inline _Bool
++static inline bool
+ +birdloop_try_ping(struct birdloop *loop, u32 ltt)
+ +{
+ +  /* Somebody else is already pinging, be idempotent */
+ +  if (ltt & LTT_PING)
+ +  {
+ +    LOOP_TRACE(loop, DL_PING, "already being pinged");
+ +    return 0;
+ +  }
+ +
+ +  /* Thread moving is an implicit ping */
+ +  if (ltt & LTT_MOVE)
+ +  {
+ +    LOOP_TRACE(loop, DL_PING, "ping while moving");
+ +    return 1;
+ +  }
+ +
+ +  /* No more flags allowed */
+ +  ASSERT_DIE(!ltt);
+ +
+ +  /* No ping when not picked up */
+ +  if (!loop->thread)
+ +  {
+ +    LOOP_TRACE(loop, DL_PING, "not picked up yet, can't ping");
+ +    return 1;
+ +  }
+ +
+ +  /* No ping when masked */
+ +  if (loop == birdloop_wakeup_masked)
+ +  {
+ +    LOOP_TRACE(loop, DL_PING, "wakeup masked, can't ping");
+ +    birdloop_wakeup_masked_count++;
+ +    return 1;
+ +  }
+ +
+ +  /* Send meta event to ping */
+ +  if ((loop != loop->thread->meta) && (loop != &main_birdloop))
+ +  {
+ +    LOOP_TRACE(loop, DL_PING, "Ping by meta event to %p", loop->thread->meta);
+ +    ev_send_loop(loop->thread->meta, &loop->event);
+ +    return 1;
+ +  }
+ +
+ +  /* Do the real ping of Meta or Main */
+ +  LOOP_TRACE(loop, DL_WAKEUP, "sending pipe ping");
+ +  wakeup_do_kick(loop->thread);
+ +  return 0;
+ +}
+ +
+ +static inline void
+ +birdloop_do_ping(struct birdloop *loop)
+ +{
+ +  /* Register our ping effort */
+ +  u32 ltt = atomic_fetch_or_explicit(&loop->thread_transition, LTT_PING, memory_order_acq_rel);
+ +
+ +  /* Try to ping in multiple ways */
+ +  if (birdloop_try_ping(loop, ltt))
+ +    atomic_fetch_and_explicit(&loop->thread_transition, ~LTT_PING, memory_order_acq_rel);
+ +}
+ +
+ +void
+ +birdloop_ping(struct birdloop *loop)
+ +{
+ +  if (!birdloop_inside(loop))
+ +  {
+ +    LOOP_TRACE(loop, DL_PING, "ping from outside");
+ +    birdloop_do_ping(loop);
+ +  }
+ +  else
+ +  {
+ +    LOOP_TRACE(loop, DL_PING, "ping from inside, pending=%d", loop->ping_pending);
+ +    if (!loop->ping_pending)
+ +      loop->ping_pending++;
+ +  }
+ +}
+ +
+ +
+ +/*
+ + *    Sockets
+ + */
+ +
+ +static void
+ +sockets_init(struct birdloop *loop)
+ +{
+ +  init_list(&loop->sock_list);
+ +  loop->sock_num = 0;
+ +}
+ +
+ +void
+ +socket_changed(sock *s)
+ +{
+ +  struct birdloop *loop = s->loop;
+ +  ASSERT_DIE(birdloop_inside(loop));
+ +
+ +  LOOP_TRACE(loop, DL_SOCKETS, "socket %p changed", s);
+ +  loop->sock_changed = 1;
+ +  birdloop_ping(loop);
+ +}
+ +
+ +void
+ +birdloop_add_socket(struct birdloop *loop, sock *s)
+ +{
+ +  ASSERT_DIE(birdloop_inside(loop));
+ +  ASSERT_DIE(!s->loop);
+ +
+ +  LOOP_TRACE(loop, DL_SOCKETS, "adding socket %p (total=%d)", s, loop->sock_num);
+ +  add_tail(&loop->sock_list, &s->n);
+ +  loop->sock_num++;
+ +
+ +  s->loop = loop;
+ +  s->index = -1;
+ +
+ +  socket_changed(s);
+ +}
+ +
+ +extern sock *stored_sock; /* mainloop hack */
+ +
+ +void
+ +birdloop_remove_socket(struct birdloop *loop, sock *s)
+ +{
+ +  ASSERT_DIE(!enlisted(&s->n) == !s->loop);
+ +
+ +  if (!s->loop)
+ +    return;
+ +
+ +  ASSERT_DIE(birdloop_inside(loop));
+ +  ASSERT_DIE(s->loop == loop);
+ +
+ +  /* Decouple the socket from the loop at all. */
+ +  LOOP_TRACE(loop, DL_SOCKETS, "removing socket %p (total=%d)", s, loop->sock_num);
+ +
+ +  if (loop->sock_active == s)
+ +    loop->sock_active = sk_next(s);
+ +
+ +  if ((loop == &main_birdloop) && (s == stored_sock))
+ +    stored_sock = sk_next(s);
+ +
+ +  rem_node(&s->n);
+ +  loop->sock_num--;
+ +
+ +  socket_changed(s);
+ +
+ +  s->loop = NULL;
+ +  s->index = -1;
+ +}
+ +
+ +void
+ +sk_reloop(sock *s, struct birdloop *loop)
+ +{
+ +  ASSERT_DIE(birdloop_inside(loop));
+ +  ASSERT_DIE(birdloop_inside(s->loop));
+ +
+ +  if (loop == s->loop)
+ +    return;
+ +
+ +  birdloop_remove_socket(s->loop, s);
+ +  birdloop_add_socket(loop, s);
+ +}
+ +
+ +void
+ +sk_pause_rx(struct birdloop *loop, sock *s)
+ +{
+ +  ASSERT_DIE(birdloop_inside(loop));
+ +  s->rx_hook = NULL;
+ +  socket_changed(s);
+ +}
+ +
+ +void
+ +sk_resume_rx(struct birdloop *loop, sock *s, int (*hook)(sock *, uint))
+ +{
+ +  ASSERT_DIE(birdloop_inside(loop));
+ +  ASSERT_DIE(hook);
+ +  s->rx_hook = hook;
+ +  socket_changed(s);
+ +}
+ +
+ +static inline uint sk_want_events(sock *s)
+ +{ return (s->rx_hook ? POLLIN : 0) | (sk_tx_pending(s) ? POLLOUT : 0); }
+ +
+ +void
+ +sockets_prepare(struct birdloop *loop, struct pfd *pfd)
+ +{
+ +  node *n;
+ +  WALK_LIST(n, loop->sock_list)
+ +  {
+ +    SKIP_BACK_DECLARE(sock, s, n, n);
+ +    uint w = sk_want_events(s);
+ +
+ +    if (!w)
+ +    {
+ +      s->index = -1;
+ +      continue;
+ +    }
+ +
+ +    s->index = pfd->pfd.used;
+ +    LOOP_TRACE(loop, DL_SOCKETS, "socket %p poll index is %d", s, s->index);
+ +
+ +    BUFFER_PUSH(pfd->pfd) = (struct pollfd) {
+ +      .fd = s->fd,
+ +      .events = sk_want_events(s),
+ +    };
+ +    BUFFER_PUSH(pfd->loop) = loop;
+ +  }
+ +}
+ +
+ +int sk_read(sock *s, int revents);
+ +int sk_write(sock *s);
+ +void sk_err(sock *s, int revents);
+ +
+ +static void
- sockets_fire(struct birdloop *loop, _Bool read, _Bool write)
++sockets_fire(struct birdloop *loop, bool read, bool write)
+ +{
+ +  if (EMPTY_LIST(loop->sock_list))
+ +    return;
+ +
+ +  times_update();
+ +
+ +  struct pollfd *pfd = loop->thread->pfd->pfd.data;
+ +  loop->sock_active = SKIP_BACK(sock, n, HEAD(loop->sock_list));
+ +
+ +  while (loop->sock_active)
+ +  {
+ +    sock *s = loop->sock_active;
+ +
+ +    int rev;
+ +    if ((s->index >= 0) && (rev = pfd[s->index].revents) && !(rev & POLLNVAL))
+ +    {
+ +      int e = 1;
+ +
+ +      if (write && (rev & POLLOUT))
+ +      {
+ +      /* Write until task limit is up */
+ +      while ((s == loop->sock_active) && (e = sk_write(s)) && task_still_in_limit())
+ +        ;
+ +
+ +      if (s != loop->sock_active)
+ +        continue;
+ +
+ +      if (!sk_tx_pending(s))
+ +        loop->thread->sock_changed = 1;
+ +      }
+ +
+ +      /* Read until task limit is up */
+ +      if (read && (rev & POLLIN))
+ +      while ((s == loop->sock_active) && s->rx_hook && sk_read(s, rev) && (s->fast_rx || task_still_in_limit()))
+ +        ;
+ +
+ +      if (s != loop->sock_active)
+ +      continue;
+ +
+ +      if (!(rev & (POLLOUT | POLLIN)) && (rev & POLLERR))
+ +      sk_err(s, rev);
+ +
+ +      if (s != loop->sock_active)
+ +      continue;
+ +    }
+ +
+ +    loop->sock_active = sk_next(s);
+ +  }
+ +}
+ +
+ +/*
+ + *    Threads
+ + */
+ +
+ +static void bird_thread_start_event(void *_data);
+ +static void bird_thread_busy_set(struct bird_thread *thr, int val);
+ +
+ +struct birdloop_pickup_group {
+ +  DOMAIN(attrs) domain;
+ +  list loops;
+ +  list threads;
+ +  uint thread_count;
+ +  uint thread_busy_count;
+ +  uint loop_count;
+ +  uint loop_unassigned_count;
+ +  btime max_latency;
+ +  event start_threads;
+ +} pickup_groups[2] = {
+ +  {
+ +    /* all zeroes */
+ +  },
+ +  {
+ +    /* FIXME: make this dynamic, now it copies the loop_max_latency value from proto/bfd/config.Y */
+ +    .max_latency = 10 MS,
+ +    .start_threads.hook = bird_thread_start_event,
+ +    .start_threads.data = &pickup_groups[1],
+ +  },
+ +};
+ +
+ +static _Thread_local struct bird_thread *this_thread;
+ +
+ +static void
+ +birdloop_set_thread(struct birdloop *loop, struct bird_thread *thr, struct birdloop_pickup_group *group)
+ +{
+ +  struct bird_thread *old = loop->thread;
+ +  ASSERT_DIE(!thr != !old);
+ +
+ +  /* Signal our moving effort */
+ +  u32 ltt = atomic_fetch_or_explicit(&loop->thread_transition, LTT_MOVE, memory_order_acq_rel);
+ +  ASSERT_DIE((ltt & LTT_MOVE) == 0);
+ +
+ +  /* Wait until all previously started pings end */
+ +  while (ltt & LTT_PING)
+ +  {
+ +    birdloop_yield();
+ +    ltt = atomic_load_explicit(&loop->thread_transition, memory_order_acquire);
+ +    ASSERT_DIE(ltt & LTT_MOVE);
+ +  }
+ +  /* Now we are free of running pings */
+ +
+ +  if (!thr)
+ +  {
+ +    /* Unschedule from Meta */
+ +    ev_postpone(&loop->event);
+ +    tm_stop(&loop->timer);
+ +
+ +    /* Request local socket reload */
+ +    this_thread->sock_changed = 1;
+ +  }
+ +
+ +  /* Update the thread value */
+ +  loop->thread = thr;
+ +
+ +  /* Allow pings */
+ +  atomic_fetch_and_explicit(&loop->thread_transition, ~LTT_MOVE, memory_order_acq_rel);
+ +
+ +  /* Put into appropriate lists */
+ +  if (thr)
+ +  {
+ +    thr->loop_count++;
+ +    add_tail(&thr->loops, &loop->n);
+ +
+ +    if (!EMPTY_LIST(loop->sock_list))
+ +      thr->sock_changed = 1;
+ +    ev_send_loop(loop->thread->meta, &loop->event);
+ +  }
+ +  else
+ +  {
+ +    /* Put into pickup list */
+ +    LOCK_DOMAIN(attrs, group->domain);
+ +    add_tail(&group->loops, &loop->n);
+ +    group->loop_unassigned_count++;
+ +    UNLOCK_DOMAIN(attrs, group->domain);
+ +  }
+ +
+ +  loop->last_transition_ns = ns_now();
+ +}
+ +
+ +static void
+ +bird_thread_pickup_next(struct birdloop_pickup_group *group)
+ +{
+ +  /* This thread goes to the end of the pickup list */
+ +  rem_node(&this_thread->n);
+ +  add_tail(&group->threads, &this_thread->n);
+ +
+ +  /* If there are more loops to be picked up, wakeup the next thread in order */
+ +  if (!EMPTY_LIST(group->loops))
+ +    wakeup_do_kick(SKIP_BACK(struct bird_thread, n, HEAD(group->threads)));
+ +}
+ +
- static _Bool
++static bool
+ +birdloop_hot_potato(struct birdloop *loop)
+ +{
+ +  if (!loop)
+ +    return 0;
+ +
+ +  return ns_now() - loop->last_transition_ns < 1 S TO_NS;
+ +}
+ +
+ +static void
+ +birdloop_take(struct birdloop_pickup_group *group)
+ +{
+ +  struct birdloop *loop = NULL;
+ +
+ +  if (birdloop_hot_potato(this_thread->meta))
+ +    return;
+ +
+ +  LOCK_DOMAIN(attrs, group->domain);
+ +
+ +  if (this_thread->busy_active &&
+ +      (group->thread_busy_count < group->thread_count) &&
+ +      (this_thread->loop_count > 1) &&
+ +      !EMPTY_LIST(group->loops) &&
+ +      birdloop_hot_potato(HEAD(group->loops)))
+ +  {
+ +    THREAD_TRACE(DL_SCHEDULING, "Loop drop requested (tbc=%d, tc=%d, lc=%d)",
+ +      group->thread_busy_count, group->thread_count, this_thread->loop_count);
+ +    UNLOCK_DOMAIN(attrs, group->domain);
+ +
+ +    uint dropped = 0;
+ +    node *n;
+ +    WALK_LIST2(loop, n, this_thread->loops, n)
+ +    {
+ +      birdloop_enter(loop);
+ +      if (ev_active(&loop->event) && !loop->stopped && !birdloop_hot_potato(loop))
+ +      {
+ +      /* Pass to another thread */
+ +      rem_node(&loop->n);
+ +      this_thread->loop_count--;
+ +      LOOP_TRACE(loop, DL_SCHEDULING, "Dropping from thread, remaining %u loops here", this_thread->loop_count);
+ +
+ +      /* This also unschedules the loop from Meta */
+ +      birdloop_set_thread(loop, NULL, group);
+ +
+ +      dropped++;
+ +      if (dropped * dropped > this_thread->loop_count)
+ +      {
+ +        birdloop_leave(loop);
+ +
+ +        LOCK_DOMAIN(attrs, group->domain);
+ +        bird_thread_pickup_next(group);
+ +        UNLOCK_DOMAIN(attrs, group->domain);
+ +
+ +        break;
+ +      }
+ +      }
+ +      birdloop_leave(loop);
+ +    }
+ +
+ +    if (dropped)
+ +    {
+ +      this_thread->meta->last_transition_ns = ns_now();
+ +      return;
+ +    }
+ +
+ +    this_thread->busy_counter = 0;
+ +    bird_thread_busy_set(this_thread, 0);
+ +    LOCK_DOMAIN(attrs, group->domain);
+ +  }
+ +
+ +  if (!EMPTY_LIST(group->loops))
+ +  {
+ +    THREAD_TRACE(DL_SCHEDULING, "Loop take requested");
+ +
+ +    /* Take a proportional amount of loops from the pickup list and unlock */
+ +    uint thread_count = group->thread_count + 1;
+ +    if (group->thread_busy_count < group->thread_count)
+ +      thread_count -= group->thread_busy_count;
+ +
+ +    uint assign = 1 + group->loop_unassigned_count / thread_count;
+ +    for (uint i=0; !EMPTY_LIST(group->loops) && i<assign; i++)
+ +    {
+ +      loop = SKIP_BACK(struct birdloop, n, HEAD(group->loops));
+ +      rem_node(&loop->n);
+ +      group->loop_unassigned_count--;
+ +      UNLOCK_DOMAIN(attrs, group->domain);
+ +
+ +      birdloop_enter(loop);
+ +      birdloop_set_thread(loop, this_thread, group);
+ +      LOOP_TRACE(loop, DL_SCHEDULING, "Picked up by thread");
+ +
+ +      node *n;
+ +      WALK_LIST(n, loop->sock_list)
+ +      SKIP_BACK(sock, n, n)->index = -1;
+ +
+ +      birdloop_leave(loop);
+ +
+ +      LOCK_DOMAIN(attrs, group->domain);
+ +    }
+ +
+ +    bird_thread_pickup_next(group);
+ +  }
+ +
+ +  UNLOCK_DOMAIN(attrs, group->domain);
+ +  this_thread->meta->last_transition_ns = ns_now();
+ +}
+ +
+ +static int
+ +poll_timeout(struct birdloop *loop)
+ +{
+ +  timer *t = timers_first(&loop->time);
+ +  if (!t)
+ +  {
+ +    THREAD_TRACE(DL_SCHEDULING, "No timers, no events in meta");
+ +    return -1;
+ +  }
+ +
+ +  btime remains = tm_remains(t);
+ +  int timeout = remains TO_MS + ((remains TO_MS) MS < remains);
+ +
+ +  THREAD_TRACE(DL_SCHEDULING, "Next meta timer in %d ms for %s", timeout,
+ +      LOOP_NAME(SKIP_BACK(struct birdloop, timer, t)));
+ +
+ +  return timeout;
+ +}
+ +
+ +static void
+ +bird_thread_busy_set(struct bird_thread *thr, int val)
+ +{
+ +  LOCK_DOMAIN(attrs, thr->group->domain);
+ +  if (thr->busy_active = val)
+ +    thr->group->thread_busy_count++;
+ +  else
+ +    thr->group->thread_busy_count--;
+ +  ASSERT_DIE(thr->group->thread_busy_count <= thr->group->thread_count);
+ +  UNLOCK_DOMAIN(attrs, thr->group->domain);
+ +}
+ +
+ +static void *
+ +bird_thread_main(void *arg)
+ +{
+ +  struct bird_thread *thr = this_thread = arg;
+ +
+ +  rcu_thread_start();
+ +
+ +  account_to(&thr->overhead);
+ +
+ +  birdloop_enter(thr->meta);
+ +  this_birdloop = thr->meta;
+ +
+ +  THREAD_TRACE(DL_SCHEDULING, "Started");
+ +
+ +  tmp_init(thr->pool);
+ +  init_list(&thr->loops);
+ +
+ +  defer_init(lp_new(thr->pool));
+ +
+ +  thr->sock_changed = 1;
+ +
+ +  struct pfd pfd;
+ +  BUFFER_INIT(pfd.pfd, thr->pool, 16);
+ +  BUFFER_INIT(pfd.loop, thr->pool, 16);
+ +  thr->pfd = &pfd;
+ +
+ +  while (1)
+ +  {
+ +    u64 thr_loop_start = ns_now();
+ +    int timeout;
+ +
+ +    /* Schedule all loops with timed out timers */
+ +    timers_fire(&thr->meta->time, 0);
+ +
+ +    /* Pickup new loops */
+ +    birdloop_take(thr->group);
+ +
+ +    /* Compute maximal time per loop */
+ +    u64 thr_before_run = ns_now();
+ +    if (thr->loop_count > 0)
+ +    {
+ +      thr->max_loop_time_ns = (thr->max_latency_ns / 2 - (thr_before_run - thr_loop_start)) / (u64) thr->loop_count;
+ +      if (thr->max_loop_time_ns NS > 300 MS)
+ +      thr->max_loop_time_ns = 300 MS TO_NS;
+ +    }
+ +
+ +    /* Run all scheduled loops */
+ +    int more_events = ev_run_list(&thr->meta->event_list);
+ +    if (more_events)
+ +    {
+ +      THREAD_TRACE(DL_SCHEDULING, "More metaevents to run from %s",
+ +        LOOP_NAME(SKIP_BACK(struct birdloop, event,
+ +            atomic_load_explicit(&thr->meta->event_list.receiver, memory_order_relaxed)))
+ +        );
+ +      timeout = 0;
+ +    }
+ +    else
+ +      timeout = poll_timeout(thr->meta);
+ +
+ +    /* Run priority events before sleeping */
+ +    ev_run_list(&thr->priority_events);
+ +
+ +    /* Do we have to refresh sockets? */
+ +    if (thr->sock_changed)
+ +    {
+ +      THREAD_TRACE(DL_SOCKETS, "Recalculating socket poll");
+ +      thr->sock_changed = 0;
+ +
+ +      BUFFER_FLUSH(pfd.pfd);
+ +      BUFFER_FLUSH(pfd.loop);
+ +
+ +      pipe_pollin(&thr->wakeup, &pfd);
+ +
+ +      node *nn;
+ +      struct birdloop *loop;
+ +      WALK_LIST2(loop, nn, thr->loops, n)
+ +      {
+ +      birdloop_enter(loop);
+ +      sockets_prepare(loop, &pfd);
+ +      birdloop_leave(loop);
+ +      }
+ +
+ +      ASSERT_DIE(pfd.loop.used == pfd.pfd.used);
+ +      THREAD_TRACE(DL_SOCKETS, "Total %d sockets", pfd.pfd.used);
+ +    }
+ +
+ +    /* Check thread busy indicator */
+ +    int idle_force = (timeout < 0) || (timeout > 300);
+ +    int busy_now = (timeout < 5) && !idle_force;
+ +
+ +    /* Nothing to do right now but there may be some loops for pickup */
+ +    if (idle_force)
+ +    {
+ +      LOCK_DOMAIN(attrs, thr->group->domain);
+ +      if (!EMPTY_LIST(thr->group->loops))
+ +      timeout = 0;
+ +      UNLOCK_DOMAIN(attrs, thr->group->domain);
+ +    }
+ +
+ +    if (busy_now && !thr->busy_active && (++thr->busy_counter == 4))
+ +      bird_thread_busy_set(thr, 1);
+ +
+ +    if (!busy_now && thr->busy_active && (idle_force || (--thr->busy_counter == 0)))
+ +    {
+ +      thr->busy_counter = 0;
+ +      bird_thread_busy_set(thr, 0);
+ +    }
+ +
+ +    account_to(&this_thread->idle);
+ +    birdloop_leave(thr->meta);
+ +poll_retry:;
+ +    int rv = poll(pfd.pfd.data, pfd.pfd.used, timeout);
+ +    if (rv < 0)
+ +    {
+ +      if (errno == EINTR || errno == EAGAIN)
+ +      goto poll_retry;
+ +      bug("poll in %p: %m", thr);
+ +    }
+ +
+ +    account_to(&this_thread->overhead);
+ +    birdloop_enter(thr->meta);
+ +
+ +    /* Drain wakeup fd */
+ +    if (pfd.pfd.data[0].revents & POLLIN)
+ +    {
+ +      THREAD_TRACE(DL_WAKEUP, "Ping received");
+ +      ASSERT_DIE(rv > 0);
+ +      rv--;
+ +      wakeup_drain(thr);
+ +    }
+ +
+ +    /* Unset ping information for Meta */
+ +    atomic_fetch_and_explicit(&thr->meta->thread_transition, ~LTT_PING, memory_order_acq_rel);
+ +
+ +    /* Schedule loops with active sockets */
+ +    if (rv)
+ +      for (uint i = 1; i < pfd.pfd.used; i++)
+ +      if (pfd.pfd.data[i].revents)
+ +      {
+ +        LOOP_TRACE(pfd.loop.data[i], DL_SOCKETS, "socket id %d got revents=0x%x", i, pfd.pfd.data[i].revents);
+ +        ev_send_loop(thr->meta, &pfd.loop.data[i]->event);
+ +      }
+ +  }
+ +
+ +  bug("An infinite loop has ended.");
+ +}
+ +
+ +static void
+ +bird_thread_cleanup(void *_thr)
+ +{
+ +  struct bird_thread *thr = _thr;
+ +  struct birdloop *meta = thr->meta;
+ +  ASSERT_DIE(birdloop_inside(&main_birdloop));
+ +
+ +  /* Wait until the thread actually finishes */
+ +  ASSERT_DIE(meta);
+ +  birdloop_enter(meta);
+ +  birdloop_leave(meta);
+ +
+ +  /* No more wakeup */
+ +  wakeup_free(thr);
+ +
+ +  /* Thread attributes no longer needed */
+ +  pthread_attr_destroy(&thr->thread_attr);
+ +
+ +  /* Free the meta loop */
+ +  thr->meta->thread = NULL;
+ +  thr->meta = NULL;
+ +  birdloop_free(meta);
+ +}
+ +
+ +static struct bird_thread *
+ +bird_thread_start(struct birdloop_pickup_group *group)
+ +{
+ +  ASSERT_DIE(birdloop_inside(&main_birdloop));
+ +
+ +  struct birdloop *meta = birdloop_new_no_pickup(&root_pool, DOMAIN_ORDER(meta), "Thread Meta");
+ +  pool *p = birdloop_pool(meta);
+ +
+ +  birdloop_enter(meta);
+ +  LOCK_DOMAIN(attrs, group->domain);
+ +
+ +  struct bird_thread *thr = mb_allocz(p, sizeof(*thr));
+ +  thr->pool = p;
+ +  thr->cleanup_event = (event) { .hook = bird_thread_cleanup, .data = thr, };
+ +  thr->group = group;
+ +  thr->max_latency_ns = (group->max_latency ?: 5 S) TO_NS;
+ +  thr->meta = meta;
+ +  thr->meta->thread = thr;
+ +
+ +  wakeup_init(thr);
+ +  ev_init_list(&thr->priority_events, NULL, "Thread direct event list");
+ +
+ +  add_tail(&group->threads, &thr->n);
+ +
+ +  int e = 0;
+ +
+ +  if (e = pthread_attr_init(&thr->thread_attr))
+ +    die("pthread_attr_init() failed: %M", e);
+ +
+ +  /* We don't have to worry about thread stack size so much.
+ +  if (e = pthread_attr_setstacksize(&thr->thread_attr, THREAD_STACK_SIZE))
+ +    die("pthread_attr_setstacksize(%u) failed: %M", THREAD_STACK_SIZE, e);
+ +    */
+ +
+ +  if (e = pthread_attr_setdetachstate(&thr->thread_attr, PTHREAD_CREATE_DETACHED))
+ +    die("pthread_attr_setdetachstate(PTHREAD_CREATE_DETACHED) failed: %M", e);
+ +
+ +  if (e = pthread_create(&thr->thread_id, &thr->thread_attr, bird_thread_main, thr))
+ +    die("pthread_create() failed: %M", e);
+ +
+ +  group->thread_count++;
+ +
+ +  UNLOCK_DOMAIN(attrs, group->domain);
+ +  birdloop_leave(meta);
+ +  return thr;
+ +}
+ +
+ +static void
+ +bird_thread_start_event(void *_data)
+ +{
+ +  struct birdloop_pickup_group *group = _data;
+ +  bird_thread_start(group);
+ +}
+ +
+ +static struct birdloop *thread_dropper;
+ +static event *thread_dropper_event;
+ +static uint thread_dropper_goal;
+ +
+ +static void
+ +bird_thread_dropper_free(void *data)
+ +{
+ +  struct birdloop *tdl_stop = data;
+ +  birdloop_free(tdl_stop);
+ +}
+ +
+ +static void
+ +bird_thread_shutdown(void * _ UNUSED)
+ +{
+ +  struct birdloop_pickup_group *group = this_thread->group;
+ +  LOCK_DOMAIN(attrs, group->domain);
+ +  int dif = group->thread_count - thread_dropper_goal;
+ +  struct birdloop *tdl_stop = NULL;
+ +
+ +  if (dif > 0)
+ +    ev_send_loop(thread_dropper, thread_dropper_event);
+ +  else
+ +  {
+ +    tdl_stop = thread_dropper;
+ +    thread_dropper = NULL;
+ +  }
+ +
+ +  UNLOCK_DOMAIN(attrs, group->domain);
+ +
+ +  THREAD_TRACE(DL_SCHEDULING, "Thread pickup size differs from dropper goal by %d%s", dif, tdl_stop ? ", stopping" : "");
+ +
+ +  if (tdl_stop)
+ +  {
+ +    birdloop_stop_self(tdl_stop, bird_thread_dropper_free, tdl_stop);
+ +    return;
+ +  }
+ +
+ +  struct bird_thread *thr = this_thread;
+ +
+ +  LOCK_DOMAIN(attrs, group->domain);
+ +  /* Leave the thread-picker list to get no more loops */
+ +  rem_node(&thr->n);
+ +  group->thread_count--;
+ +
+ +  /* Fix the busy count */
+ +  if (thr->busy_active)
+ +    group->thread_busy_count--;
+ +
+ +  UNLOCK_DOMAIN(attrs, group->domain);
+ +
+ +  /* Leave the thread-dropper loop as we aren't going to return. */
+ +  birdloop_leave(thread_dropper);
+ +
+ +  /* Last try to run the priority event list; ruin it then to be extra sure */
+ +  ev_run_list(&this_thread->priority_events);
+ +  memset(&this_thread->priority_events, 0xa5, sizeof(this_thread->priority_events));
+ +
+ +  /* Drop loops including the thread dropper itself */
+ +  while (!EMPTY_LIST(thr->loops))
+ +  {
+ +    struct birdloop *loop = HEAD(thr->loops);
+ +
+ +    /* Remove loop from this thread's list */
+ +    this_thread->loop_count--;
+ +    rem_node(&loop->n);
+ +
+ +    /* Unset loop's thread */
+ +    birdloop_set_thread(loop, NULL, group);
+ +  }
+ +
+ +  /* Let others know about new loops */
+ +  LOCK_DOMAIN(attrs, group->domain);
+ +  if (!EMPTY_LIST(group->loops))
+ +    wakeup_do_kick(SKIP_BACK(struct bird_thread, n, HEAD(group->threads)));
+ +  UNLOCK_DOMAIN(attrs, group->domain);
+ +
+ +  /* Request thread cleanup from main loop */
+ +  ev_send_loop(&main_birdloop, &thr->cleanup_event);
+ +
+ +  /* Local pages not needed anymore */
+ +  flush_local_pages();
+ +
+ +  /* Unregister from RCU */
+ +  rcu_thread_stop();
+ +
+ +  /* Now we can be cleaned up */
+ +  birdloop_leave(thr->meta);
+ +
+ +  /* Exit! */
+ +  THREAD_TRACE(DL_SCHEDULING, "Stopped");
+ +  pthread_exit(NULL);
+ +}
+ +
+ +void
+ +bird_thread_commit(struct config *new, struct config *old UNUSED)
+ +{
+ +  ASSERT_DIE(birdloop_inside(&main_birdloop));
+ +
+ +  if (new->shutdown)
+ +    return;
+ +
+ +  if (!new->thread_count)
+ +    new->thread_count = 1;
+ +
+ +  while (1)
+ +  {
+ +    struct birdloop_pickup_group *group = &pickup_groups[0];
+ +    LOCK_DOMAIN(attrs, group->domain);
+ +
+ +    int dif = group->thread_count - (thread_dropper_goal = new->thread_count);
-     _Bool thread_dropper_running = !!thread_dropper;
++    bool thread_dropper_running = !!thread_dropper;
+ +
+ +    UNLOCK_DOMAIN(attrs, group->domain);
+ +
+ +    if (dif < 0)
+ +    {
+ +      bird_thread_start(group);
+ +      continue;
+ +    }
+ +
+ +    if ((dif > 0) && !thread_dropper_running)
+ +    {
+ +      struct birdloop *tdl = birdloop_new(&root_pool, DOMAIN_ORDER(control), group->max_latency, "Thread dropper");
+ +      birdloop_enter(tdl);
+ +      event *tde = ev_new_init(tdl->pool, bird_thread_shutdown, NULL);
+ +
+ +      LOCK_DOMAIN(attrs, group->domain);
+ +      thread_dropper = tdl;
+ +      thread_dropper_event = tde;
+ +      UNLOCK_DOMAIN(attrs, group->domain);
+ +
+ +      ev_send_loop(thread_dropper, thread_dropper_event);
+ +      birdloop_leave(tdl);
+ +    }
+ +
+ +    return;
+ +  }
+ +}
+ +
+ +/* Cleanup after last thread */
+ +static void
+ +bird_thread_sync_finish(void *_sync)
+ +{
+ +  ASSERT_THE_BIRD_LOCKED;
+ +  struct bird_thread_syncer *sync = _sync;
+ +
+ +  /* Keep necessary pointers locally */
+ +  pool *p = sync->pool;
+ +  DOMAIN(control) lock = sync->lock;
+ +  LOCK_DOMAIN(control, lock);
+ +
+ +  /* This invalidates the `sync` pointer */
+ +  CALL(sync->finish, sync);
+ +
+ +  /* Free pool and domain */
+ +  rp_free(p);
+ +  UNLOCK_DOMAIN(control, lock);
+ +  DOMAIN_FREE(control, lock);
+ +}
+ +
+ +/* Process regular one thread hook */
+ +static void
+ +bird_thread_sync_one(void *_sync)
+ +{
+ +  struct bird_thread_syncer *sync = _sync;
+ +
+ +  LOCK_DOMAIN(control, sync->lock);
+ +  CALL(sync->hook, sync);
+ +  sync->done++;
+ +  if (sync->done == sync->total)
+ +    ev_send_loop(&main_birdloop, ev_new_init(sync->pool, bird_thread_sync_finish, sync));
+ +  UNLOCK_DOMAIN(control, sync->lock);
+ +}
+ +
+ +void
+ +bird_thread_sync_all(struct bird_thread_syncer *sync,
+ +    void (*hook)(struct bird_thread_syncer *),
+ +    void (*done)(struct bird_thread_syncer *), const char *name)
+ +{
+ +  sync->lock = DOMAIN_NEW(control);
+ +  LOCK_DOMAIN(control, sync->lock);
+ +
+ +  sync->pool = rp_new(&root_pool, sync->lock.control, name);
+ +  sync->hook = hook;
+ +  sync->finish = done;
+ +
+ +  for (int i=0; i<2; i++)
+ +  {
+ +    struct birdloop_pickup_group *group = &pickup_groups[i];
+ +
+ +    LOCK_DOMAIN(attrs, group->domain);
+ +
+ +    struct bird_thread *thr;
+ +    WALK_LIST(thr, group->threads)
+ +    {
+ +      sync->total++;
+ +      ev_send(&thr->priority_events, ev_new_init(sync->pool, bird_thread_sync_one, sync));
+ +      wakeup_do_kick(thr);
+ +    }
+ +
+ +    UNLOCK_DOMAIN(attrs, group->domain);
+ +  }
+ +
+ +  UNLOCK_DOMAIN(control, sync->lock);
+ +}
+ +
+ +
+ +struct bird_thread_show_data {
+ +  struct bird_thread_syncer sync;
+ +  cli *cli;
+ +  linpool *lp;
+ +  u8 show_loops;
+ +  uint line_pos;
+ +  uint line_max;
+ +  const char **lines;
+ +};
+ +
+ +#define tsd_append(...)               do { \
+ +  if (!tsd->lines) \
+ +    tsd->lines = mb_allocz(tsd->sync.pool, sizeof(const char *) * tsd->line_max); \
+ +  if (tsd->line_pos >= tsd->line_max) \
+ +    tsd->lines = mb_realloc(tsd->lines, sizeof (const char *) * (tsd->line_max *= 2)); \
+ +  tsd->lines[tsd->line_pos++] = lp_sprintf(tsd->lp, __VA_ARGS__); \
+ +} while (0)
+ +
+ +static void
+ +bird_thread_show_cli_cont(struct cli *c UNUSED)
+ +{
+ +  /* Explicitly do nothing to prevent CLI from trying to parse another command. */
+ +}
+ +
+ +static int
+ +bird_thread_show_cli_cleanup(struct cli *c UNUSED)
+ +{
+ +  return 1; /* Defer the cleanup until the writeout is finished. */
+ +}
+ +
+ +static void
+ +bird_thread_show_spent_time(struct bird_thread_show_data *tsd, const char *name, struct spent_time *st)
+ +{
+ +  char b[TIME_BY_SEC_SIZE * sizeof("1234567890, ")], *bptr = b, *bend = b + sizeof(b);
+ +  uint cs = CURRENT_SEC;
+ +  uint fs = NSEC_TO_SEC(st->last_written_ns);
+ +
+ +  for (uint i = 0; i <= cs && i < TIME_BY_SEC_SIZE; i++)
+ +    bptr += bsnprintf(bptr, bend - bptr, "% 10lu ",
+ +      (cs - i > fs) ? 0 : st->by_sec_ns[(cs - i) % TIME_BY_SEC_SIZE]);
+ +  bptr[-1] = 0; /* Drop the trailing space */
+ +
+ +  tsd_append("    %s total time: % 9t s; last %d secs [ns]: %s", name, st->total_ns NS, MIN(CURRENT_SEC+1, TIME_BY_SEC_SIZE), b);
+ +}
+ +
+ +static void
+ +bird_thread_show_loop(struct bird_thread_show_data *tsd, struct birdloop *loop)
+ +{
+ +  tsd_append("  Loop %s", domain_name(loop->time.domain));
+ +  bird_thread_show_spent_time(tsd, "Working ", &loop->working);
+ +  bird_thread_show_spent_time(tsd, "Locking ", &loop->locking);
+ +}
+ +
+ +static void
+ +bird_thread_show(struct bird_thread_syncer *sync)
+ +{
+ +  SKIP_BACK_DECLARE(struct bird_thread_show_data, tsd, sync, sync);
+ +
+ +  if (!tsd->lp)
+ +    tsd->lp = lp_new(tsd->sync.pool);
+ +
+ +  if (tsd->show_loops)
+ +    tsd_append("Thread %p%s (busy counter %d)", this_thread, this_thread->busy_active ? " [busy]" : "", this_thread->busy_counter);
+ +
+ +  u64 total_time_ns = 0;
+ +  struct birdloop *loop;
+ +  WALK_LIST(loop, this_thread->loops)
+ +  {
+ +    if (tsd->show_loops)
+ +      bird_thread_show_loop(tsd, loop);
+ +
+ +    total_time_ns += loop->working.total_ns + loop->locking.total_ns;
+ +  }
+ +
+ +  if (tsd->show_loops)
+ +  {
+ +    tsd_append("  Total working time: %t", total_time_ns NS);
+ +    bird_thread_show_spent_time(tsd, "Overhead", &this_thread->overhead);
+ +    bird_thread_show_spent_time(tsd, "Idle    ", &this_thread->idle);
+ +  }
+ +  else
+ +    tsd_append("Thread %p working %t s overhead %t s",
+ +      this_thread, total_time_ns NS, this_thread->overhead.total_ns NS);
+ +}
+ +
+ +static void
+ +cmd_show_threads_done(struct bird_thread_syncer *sync)
+ +{
+ +  SKIP_BACK_DECLARE(struct bird_thread_show_data, tsd, sync, sync);
+ +  ASSERT_DIE(birdloop_inside(&main_birdloop));
+ +
+ +  tsd->cli->cont = NULL;
+ +  tsd->cli->cleanup = NULL;
+ +
+ +  for (int i=0; i<2; i++)
+ +  {
+ +    struct birdloop_pickup_group *group = &pickup_groups[i];
+ +
+ +    LOCK_DOMAIN(attrs, group->domain);
+ +    uint count = 0;
+ +    u64 total_time_ns = 0;
+ +    if (!EMPTY_LIST(group->loops))
+ +    {
+ +      if (tsd->show_loops)
+ +      tsd_append("Unassigned loops in group %d:", i);
+ +
+ +      struct birdloop *loop;
+ +      WALK_LIST(loop, group->loops)
+ +      {
+ +      if (tsd->show_loops)
+ +        bird_thread_show_loop(tsd, loop);
+ +
+ +      total_time_ns += loop->working.total_ns + loop->locking.total_ns;
+ +      count++;
+ +      }
+ +
+ +      if (tsd->show_loops)
+ +      tsd_append("  Total working time: %t", total_time_ns NS);
+ +      else
+ +      tsd_append("Unassigned %d loops in group %d, total time %t", count, i, total_time_ns NS);
+ +    }
+ +    else
+ +      tsd_append("All loops in group %d are assigned.", i);
+ +
+ +    UNLOCK_DOMAIN(attrs, group->domain);
+ +  }
+ +
+ +  for (uint i = 0; i < tsd->line_pos - 1; i++)
+ +    cli_printf(tsd->cli, -1027, "%s", tsd->lines[i]);
+ +
+ +  cli_printf(tsd->cli, 1027, "%s", tsd->lines[tsd->line_pos-1]);
+ +  cli_write_trigger(tsd->cli);
+ +  mb_free(tsd);
+ +}
+ +
+ +void
+ +cmd_show_threads(int show_loops)
+ +{
+ +  struct bird_thread_show_data *tsd = mb_allocz(&root_pool, sizeof(struct bird_thread_show_data));
+ +  tsd->cli = this_cli;
+ +  tsd->show_loops = show_loops;
+ +  tsd->line_pos = 0;
+ +  tsd->line_max = 64;
+ +
+ +  this_cli->cont = bird_thread_show_cli_cont;
+ +  this_cli->cleanup = bird_thread_show_cli_cleanup;
+ +
+ +  bird_thread_sync_all(&tsd->sync, bird_thread_show, cmd_show_threads_done, "Show Threads");
+ +}
+ +
- _Bool task_still_in_limit(void)
++bool task_still_in_limit(void)
+ +{
+ +  static u64 main_counter = 0;
+ +  if (this_birdloop == &main_birdloop)
+ +    return (++main_counter % 2048);   /* This is a hack because of no accounting in mainloop */
+ +  else
+ +    return ns_now() < account_last + this_thread->max_loop_time_ns;
+ +}
+ +
- _Bool task_before_halftime(void)
++bool task_before_halftime(void)
+ +{
+ +  return ns_now() < account_last + this_thread->max_loop_time_ns / 2;
+ +}
+ +
+ +
+ +/*
+ + *    Birdloop
+ + */
+ +
+ +static struct bird_thread main_thread;
+ +struct birdloop main_birdloop = { .thread = &main_thread, };
+ +_Thread_local struct birdloop *this_birdloop;
+ +
+ +static void birdloop_enter_locked(struct birdloop *loop);
+ +
+ +void
+ +birdloop_init(void)
+ +{
+ +  ns_init();
+ +
+ +  for (int i=0; i<2; i++)
+ +  {
+ +    struct birdloop_pickup_group *group = &pickup_groups[i];
+ +
+ +    group->domain = DOMAIN_NEW(attrs);
+ +    DOMAIN_SETUP(attrs, group->domain, "Loop Pickup", NULL);
+ +    init_list(&group->loops);
+ +    init_list(&group->threads);
+ +  }
+ +
+ +  wakeup_init(main_birdloop.thread);
+ +
+ +  main_birdloop.time.domain = the_bird_domain.the_bird;
+ +  main_birdloop.time.loop = &main_birdloop;
+ +
+ +  times_update();
+ +  timers_init(&main_birdloop.time, &root_pool);
+ +
+ +  birdloop_enter_locked(&main_birdloop);
+ +  this_birdloop = &main_birdloop;
+ +  this_thread = &main_thread;
+ +
+ +  defer_init(lp_new(&root_pool));
+ +}
+ +
+ +static void
+ +birdloop_stop_internal(struct birdloop *loop)
+ +{
+ +  LOOP_TRACE(loop, DL_SCHEDULING, "Stopping");
+ +
+ +  /* Block incoming pings */
+ +  u32 ltt = atomic_load_explicit(&loop->thread_transition, memory_order_acquire);
+ +  while (!atomic_compare_exchange_strong_explicit(
+ +      &loop->thread_transition, &ltt, LTT_PING,
+ +      memory_order_acq_rel, memory_order_acquire))
+ +    ;
+ +
+ +  /* Flush remaining events */
+ +  ASSERT_DIE(!ev_run_list(&loop->event_list));
+ +
+ +  /* Drop timers */
+ +  timer *t;
+ +  while (t = timers_first(&loop->time))
+ +    tm_stop(t);
+ +
+ +  /* Drop sockets */
+ +  sock *s;
+ +  WALK_LIST_FIRST2(s, n, loop->sock_list)
+ +    birdloop_remove_socket(loop, s);
+ +
+ +  /* Unschedule from Meta */
+ +  ev_postpone(&loop->event);
+ +  tm_stop(&loop->timer);
+ +
+ +  /* Remove from thread loop list */
+ +  ASSERT_DIE(loop->thread == this_thread);
+ +  rem_node(&loop->n);
+ +  loop->thread = NULL;
+ +
+ +  /* Uncount from thread group */
+ +  LOCK_DOMAIN(attrs, this_thread->group->domain);
+ +  this_thread->group->loop_count--;
+ +  UNLOCK_DOMAIN(attrs, this_thread->group->domain);
+ +
+ +  /* Leave the loop context without causing any other fuss */
+ +  ASSERT_DIE(!ev_active(&loop->event));
+ +  loop->ping_pending = 0;
+ +  account_to(&this_thread->overhead);
+ +  this_birdloop = this_thread->meta;
+ +  birdloop_leave(loop);
+ +
+ +  /* Request local socket reload */
+ +  this_thread->sock_changed = 1;
+ +
+ +  /* Call the stopped hook from the main loop */
+ +  loop->event.hook = loop->stopped;
+ +  loop->event.data = loop->stop_data;
+ +  ev_send_loop(&main_birdloop, &loop->event);
+ +}
+ +
+ +static void
+ +birdloop_run(void *_loop)
+ +{
+ +  /* Run priority events before the loop is executed */
+ +  ev_run_list(&this_thread->priority_events);
+ +
+ +  struct birdloop *loop = _loop;
+ +  account_to(&loop->locking);
+ +  birdloop_enter(loop);
+ +  this_birdloop = loop;
+ +
+ +  /* Wait until pingers end to wait for all events to actually arrive */
+ +  for (u32 ltt;
+ +      ltt = atomic_load_explicit(&loop->thread_transition, memory_order_acquire);
+ +      )
+ +  {
+ +    ASSERT_DIE(ltt == LTT_PING);
+ +    birdloop_yield();
+ +  }
+ +
+ +  /* Now we can actually do some work */
+ +  u64 dif = account_to(&loop->working);
+ +
+ +  struct global_runtime *gr = atomic_load_explicit(&global_runtime, memory_order_relaxed);
+ +  if (dif > this_thread->max_loop_time_ns + gr->latency_limit TO_NS)
+ +    LOOP_WARN(loop, "locked %lu us after its scheduled end time", dif NS TO_US);
+ +
+ +  uint repeat, loop_runs = 0;
+ +  do {
+ +    LOOP_TRACE(loop, DL_SCHEDULING, "Regular run (%d)", loop_runs);
+ +    loop_runs++;
+ +
+ +    if (loop->stopped)
+ +      /* Birdloop left inside the helper function */
+ +      return birdloop_stop_internal(loop);
+ +
+ +    /* Process socket TX */
+ +    sockets_fire(loop, 0, 1);
+ +
+ +    /* Run timers */
+ +    timers_fire(&loop->time, 0);
+ +
+ +    /* Run events */
+ +    repeat = ev_run_list(&loop->event_list);
+ +
+ +    /* Process socket RX */
+ +    sockets_fire(loop, 1, 0);
+ +
+ +    /* Check end time */
+ +  } while (repeat && task_still_in_limit());
+ +
+ +  /* Request meta timer */
+ +  timer *t = timers_first(&loop->time);
+ +  if (t)
+ +    tm_start_in(&loop->timer, tm_remains(t), this_thread->meta);
+ +  else
+ +    tm_stop(&loop->timer);
+ +
+ +  /* Request re-run if needed */
+ +  if (repeat)
+ +    ev_send_loop(this_thread->meta, &loop->event);
+ +
+ +  /* Collect socket change requests */
+ +  this_thread->sock_changed |= loop->sock_changed;
+ +  loop->sock_changed = 0;
+ +
+ +  account_to(&this_thread->overhead);
+ +  this_birdloop = this_thread->meta;
+ +  birdloop_leave(loop);
+ +}
+ +
+ +static void
+ +birdloop_run_timer(timer *tm)
+ +{
+ +  struct birdloop *loop = tm->data;
+ +  LOOP_TRACE(loop, DL_TIMERS, "Meta timer ready, requesting run");
+ +  ev_send_loop(loop->thread->meta, &loop->event);
+ +}
+ +
+ +static struct birdloop *
+ +birdloop_vnew_internal(pool *pp, uint order, struct birdloop_pickup_group *group, const char *name, va_list args)
+ +{
+ +  struct domain_generic *dg = domain_new(order, 1);
+ +  DG_LOCK(dg);
+ +
+ +  pool *p = rp_vnewf(pp, dg, name, args);
+ +  struct birdloop *loop = mb_allocz(p, sizeof(struct birdloop));
+ +  loop->pool = p;
+ +
+ +  loop->time.domain = dg;
+ +  loop->time.loop = loop;
+ +
+ +  atomic_store_explicit(&loop->thread_transition, 0, memory_order_relaxed);
+ +
+ +  birdloop_enter_locked(loop);
+ +
+ +  ev_init_list(&loop->event_list, loop, p->name);
+ +  timers_init(&loop->time, p);
+ +  sockets_init(loop);
+ +
+ +  loop->event = (event) { .hook = birdloop_run, .data = loop, };
+ +  loop->timer = (timer) { .hook = birdloop_run_timer, .data = loop, };
+ +
+ +  LOOP_TRACE(loop, DL_SCHEDULING, "New loop: %s", p->name);
+ +
+ +  if (group)
+ +  {
+ +    LOCK_DOMAIN(attrs, group->domain);
+ +    group->loop_count++;
+ +    group->loop_unassigned_count++;
+ +    add_tail(&group->loops, &loop->n);
+ +    if (EMPTY_LIST(group->threads))
+ +      ev_send(&global_event_list, &group->start_threads);
+ +    else
+ +      wakeup_do_kick(SKIP_BACK(struct bird_thread, n, HEAD(group->threads)));
+ +    UNLOCK_DOMAIN(attrs, group->domain);
+ +  }
+ +  else
+ +    loop->n.next = loop->n.prev = &loop->n;
+ +
+ +  birdloop_leave(loop);
+ +
+ +  return loop;
+ +}
+ +
+ +static struct birdloop *
+ +birdloop_new_no_pickup(pool *pp, uint order, const char *name, ...)
+ +{
+ +  va_list args;
+ +  va_start(args, name);
+ +  struct birdloop *loop = birdloop_vnew_internal(pp, order, NULL, name, args);
+ +  va_end(args);
+ +  return loop;
+ +}
+ +
+ +struct birdloop *
+ +birdloop_new(pool *pp, uint order, btime max_latency, const char *name, ...)
+ +{
+ +  va_list args;
+ +  va_start(args, name);
+ +  struct birdloop *loop = birdloop_vnew_internal(pp, order, max_latency ? &pickup_groups[1] : &pickup_groups[0], name, args);
+ +  va_end(args);
+ +  return loop;
+ +}
+ +
+ +static void
+ +birdloop_do_stop(struct birdloop *loop, void (*stopped)(void *data), void *data)
+ +{
+ +  LOOP_TRACE(loop, DL_SCHEDULING, "Stop requested");
+ +
+ +  loop->stopped = stopped;
+ +  loop->stop_data = data;
+ +
+ +  birdloop_do_ping(loop);
+ +}
+ +
+ +void
+ +birdloop_stop(struct birdloop *loop, void (*stopped)(void *data), void *data)
+ +{
+ +  DG_LOCK(loop->time.domain);
+ +  birdloop_do_stop(loop, stopped, data);
+ +  DG_UNLOCK(loop->time.domain);
+ +}
+ +
+ +void
+ +birdloop_stop_self(struct birdloop *loop, void (*stopped)(void *data), void *data)
+ +{
+ +  ASSERT_DIE(loop == birdloop_current);
+ +  ASSERT_DIE(DG_IS_LOCKED(loop->time.domain));
+ +
+ +  birdloop_do_stop(loop, stopped, data);
+ +}
+ +
+ +void
+ +birdloop_free(struct birdloop *loop)
+ +{
+ +  ASSERT_DIE(loop->thread == NULL);
+ +
+ +  struct domain_generic *dg = loop->time.domain;
+ +  DG_LOCK(dg);
+ +  rp_free(loop->pool);
+ +  DG_UNLOCK(dg);
+ +  domain_free(dg);
+ +}
+ +
+ +static void
+ +birdloop_enter_locked(struct birdloop *loop)
+ +{
+ +  ASSERT_DIE(DG_IS_LOCKED(loop->time.domain));
+ +  ASSERT_DIE(!birdloop_inside(loop));
+ +
+ +  /* Store the old context */
+ +  loop->prev_loop = birdloop_current;
+ +
+ +  /* Put the new context */
+ +  birdloop_current = loop;
+ +}
+ +
+ +void
+ +birdloop_enter(struct birdloop *loop)
+ +{
+ +  DG_LOCK(loop->time.domain);
+ +  return birdloop_enter_locked(loop);
+ +}
+ +
+ +static void
+ +birdloop_leave_locked(struct birdloop *loop)
+ +{
+ +  /* Check the current context */
+ +  ASSERT_DIE(birdloop_current == loop);
+ +
+ +  /* Send pending pings */
+ +  if (loop->ping_pending)
+ +  {
+ +    LOOP_TRACE(loop, DL_PING, "sending pings on leave");
+ +    loop->ping_pending = 0;
+ +    birdloop_do_ping(loop);
+ +  }
+ +
+ +  /* Restore the old context */
+ +  birdloop_current = loop->prev_loop;
+ +}
+ +
+ +void
+ +birdloop_leave(struct birdloop *loop)
+ +{
+ +  birdloop_leave_locked(loop);
+ +  DG_UNLOCK(loop->time.domain);
+ +}
+ +
+ +void
+ +birdloop_mask_wakeups(struct birdloop *loop)
+ +{
+ +  ASSERT_DIE(birdloop_wakeup_masked == NULL);
+ +  birdloop_wakeup_masked = loop;
+ +}
+ +
+ +void
+ +birdloop_unmask_wakeups(struct birdloop *loop)
+ +{
+ +  ASSERT_DIE(birdloop_wakeup_masked == loop);
+ +  birdloop_wakeup_masked = NULL;
+ +  if (birdloop_wakeup_masked_count)
+ +    wakeup_do_kick(loop->thread);
+ +
+ +  birdloop_wakeup_masked_count = 0;
+ +}
+ +
+ +void
+ +birdloop_yield(void)
+ +{
+ +  usleep(100);
+ +}
+ +
+ +void
+ +ev_send_this_thread(event *e)
+ +{
+ +  if (this_thread == &main_thread)
+ +    ev_send_loop(&main_birdloop, e);
+ +  else
+ +    ev_send(&this_thread->priority_events, e);
+ +}
diff --cc sysdep/unix/io.c

index 7e974ec10274a94d04b46a7e48696d44a5d3c441,9b499020423b97a689202de7840ef428a7042ef2..892044c5df047c31ae58a755ef582230735ac5d1
--- 1/sysdep/unix/io.c
--- 2/sysdep/unix/io.c
+++ b/sysdep/unix/io.c
@@@ -1827,13 -1680,6 +1827,13 @@@ sk_recvmsg(sock *s
   
   static inline void reset_tx_buffer(sock *s) { s->ttx = s->tpos = s->tbuf; }
   
- _Bool
++bool
+ +sk_tx_pending(sock *s)
+ +{
+ +  return s->ttx != s->tpos;
+ +}
+ +
+ +
   static int
   sk_maybe_write(sock *s)
   {
diff --cc sysdep/unix/krt.c

index 300d695dcaf91cd8ab44c3ddc41ac862b7e20c59,0664f4c1dde5f348bf539b6c1fb0ef5c75f53656..d6c1a31de56a7f1ebda6daa9ab60784c0e912dc6
--- 1/sysdep/unix/krt.c
--- 2/sysdep/unix/krt.c
+++ b/sysdep/unix/krt.c
@@@ -445,29 -718,10 +445,29 @@@ done:
     lp_flush(krt_filter_lp);
   }
   
- static _Bool
- -static void
++static bool
   krt_init_scan(struct krt_proto *p)
   {
- -  bmap_reset(&p->seen_map, 1024);
+ +  switch (p->sync_state)
+ +  {
+ +    case KPS_IDLE:
+ +      rt_refresh_begin(&p->p.main_channel->in_req);
+ +      bmap_reset(&p->seen_map, 1024);
+ +      p->sync_state = KPS_SCANNING;
+ +      return 1;
+ +
+ +    case KPS_SCANNING:
+ +      bug("Kernel scan double-init");
+ +
+ +    case KPS_PRUNING:
+ +      log(L_WARN "%s: Can't scan, still pruning", p->p.name);
+ +      return 0;
+ +
+ +    case KPS_FLUSHING:
+ +      bug("Can't scan, flushing");
+ +  }
+ +
+ +  bug("Bad kernel sync state");
   }
   
   static void
author	Maria Matejka <mq@ucw.cz>
	Wed, 26 Jun 2024 15:19:24 +0000 (17:19 +0200)
committer	Maria Matejka <mq@ucw.cz>
	Wed, 26 Jun 2024 15:19:24 +0000 (17:19 +0200)
		1	2
conf/confbase.Y	patch \|	diff1 \|	diff2 \|	blob \| history
lib/io-loop.h	patch \|	diff1 \|	\|	blob \| history
lib/lists.h	patch \|	diff1 \|	diff2 \|	blob \| history
lib/lockfree.c	patch \|	diff1 \|	\|	blob \| history
lib/lockfree.h	patch \|	diff1 \|	\|	blob \| history
lib/locking.h	patch \|	diff1 \|	\|	blob \| history
lib/locking_test.c	patch \|	diff1 \|	\|	blob \| history
lib/netindex.c	patch \|	diff1 \|	\|	blob \| history
lib/rcu.c	patch \|	diff1 \|	\|	blob \| history
lib/rcu.h	patch \|	diff1 \|	\|	blob \| history
lib/rcu_test.c	patch \|	diff1 \|	\|	blob \| history
lib/route.h	patch \|	diff1 \|	\|	blob \| history
lib/socket.h	patch \|	diff1 \|	diff2 \|	blob \| history
nest/mpls.c	patch \|	diff1 \|	diff2 \|	blob \| history
nest/proto.c	patch \|	diff1 \|	diff2 \|	blob \| history
nest/route.h	patch \|	diff1 \|	diff2 \|	blob \| history
nest/rt-attr.c	patch \|	diff1 \|	diff2 \|	blob \| history
nest/rt-export.c	patch \|	diff1 \|	\|	blob \| history
nest/rt-table.c	patch \|	diff1 \|	diff2 \|	blob \| history
proto/bgp/attrs.c	patch \|	diff1 \|	diff2 \|	blob \| history
sysdep/config.h	patch \|	diff1 \|	diff2 \|	blob \| history
sysdep/unix/alloc.c	patch \|	diff1 \|	diff2 \|	blob \| history
sysdep/unix/domain.c	patch \|	diff1 \|	\|	blob \| history
sysdep/unix/io-loop.c	patch \|	diff1 \|	\|	blob \| history
sysdep/unix/io.c	patch \|	diff1 \|	diff2 \|	blob \| history
sysdep/unix/krt.c	patch \|	diff1 \|	diff2 \|	blob \| history