/* Ugly structure offset handling macros */
+#define SAME_TYPE(a, b) ({ int _ = ((a) != (b)); !_; })
+#define TYPE_CAST(from, to, what) ( SAME_TYPE(((from) NULL), (what)), ((to) (what)))
+
#define OFFSETOF(s, i) ((size_t) &((s *)0)->i)
-#define SKIP_BACK(s, i, p) ((s *)((char *)p - OFFSETOF(s, i)))
+#define SKIP_BACK(s, i, p) ({ s *_ptr = ((s *)((char *)p - OFFSETOF(s, i))); SAME_TYPE(&_ptr->i, p); _ptr; })
#define BIRD_ALIGN(s, a) (((s)+a-1)&~(a-1))
#define CPU_STRUCT_ALIGN (alignof(max_align_t))
--- /dev/null
+/*
+ * BIRD -- Settle timer
+ *
+ * (c) 2022 Maria Matejka <mq@jmq.cz>
+ * (c) 2022 CZ.NIC z.s.p.o.
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#ifndef _BIRD_SETTLE_H_
+#define _BIRD_SETTLE_H_
+
+#include "lib/birdlib.h"
+#include "lib/timer.h"
+
+struct settle_config {
+ btime min, max;
+};
+
+struct settle {
+ union {
+ /* Timer hook polymorphism. */
+ struct {
+ resource _r;
+ void (*hook)(struct settle *);
+ };
+ timer tm;
+ };
+ struct settle_config cf;
+ btime started;
+};
+
+STATIC_ASSERT(OFFSETOF(struct settle, hook) == OFFSETOF(struct settle, tm) + OFFSETOF(timer, hook));
+
+#define SETTLE_INIT(_cfp, _hook, _data) (struct settle) { .tm = { .data = (_data), .hook = TYPE_CAST(void (*)(struct settle *), void (*)(struct timer *), (_hook)), }, .cf = ({ASSERT_DIE((_cfp)->min <= (_cfp)->max); *(_cfp); }), }
+
+
+static inline void settle_init(struct settle *s, struct settle_config *cf, void (*hook)(struct settle *), void *data)
+{
+ *s = SETTLE_INIT(cf, hook, data);
+}
+
+#define settle_active(s) tm_active(&(s)->tm)
+
+static inline void settle_kick(struct settle *s)
+{
+ if (!tm_active(&s->tm))
+ {
+ s->started = current_time();
+ tm_set(&s->tm, s->started + s->cf.min);
+ }
+ else
+ {
+ btime now = current_time();
+ tm_set(&s->tm, MIN_(now + s->cf.min, s->started + s->cf.max));
+ }
+}
+
+static inline void settle_cancel(struct settle *s)
+{
+ tm_stop(&s->tm);
+}
+
+#endif
-src := aggregator.c
+src := aggregator.c trie.c
obj := $(src-o-files)
$(all-daemon)
$(cf-local)
/*
- * BIRD Internet Routing Daemon -- Route aggregation
+ * BIRD Internet Routing Daemon -- Route aggregation
*
- * (c) 2023--2023 Igor Putovny <igor.putovny@nic.cz>
- * (c) 2023 CZ.NIC, z.s.p.o.
+ * (c) 2023--2025 Igor Putovny <igor.putovny@nic.cz>
+ * (c) 2023--2025 Maria Matejka <mq@ucw.cz>
+ * (c) 2025 CZ.NIC, z.s.p.o.
*
* Can be freely distributed and used under the terms of the GNU GPL.
*/
/**
- * DOC: Route aggregation
+ * DOC: Aggregator protocol
*
- * This is an implementation of route aggregation functionality.
- * It enables user to specify a set of route attributes in the configuarion file
- * and then, for a given destination (net), aggregate routes with the same
- * values of these attributes into a single multi-path route.
+ * The purpose of the aggregator protocol is to aggregate routes based on
+ * user-specified set of route attributes. It can be used for aggregating
+ * routes for a given destination (net) or for aggregating prefixes.
*
- * Structure &channel contains pointer to aggregation list which is represented
- * by &aggr_list_linearized. In rt_notify_aggregated(), attributes from this
- * list are evaluated for every route of a given net and results are stored
- * in &rte_val_list which contains pointer to this route and array of &f_val.
- * Array of pointers to &rte_val_list entries is sorted using
- * sort_rte_val_list(). For comparison of &f_val structures, val_compare()
- * is used. Comparator function is written so that sorting is stable. If all
- * attributes have the same values, routes are compared by their global IDs.
+ * Aggregation of routes for networks means that for each destination, routes
+ * with the same values of attributes will be aggregated into a single
+ * multi-path route. Aggregation is performed by inserting routes into a hash
+ * table based on values of their attributes and generating new routes from
+ * the routes in the same bucket. Buckets are represented by @aggregator_bucket,
+ * which contains linked list of @aggregator_route.
*
- * After sorting, &rte_val_list entries containing equivalent routes will be
- * adjacent to each other. Function process_rte_list() iterates through these
- * entries to identify sequences of equivalent routes. New route will be
- * created for each such sequence, even if only from a single route.
- * Only attributes from the aggreagation list will be set for the new route.
- * New &rta is created and prepare_rta() is used to copy static and dynamic
- * attributes to new &rta from &rta of the original route. New route is created
- * by create_merged_rte() from new &rta and exported to the routing table.
+ * Aggregation of prefixes aggregates a given set of prefixes into another set
+ * of prefixes. It offers a reduction in number of prefixes without changing
+ * the routing semantics. Aggregator is capable of processing incremental
+ * updates.
*/
#undef LOCAL_DEBUG
#include "nest/bird.h"
#include "nest/iface.h"
#include "filter/filter.h"
-#include "aggregator.h"
+#include "proto/aggregator/aggregator.h"
+
+extern linpool *rte_update_pool;
-#include <stdlib.h>
/*
-#include "nest/route.h"
-#include "nest/iface.h"
-#include "lib/resource.h"
-#include "lib/event.h"
-#include "lib/timer.h"
-#include "lib/string.h"
-#include "conf/conf.h"
-#include "filter/filter.h"
-#include "filter/data.h"
-#include "lib/hash.h"
-#include "lib/string.h"
-#include "lib/alloca.h"
-#include "lib/flowspec.h"
-*/
+ * Allocate unique ID for bucket
+ */
+static inline u32
+aggregator_get_new_bucket_id(struct aggregator_proto *p)
+{
+ u32 id = hmap_first_zero(&p->bucket_id_map);
+ hmap_set(&p->bucket_id_map, id);
+ return id;
+}
-extern linpool *rte_update_pool;
+/*
+ * Add @bucket to the list of bucket pointers in @p to position @bucket.id
+ */
+// TODO: enable to reset bucket ptr?
+static void
+aggregator_add_bucket(struct aggregator_proto *p, struct aggregator_bucket *bucket)
+{
+  ASSERT_DIE(p != NULL);
+  ASSERT_DIE(p->bucket_list != NULL);
+  ASSERT_DIE(bucket != NULL);
+
+  /* Bucket is already in the list */
+  if (bucket->id < p->bucket_list_size && p->bucket_list[bucket->id])
+    return;
+
+  const size_t old_size = p->bucket_list_size;
+
+  /* Reallocate if more space is needed */
+  if (bucket->id >= p->bucket_list_size)
+  {
+    /* NOTE(review): if bucket_list_size were ever 0 here, this doubling loop
+     * would never terminate (0 * 2 == 0). It is only initialized to
+     * BUCKET_LIST_INIT_SIZE in PREFIX_AGGR mode; in NET_AGGR mode the
+     * NULL-list assert above fires first. Please confirm this function is
+     * unreachable in NET_AGGR mode. */
+    while (bucket->id >= p->bucket_list_size)
+      p->bucket_list_size *= 2;
+
+    ASSERT_DIE(old_size < p->bucket_list_size);
+
+    p->bucket_list = mb_realloc(p->bucket_list, sizeof(p->bucket_list[0]) * p->bucket_list_size);
+    /* Zero the newly grown tail so unused slots read as "no bucket" */
+    memset(&p->bucket_list[old_size], 0, sizeof(p->bucket_list[0]) * (p->bucket_list_size - old_size));
+  }
+
+  ASSERT_DIE(bucket->id < p->bucket_list_size);
+  ASSERT_DIE(p->bucket_list[bucket->id] == NULL);
+
+  p->bucket_list[bucket->id] = bucket;
+  p->bucket_list_count++;
+}
+
+/*
+ * Withdraw all routes that are on the stack
+ */
+static void
+aggregator_withdraw_rte(struct aggregator_proto *p)
+{
+  /* Sanity limit: warn when an unexpectedly large batch of withdrawals piled up.
+   * Fixed the concatenated string literals: without the trailing space the
+   * message read "...was not expected.They will be...". */
+  if ((p->addr_type == NET_IP4 && p->rte_withdrawal_count > IP4_WITHDRAWAL_MAX_EXPECTED_LIMIT) ||
+      (p->addr_type == NET_IP6 && p->rte_withdrawal_count > IP6_WITHDRAWAL_MAX_EXPECTED_LIMIT))
+    log(L_WARN "This number of updates was not expected. "
+	"They will be processed, but please, contact the developers.");
+
+  struct rte_withdrawal_item *node = NULL;
+
+  /* Pop every pending item off the stack and announce the route withdrawal.
+   * Extra parentheses mark the assignment-in-condition as intentional. */
+  while ((node = p->rte_withdrawal_stack))
+  {
+    rte_update2(p->dst, &node->addr, NULL, node->bucket->last_src);
+    p->rte_withdrawal_stack = node->next;
+    p->rte_withdrawal_count--;
+  }
+
+  ASSERT_DIE(p->rte_withdrawal_stack == NULL);
+  ASSERT_DIE(p->rte_withdrawal_count == 0);
+
+  /* All items were allocated from this linpool; reclaim them in one sweep */
+  lp_flush(p->rte_withdrawal_pool);
+}
+
+static void
+aggregator_aggregate_on_feed_end(struct channel *C)
+{
+ struct aggregator_proto *p = SKIP_BACK(struct aggregator_proto, p, C->proto);
+
+ if (C != p->src)
+ return;
+
+ ASSERT_DIE(p->aggr_mode == PREFIX_AGGR);
+ ASSERT_DIE(p->root != NULL);
+
+ p->initial_feed = 0;
+ aggregator_aggregate(p);
+}
/*
* Set static attribute in @rta from static attribute in @old according to @sa.
*/
static void
-rta_set_static_attr(struct rta *rta, const struct rta *old, struct f_static_attr sa)
+aggregator_rta_set_static_attr(struct rta *rta, const struct rta *old, struct f_static_attr sa)
{
switch (sa.sa_code)
{
* @count: number of &f_val entries
*/
static int
-same_val_list(const struct f_val *v1, const struct f_val *v2, uint len)
+aggregator_same_val_list(const struct f_val *v1, const struct f_val *v2, u32 len)
{
- for (uint i = 0; i < len; i++)
+ for (u32 i = 0; i < len; i++)
if (!val_same(&v1[i], &v2[i]))
return 0;
}
/*
- * Create and export new merged route.
- * @old: first route in a sequence of equivalent routes that are to be merged
- * @rte_val: first element in a sequence of equivalent rte_val_list entries
- * @length: number of equivalent routes that are to be merged (at least 1)
- * @ail: aggregation list
+ * Create and export new merged route
*/
-static void
+void
aggregator_bucket_update(struct aggregator_proto *p, struct aggregator_bucket *bucket, struct network *net)
{
/* Empty bucket */
rta->source = RTS_AGGREGATED;
rta->scope = SCOPE_UNIVERSE;
- struct ea_list *eal = allocz(sizeof(struct ea_list) + sizeof(struct eattr) * p->aggr_on_da_count);
+ struct ea_list *eal = allocz(sizeof(*eal) + sizeof(struct eattr) * p->aggr_on_da_count);
eal->next = NULL;
eal->count = 0;
rta->eattrs = eal;
/* Seed the attributes from aggregator rule */
- for (uint i = 0; i < p->aggr_on_count; i++)
+ for (u32 i = 0; i < p->aggr_on_count; i++)
{
if (p->aggr_on[i].type == AGGR_ITEM_DYNAMIC_ATTR)
{
eal->attrs[eal->count++] = *e;
}
else if (p->aggr_on[i].type == AGGR_ITEM_STATIC_ATTR)
- rta_set_static_attr(rta, bucket->rte->attrs, p->aggr_on[i].sa);
+ aggregator_rta_set_static_attr(rta, bucket->rte->attrs, p->aggr_on[i].sa);
}
- struct rte *new = rte_get_temp(rta, bucket->rte->src);
+ struct rte *new = rte_get_temp(rta, p->p.main_source);
new->net = net;
- /*
- log("=============== CREATE MERGED ROUTE ===============");
- log("New route created: id = %d, protocol: %s", new->src->global_id, new->src->proto->name);
- log("===================================================");
- */
+ if (p->logging)
+ {
+ log("=============== CREATE MERGED ROUTE ===============");
+ log("New route created: id = %d, protocol: %s", new->src->global_id, new->src->proto->name);
+ log("===================================================");
+ }
/* merge filter needs one argument called "routes" */
struct f_val val = {
/* We actually don't want this route */
case F_REJECT:
if (bucket->last_src)
- rte_update2(p->dst, net->n.addr, NULL, bucket->last_src);
+ rte_update2(p->dst, net->n.addr, NULL, bucket->last_src);
break;
}
{
if (new_src)
rt_lock_source(new_src);
+
if (bucket->last_src)
rt_unlock_source(bucket->last_src);
* and store result in @pos.
*/
static void
-eval_static_attr(const struct rte *rt1, struct f_static_attr sa, struct f_val *pos)
+aggregator_eval_static_attr(const struct rte *rt1, struct f_static_attr sa, struct f_val *pos)
{
const struct rta *rta = rt1->attrs;
switch (sa.sa_code)
{
- case SA_NET: RESULT(sa.f_type, net, rt1->net->n.addr); break;
- case SA_FROM: RESULT(sa.f_type, ip, rta->from); break;
- case SA_GW: RESULT(sa.f_type, ip, rta->nh.gw); break;
- case SA_PROTO: RESULT(sa.f_type, s, rt1->src->proto->name); break;
- case SA_SOURCE: RESULT(sa.f_type, i, rta->source); break;
- case SA_SCOPE: RESULT(sa.f_type, i, rta->scope); break;
- case SA_DEST: RESULT(sa.f_type, i, rta->dest); break;
- case SA_IFNAME: RESULT(sa.f_type, s, rta->nh.iface ? rta->nh.iface->name : ""); break;
- case SA_IFINDEX: RESULT(sa.f_type, i, rta->nh.iface ? rta->nh.iface->index : 0); break;
- case SA_WEIGHT: RESULT(sa.f_type, i, rta->nh.weight + 1); break;
- case SA_PREF: RESULT(sa.f_type, i, rta->pref); break;
- case SA_GW_MPLS: RESULT(sa.f_type, i, rta->nh.labels ? rta->nh.label[0] : MPLS_NULL); break;
+ case SA_NET: RESULT(sa.f_type, net, rt1->net->n.addr); break;
+ case SA_FROM: RESULT(sa.f_type, ip, rta->from); break;
+ case SA_GW: RESULT(sa.f_type, ip, rta->nh.gw); break;
+ case SA_PROTO: RESULT(sa.f_type, s, rt1->src->proto->name); break;
+ case SA_SOURCE: RESULT(sa.f_type, i, rta->source); break;
+ case SA_SCOPE: RESULT(sa.f_type, i, rta->scope); break;
+ case SA_DEST: RESULT(sa.f_type, i, rta->dest); break;
+ case SA_IFNAME: RESULT(sa.f_type, s, rta->nh.iface ? rta->nh.iface->name : ""); break;
+ case SA_IFINDEX: RESULT(sa.f_type, i, rta->nh.iface ? rta->nh.iface->index : 0); break;
+ case SA_WEIGHT: RESULT(sa.f_type, i, rta->nh.weight + 1); break;
+ case SA_PREF: RESULT(sa.f_type, i, rta->pref); break;
+ case SA_GW_MPLS: RESULT(sa.f_type, i, rta->nh.labels ? rta->nh.label[0] : MPLS_NULL); break;
default:
bug("Invalid static attribute access (%u/%u)", sa.f_type, sa.sa_code);
}
* and store result in @pos.
*/
static void
-eval_dynamic_attr(const struct rte *rt1, struct f_dynamic_attr da, struct f_val *pos)
+aggregator_eval_dynamic_attr(const struct rte *rt1, struct f_dynamic_attr da, struct f_val *pos)
{
const struct rta *rta = rt1->attrs;
const struct eattr *e = ea_find(rta->eattrs, da.ea_code);
#undef RESULT_VOID
}
-static inline u32 aggr_route_hash(const rte *e)
+static inline u32
+aggregator_route_hash(const rte *e)
{
struct {
net *net;
#define AGGR_RTE_KEY(n) (&(n)->rte)
#define AGGR_RTE_NEXT(n) ((n)->next_hash)
#define AGGR_RTE_EQ(a,b) (((a)->src == (b)->src) && ((a)->net == (b)->net))
-#define AGGR_RTE_FN(_n) aggr_route_hash(_n)
+#define AGGR_RTE_FN(_n) aggregator_route_hash(_n)
#define AGGR_RTE_ORDER 4 /* Initial */
#define AGGR_RTE_REHASH aggr_rte_rehash
#define AGGR_BUCK_KEY(n) (n)
#define AGGR_BUCK_NEXT(n) ((n)->next_hash)
-#define AGGR_BUCK_EQ(a,b) (((a)->hash == (b)->hash) && (same_val_list((a)->aggr_data, (b)->aggr_data, p->aggr_on_count)))
+#define AGGR_BUCK_EQ(a,b) (((a)->hash == (b)->hash) && (aggregator_same_val_list((a)->aggr_data, (b)->aggr_data, p->aggr_on_count)))
#define AGGR_BUCK_FN(n) ((n)->hash)
#define AGGR_BUCK_ORDER 4 /* Initial */
{
struct aggregator_proto *p = SKIP_BACK(struct aggregator_proto, p, P);
ASSERT_DIE(src_ch == p->src);
+
struct aggregator_bucket *new_bucket = NULL, *old_bucket = NULL;
- struct aggregator_route *old_route = NULL;
+ struct aggregator_route *new_route = NULL, *old_route = NULL;
+
+ /* Ignore all updates if protocol is not up */
+ if (p->p.proto_state != PS_UP)
+ return;
/* Find the objects for the old route */
if (old)
return;
/* Evaluate route attributes. */
- struct aggregator_bucket *tmp_bucket = sl_allocz(p->bucket_slab);
+ struct aggregator_bucket *tmp_bucket = allocz(sizeof(*tmp_bucket) + sizeof(tmp_bucket->aggr_data[0]) * p->aggr_on_count);
+ ASSERT_DIE(tmp_bucket->id == 0);
- for (uint val_idx = 0; val_idx < p->aggr_on_count; val_idx++)
+ for (u32 val_idx = 0; val_idx < p->aggr_on_count; val_idx++)
{
int type = p->aggr_on[val_idx].type;
struct f_val *pos = &tmp_bucket->aggr_data[val_idx];
if (fret > F_RETURN)
log(L_WARN "%s.%s: Wrong number of items left on stack after evaluation of aggregation list", rt1->src->proto->name, rt1->sender->name);
- switch (pos->type) {
- case T_VOID:
- case T_INT:
- case T_BOOL:
- case T_PAIR:
- case T_QUAD:
- case T_ENUM:
- case T_IP:
- case T_EC:
- case T_LC:
- case T_RD:
- /* Fits, OK */
- break;
-
- default:
- log(L_WARN "%s.%s: Expression evaluated to type %s unsupported by aggregator. Store this value as a custom attribute instead", new->src->proto->name, new->sender->name, f_type_name(pos->type));
- *pos = (struct f_val) { .type = T_INT, .val.i = 0 };
- }
+ switch (pos->type)
+ {
+ case T_VOID:
+ case T_INT:
+ case T_BOOL:
+ case T_PAIR:
+ case T_QUAD:
+ case T_ENUM:
+ case T_IP:
+ case T_EC:
+ case T_LC:
+ case T_RD:
+ /* Fits, OK */
+ break;
+
+ default:
+ log(L_WARN "%s.%s: Expression evaluated to type %s unsupported by aggregator. Store this value as a custom attribute instead", new->src->proto->name, new->sender->name, f_type_name(pos->type));
+ *pos = (struct f_val) { .type = T_INT, .val.i = 0 };
+ }
break;
}
- case AGGR_ITEM_STATIC_ATTR: {
- eval_static_attr(new, p->aggr_on[val_idx].sa, pos);
+ case AGGR_ITEM_STATIC_ATTR:
+ aggregator_eval_static_attr(new, p->aggr_on[val_idx].sa, pos);
break;
- }
- case AGGR_ITEM_DYNAMIC_ATTR: {
- eval_dynamic_attr(new, p->aggr_on[val_idx].da, pos);
+ case AGGR_ITEM_DYNAMIC_ATTR:
+ aggregator_eval_dynamic_attr(new, p->aggr_on[val_idx].da, pos);
break;
- }
default:
break;
/* Compute the hash */
u64 haux;
mem_hash_init(&haux);
- for (uint i = 0; i < p->aggr_on_count; i++)
+
+ for (u32 i = 0; i < p->aggr_on_count; i++)
{
mem_hash_mix_num(&haux, tmp_bucket->aggr_data[i].type);
switch (tmp_bucket->aggr_data[i].type)
{
- case T_VOID:
- break;
- case T_INT:
- case T_BOOL:
- case T_PAIR:
- case T_QUAD:
- case T_ENUM:
- MX(i);
- break;
- case T_EC:
- case T_RD:
- MX(ec);
- break;
- case T_LC:
- MX(lc);
- break;
- case T_IP:
- MX(ip);
- break;
- case T_NET:
- mem_hash_mix_num(&haux, net_hash(IT(net)));
- break;
- case T_STRING:
- mem_hash_mix_str(&haux, IT(s));
- break;
- case T_PATH_MASK:
- mem_hash_mix(&haux, IT(path_mask), sizeof(*IT(path_mask)) + IT(path_mask)->len * sizeof (IT(path_mask)->item));
- break;
- case T_PATH:
- case T_CLIST:
- case T_ECLIST:
- case T_LCLIST:
- case T_BYTESTRING:
- mem_hash_mix(&haux, IT(ad)->data, IT(ad)->length);
- break;
- case T_NONE:
- case T_PATH_MASK_ITEM:
- case T_ROUTE:
- case T_ROUTES_BLOCK:
- bug("Invalid type %s in hashing", f_type_name(tmp_bucket->aggr_data[i].type));
- case T_SET:
- MX(t);
- break;
- case T_PREFIX_SET:
- MX(ti);
- break;
+ case T_VOID:
+ break;
+ case T_INT:
+ case T_BOOL:
+ case T_PAIR:
+ case T_QUAD:
+ case T_ENUM:
+ MX(i);
+ break;
+ case T_EC:
+ case T_RD:
+ MX(ec);
+ break;
+ case T_LC:
+ MX(lc);
+ break;
+ case T_IP:
+ MX(ip);
+ break;
+ case T_NET:
+ mem_hash_mix_num(&haux, net_hash(IT(net)));
+ break;
+ case T_STRING:
+ mem_hash_mix_str(&haux, IT(s));
+ break;
+ case T_PATH_MASK:
+ mem_hash_mix(&haux, IT(path_mask), sizeof(*IT(path_mask)) + IT(path_mask)->len * sizeof (IT(path_mask)->item));
+ break;
+ case T_PATH:
+ case T_CLIST:
+ case T_ECLIST:
+ case T_LCLIST:
+ case T_BYTESTRING:
+ mem_hash_mix(&haux, IT(ad)->data, IT(ad)->length);
+ break;
+ case T_NONE:
+ case T_PATH_MASK_ITEM:
+ case T_ROUTE:
+ case T_ROUTES_BLOCK:
+ bug("Invalid type %s in hashing", f_type_name(tmp_bucket->aggr_data[i].type));
+ case T_SET:
+ MX(t);
+ break;
+ case T_PREFIX_SET:
+ MX(ti);
+ break;
}
}
/* Find the existing bucket */
if (new_bucket = HASH_FIND(p->buckets, AGGR_BUCK, tmp_bucket))
- sl_free(tmp_bucket);
+ ;
else
{
- new_bucket = tmp_bucket;
+ new_bucket = lp_allocz(p->bucket_pool, sizeof(*new_bucket) + sizeof(new_bucket->aggr_data[0]) * p->aggr_on_count);
+ memcpy(new_bucket, tmp_bucket, sizeof(*new_bucket) + sizeof(new_bucket->aggr_data[0]) * p->aggr_on_count);
HASH_INSERT2(p->buckets, AGGR_BUCK, p->p.pool, new_bucket);
+
+ new_bucket->id = aggregator_get_new_bucket_id(p);
+ aggregator_add_bucket(p, new_bucket);
}
/* Store the route attributes */
else
new->attrs = rta_lookup(new->attrs);
+ if (p->logging)
+ log("New rte: %p, net: %p, src: %p, hash: %x", new, new->net, new->src, aggregator_route_hash(new));
+
/* Insert the new route into the bucket */
- struct aggregator_route *arte = sl_alloc(p->route_slab);
+ struct aggregator_route *arte = lp_allocz(p->route_pool, sizeof(*arte));
+
*arte = (struct aggregator_route) {
.bucket = new_bucket,
.rte = *new,
};
+
arte->rte.next = new_bucket->rte,
new_bucket->rte = &arte->rte;
new_bucket->count++;
HASH_INSERT2(p->routes, AGGR_RTE, p->p.pool, arte);
+
+ /* New route */
+ new_route = arte;
+ ASSERT_DIE(new_route != NULL);
+
+ if (p->logging)
+ log("Inserting rte: %p, arte: %p, net: %p, src: %p, hash: %x",
+ &arte->rte, arte, arte->rte.net, arte->rte.src, aggregator_route_hash(&arte->rte));
}
/* Remove the old route from its bucket */
if (old_bucket)
{
for (struct rte **k = &old_bucket->rte; *k; k = &(*k)->next)
+ {
if (*k == &old_route->rte)
{
- *k = (*k)->next;
- break;
+ *k = (*k)->next;
+ break;
}
+ }
old_bucket->count--;
HASH_REMOVE2(p->routes, AGGR_RTE, p->p.pool, old_route);
rta_free(old_route->rte.attrs);
- sl_free(old_route);
}
- /* Announce changes */
- if (old_bucket)
- aggregator_bucket_update(p, old_bucket, net);
+ /* Aggregation within nets allows incremental updates */
+ if (p->aggr_mode == NET_AGGR)
+ {
+ /* Announce changes */
+ if (old_bucket)
+ aggregator_bucket_update(p, old_bucket, net);
+
+ if (new_bucket && (new_bucket != old_bucket))
+ aggregator_bucket_update(p, new_bucket, net);
+ }
+ else if (p->aggr_mode == PREFIX_AGGR)
+ {
+ if (!p->initial_feed)
+ {
+ aggregator_recompute(p, old_route, new_route);
- if (new_bucket && (new_bucket != old_bucket))
- aggregator_bucket_update(p, new_bucket, net);
+ /* Process route withdrawals triggered by recomputation */
+ aggregator_withdraw_rte(p);
+ }
+ }
/* Cleanup the old bucket if empty */
if (old_bucket && (!old_bucket->rte || !old_bucket->count))
{
ASSERT_DIE(!old_bucket->rte && !old_bucket->count);
HASH_REMOVE2(p->buckets, AGGR_BUCK, p->p.pool, old_bucket);
- sl_free(old_bucket);
}
}
aggregator_preexport(struct channel *C, struct rte *new)
{
struct aggregator_proto *p = SKIP_BACK(struct aggregator_proto, p, C->proto);
+
/* Reject our own routes */
if (new->sender == p->dst)
return -1;
cf->dst->debug = cf->src->debug;
}
+// TODO: set pools to NULL?
static struct proto *
aggregator_init(struct proto_config *CF)
{
proto_configure_channel(P, &p->src, cf->src);
proto_configure_channel(P, &p->dst, cf->dst);
- p->aggr_on_count = cf->aggr_on_count;
- p->aggr_on_da_count = cf->aggr_on_da_count;
- p->aggr_on = cf->aggr_on;
- p->merge_by = cf->merge_by;
+ p->aggr_mode = cf->aggr_mode;
+ p->aggr_on_count = cf->aggr_on_count;
+ p->aggr_on_da_count = cf->aggr_on_da_count;
+ p->aggr_on = cf->aggr_on;
+ p->merge_by = cf->merge_by;
+ p->logging = cf->logging;
+ p->bucket_list = NULL;
+ p->bucket_list_size = 0;
+ p->bucket_list_count = 0;
P->rt_notify = aggregator_rt_notify;
P->preexport = aggregator_preexport;
+ P->feed_end = aggregator_aggregate_on_feed_end;
return P;
}
+/*
+ * Initialize hash table and create default route
+ *
+ * TODO: this should probably be called at protocol start
+ * and cleaned up at shutdown, shouldn't it?
+ */
+static void
+aggregator_trie_init(struct aggregator_proto *p)
+{
+ /* Zero prefix for default route */
+ ip_addr prefix = (p->addr_type == NET_IP4) ? ipa_from_ip4(IP4_NONE) : ipa_from_ip6(IP6_NONE);
+
+ struct net_addr addr = { 0 };
+ net_fill_ipa(&addr, prefix, 0);
+
+ /* Create net for zero prefix */
+ struct network *default_net = mb_allocz(p->p.pool, sizeof(*default_net) + sizeof(addr));
+ net_copy(default_net->n.addr, &addr);
+
+ /* Create route attributes with zero nexthop */
+ struct rta rta = { 0 };
+
+ /* Allocate bucket for root node */
+ struct aggregator_bucket *new_bucket = lp_allocz(p->bucket_pool, sizeof(*new_bucket));
+
+ u64 haux = 0;
+ mem_hash_init(&haux);
+ new_bucket->hash = mem_hash_value(&haux);
+
+ /* Assign ID to the root node bucket */
+ new_bucket->id = aggregator_get_new_bucket_id(p);
+ aggregator_add_bucket(p, new_bucket);
+
+ struct aggregator_route *arte = lp_allocz(p->route_pool, sizeof(*arte));
+
+ *arte = (struct aggregator_route) {
+ .bucket = new_bucket,
+ .rte = { .attrs = rta_lookup(&rta) },
+ };
+
+ /* Put route into bucket */
+ arte->rte.next = new_bucket->rte;
+ new_bucket->rte = &arte->rte;
+ new_bucket->count++;
+
+ arte->rte.net = default_net;
+ default_net->routes = &arte->rte;
+
+ HASH_INSERT2(p->routes, AGGR_RTE, p->p.pool, arte);
+ HASH_INSERT2(p->buckets, AGGR_BUCK, p->p.pool, new_bucket);
+
+ /* Allocate and initialize root node */
+ p->root = aggregator_root_init(new_bucket, p->trie_slab);
+}
+
static int
aggregator_start(struct proto *P)
{
struct aggregator_proto *p = SKIP_BACK(struct aggregator_proto, p, P);
- p->bucket_slab = sl_new(P->pool, sizeof(struct aggregator_bucket) + AGGR_DATA_MEMSIZE);
+ ASSERT_DIE(p->bucket_pool == NULL);
+ ASSERT_DIE(p->route_pool == NULL);
+ ASSERT_DIE(p->trie_slab == NULL);
+ ASSERT_DIE(p->root == NULL);
+
+ p->addr_type = p->src->table->addr_type;
+
+ p->bucket_pool = lp_new(P->pool);
HASH_INIT(p->buckets, P->pool, AGGR_BUCK_ORDER);
- p->route_slab = sl_new(P->pool, sizeof(struct aggregator_route));
+ p->route_pool = lp_new(P->pool);
HASH_INIT(p->routes, P->pool, AGGR_RTE_ORDER);
p->reload_buckets = (event) {
.data = p,
};
+ p->initial_feed = 1;
+
+ hmap_init(&p->bucket_id_map, p->p.pool, 1024);
+ hmap_set(&p->bucket_id_map, 0); /* 0 is default value, do not use it as ID */
+
+ if (p->aggr_mode == PREFIX_AGGR)
+ {
+ ASSERT_DIE(p->trie_slab == NULL);
+ p->trie_slab = sl_new(P->pool, sizeof(struct trie_node));
+
+ ASSERT_DIE(p->bucket_list == NULL);
+ ASSERT_DIE(p->bucket_list_size == 0);
+ ASSERT_DIE(p->bucket_list_count == 0);
+
+ p->bucket_list_size = BUCKET_LIST_INIT_SIZE;
+ p->bucket_list = mb_allocz(p->p.pool, sizeof(p->bucket_list[0]) * p->bucket_list_size);
+
+ p->rte_withdrawal_pool = lp_new(P->pool);
+ p->rte_withdrawal_count = 0;
+
+ aggregator_trie_init(p);
+ }
+
return PS_UP;
}
static int
-aggregator_shutdown(struct proto *P)
+aggregator_shutdown(struct proto *P UNUSED)
+{
+ return PS_DOWN;
+}
+
+static void
+aggregator_cleanup(struct proto *P)
{
struct aggregator_proto *p = SKIP_BACK(struct aggregator_proto, p, P);
- HASH_WALK_DELSAFE(p->buckets, next_hash, b)
- {
- while (b->rte)
- {
- struct aggregator_route *arte = SKIP_BACK(struct aggregator_route, rte, b->rte);
- b->rte = arte->rte.next;
- b->count--;
- HASH_REMOVE(p->routes, AGGR_RTE, arte);
- rta_free(arte->rte.attrs);
- sl_free(arte);
- }
+ /*
+ * Linpools will be freed along with other protocol resources but pointers
+ * have to be set to NULL because protocol may be started again.
+ */
+ p->bucket_pool = NULL;
+ p->route_pool = NULL;
+ p->trie_slab = NULL;
+ p->rte_withdrawal_pool = NULL;
- ASSERT_DIE(b->count == 0);
- HASH_REMOVE(p->buckets, AGGR_BUCK, b);
- sl_free(b);
- }
- HASH_WALK_END;
+ p->root = NULL;
- return PS_DOWN;
+ p->bucket_list = NULL;
+ p->bucket_list_size = 0;
+ p->bucket_list_count = 0;
+
+ p->rte_withdrawal_stack = NULL;
+ p->rte_withdrawal_count = 0;
+
+ p->bucket_id_map = (struct hmap) { 0 };
+
+ p->initial_feed = 1;
}
static int
return 0;
/* Compare aggregator rule */
- for (uint i = 0; i < p->aggr_on_count; i++)
+ for (u32 i = 0; i < p->aggr_on_count; i++)
+ {
switch (cf->aggr_on[i].type)
{
case AGGR_ITEM_TERM:
- if (!f_same(cf->aggr_on[i].line, p->aggr_on[i].line))
- return 0;
- break;
+ if (!f_same(cf->aggr_on[i].line, p->aggr_on[i].line))
+ return 0;
+ break;
case AGGR_ITEM_STATIC_ATTR:
- if (memcmp(&cf->aggr_on[i].sa, &p->aggr_on[i].sa, sizeof(struct f_static_attr)) != 0)
- return 0;
- break;
+ if (memcmp(&cf->aggr_on[i].sa, &p->aggr_on[i].sa, sizeof(struct f_static_attr)) != 0)
+ return 0;
+ break;
case AGGR_ITEM_DYNAMIC_ATTR:
- if (memcmp(&cf->aggr_on[i].da, &p->aggr_on[i].da, sizeof(struct f_dynamic_attr)) != 0)
- return 0;
- break;
+ if (memcmp(&cf->aggr_on[i].da, &p->aggr_on[i].da, sizeof(struct f_dynamic_attr)) != 0)
+ return 0;
+ break;
default:
- bug("Broken aggregator rule");
+ bug("Broken aggregator rule");
}
+ }
/* Compare merge filter */
if (!f_same(cf->merge_by, p->merge_by))
return 1;
}
+static void
+aggregator_get_status(struct proto *P, byte *buf)
+{
+ struct aggregator_proto *p = SKIP_BACK(struct aggregator_proto, p, P);
+
+ if (p->p.proto_state == PS_DOWN)
+ buf[0] = 0;
+ else
+ {
+ if (p->aggr_mode == PREFIX_AGGR)
+ strcpy(buf, "prefix aggregation");
+ else
+ strcpy(buf, "net aggregation");
+ }
+}
+
struct protocol proto_aggregator = {
- .name = "Aggregator",
- .template = "aggregator%d",
- .class = PROTOCOL_AGGREGATOR,
- .preference = 1,
- .channel_mask = NB_ANY,
- .proto_size = sizeof(struct aggregator_proto),
- .config_size = sizeof(struct aggregator_config),
- .postconfig = aggregator_postconfig,
- .init = aggregator_init,
- .start = aggregator_start,
- .shutdown = aggregator_shutdown,
- .reconfigure = aggregator_reconfigure,
+ .name = "Aggregator",
+ .template = "aggregator%d",
+ .class = PROTOCOL_AGGREGATOR,
+ .preference = 1,
+ .channel_mask = NB_ANY,
+ .proto_size = sizeof(struct aggregator_proto),
+ .config_size = sizeof(struct aggregator_config),
+ .postconfig = aggregator_postconfig,
+ .init = aggregator_init,
+ .start = aggregator_start,
+ .shutdown = aggregator_shutdown,
+ .cleanup = aggregator_cleanup,
+ .reconfigure = aggregator_reconfigure,
+ .get_status = aggregator_get_status,
};
void
/*
* BIRD -- Aggregator Pseudoprotocol
*
- * (c) 2023 Igor Putovny <igor.putovny@nic.cz>
- * (c) 2023 Maria Matejka <mq@ucw.cz>
- * (c) 2023 CZ.NIC z.s.p.o.
+ * (c) 2023--2025 Igor Putovny <igor.putovny@nic.cz>
+ * (c) 2023--2025 Maria Matejka <mq@ucw.cz>
+ * (c) 2025 CZ.NIC z.s.p.o.
*
* Can be freely distributed and used under the terms of the GNU GPL.
- *
- * This file contains the data structures used by Babel.
*/
#ifndef _BIRD_AGGREGATOR_H_
#include "nest/protocol.h"
#include "lib/hash.h"
+#define BUCKET_LIST_INIT_SIZE 16
+#define POTENTIAL_BUCKETS_BITMAP_SIZE 8
+#define MAX_POTENTIAL_BUCKETS_COUNT ((int)(sizeof(u32) * 8 * POTENTIAL_BUCKETS_BITMAP_SIZE))
+
+#define IP4_WITHDRAWAL_MAX_EXPECTED_LIMIT 100
+#define IP6_WITHDRAWAL_MAX_EXPECTED_LIMIT 200
+
+enum aggregation_mode {
+ NET_AGGR, PREFIX_AGGR,
+};
+
struct aggregator_config {
struct proto_config c;
struct channel_config *src, *dst;
- uint aggr_on_count;
- uint aggr_on_da_count;
+ enum aggregation_mode aggr_mode;
+ u32 aggr_on_count;
+ u32 aggr_on_da_count;
struct aggr_item *aggr_on;
const struct f_line *merge_by;
+ int logging;
};
struct aggregator_route {
struct aggregator_bucket {
struct aggregator_bucket *next_hash;
- struct rte *rte; /* Pointer to struct aggregator_route.rte */
- struct rte_src *last_src; /* Which src we announced the bucket last with */
+ struct rte *rte; /* Pointer to struct aggregator_route.rte */
+ struct rte_src *last_src; /* Which src we announced the bucket last with */
u32 count;
u32 hash;
+ u32 id;
struct f_val aggr_data[0];
};
+struct rte_withdrawal_item {
+ struct rte_withdrawal_item *next;
+ struct aggregator_bucket *bucket;
+ struct net_addr addr;
+};
+
struct aggregator_proto {
struct proto p;
struct channel *src, *dst;
+ enum aggregation_mode aggr_mode;
/* Buckets by aggregator rule */
HASH(struct aggregator_bucket) buckets;
- slab *bucket_slab;
+ struct linpool *bucket_pool;
/* Routes by net and src */
HASH(struct aggregator_route) routes;
- slab *route_slab;
+ struct linpool *route_pool;
/* Aggregator rule */
- uint aggr_on_count;
- uint aggr_on_da_count;
+ u32 aggr_on_count;
+ u32 aggr_on_da_count;
struct aggr_item *aggr_on;
/* Merge filter */
const struct f_line *merge_by;
event reload_buckets;
+
+ /* Aggregation trie */
+ u32 addr_type;
+ struct trie_node *root;
+ struct slab *trie_slab;
+ int initial_feed;
+ int logging;
+
+ /* Array of bucket pointers */
+ struct aggregator_bucket **bucket_list;
+ size_t bucket_list_size;
+ size_t bucket_list_count;
+
+ /* Bucket IDs */
+ struct hmap bucket_id_map;
+
+ /* Route withdrawal */
+ struct rte_withdrawal_item *rte_withdrawal_stack;
+ struct linpool *rte_withdrawal_pool;
+ int rte_withdrawal_count;
};
enum aggr_item_type {
struct aggr_item i;
};
+enum fib_status {
+ UNASSIGNED_FIB,
+ IN_FIB,
+ NON_FIB,
+};
+
+enum prefix_origin {
+ FILLER,
+ ORIGINAL,
+ AGGREGATED,
+};
+
+struct trie_node {
+ struct trie_node *parent;
+ struct trie_node *child[2];
+ struct trie_node *ancestor;
+ struct aggregator_bucket *original_bucket;
+ struct aggregator_bucket *selected_bucket;
+ enum fib_status status;
+ enum prefix_origin px_origin;
+ u32 potential_buckets[POTENTIAL_BUCKETS_BITMAP_SIZE];
+ int potential_buckets_count;
+ int depth;
+};
+
+void aggregator_aggregate(struct aggregator_proto *p);
+void aggregator_recompute(struct aggregator_proto *p, struct aggregator_route *old, struct aggregator_route *new);
+void aggregator_bucket_update(struct aggregator_proto *p, struct aggregator_bucket *bucket, struct network *net);
+
+struct trie_node *aggregator_root_init(struct aggregator_bucket *bucket, struct slab *trie_slab);
+
#endif
--- /dev/null
+
+debug protocols all;
+debug channels all;
+
+log "bird.log" all;
+
+ipv4 table aggregated;
+
+protocol device {}
+#router id 1;
+
+attribute int distinguisher;
+
+protocol static {
+ ipv4 {
+ import filter { distinguisher = 9; accept; };
+ };
+
+ route 10.100.0.0/16 unreachable;
+}
+
+protocol aggregator {
+ table master4;
+ peer table aggregated;
+ export filter { print net; accept; };
+ aggregate on distinguisher, net;
+ merge by { accept; };
  # default route unreachable;
+}
+
CF_DECLS
-CF_KEYWORDS(AGGREGATOR, PEER, AGGREGATE, ON, MERGE, BY)
+CF_KEYWORDS(AGGREGATOR, PEER, AGGREGATE, ON, MERGE, BY, RELOAD, AFTER, LOG, ALL)
%type <ai> aggr_item aggr_list
this_channel = AGGREGATOR_CFG->src = channel_config_new(NULL, "source", 0, this_proto);
AGGREGATOR_CFG->dst = channel_config_new(NULL, "destination", 0, this_proto);
- AGGREGATOR_CFG->src->ra_mode = AGGREGATOR_CFG->dst->ra_mode = RA_ANY;
+ /*
+ * Aggregation mode is set to prefix aggregation by default, in which case we want to receive
+ * updates with the best routes.
+ */
+ AGGREGATOR_CFG->aggr_mode = PREFIX_AGGR;
+ AGGREGATOR_CFG->src->ra_mode = RA_OPTIMAL;
+ AGGREGATOR_CFG->dst->ra_mode = RA_ANY;
+ AGGREGATOR_CFG->logging = 0;
};
aggregator_proto_item:
if (AGGREGATOR_CFG->aggr_on)
cf_error("Only one aggregate on clause allowed");
- bool net_present = 0;
int count = 0;
for (const struct aggr_item_node *item = $3; item; item = item->next) {
-// log(L_WARN "type %d sacode %d", item->i.type, item->i.sa.sa_code);
- if (item->i.type == AGGR_ITEM_STATIC_ATTR && item->i.sa.sa_code == SA_NET)
- net_present = 1;
+ /*
+ * If NET attribute is present, aggregate routes within the same net
+ * and receive updates with any routes.
+ */
+ if (item->i.type == AGGR_ITEM_STATIC_ATTR && item->i.sa.sa_code == SA_NET) {
+ AGGREGATOR_CFG->aggr_mode = NET_AGGR;
+ AGGREGATOR_CFG->src->ra_mode = RA_ANY;
+ }
count++;
}
- if (!net_present)
- cf_error("'NET' must be present");
-
AGGREGATOR_CFG->aggr_on = cfg_alloc(sizeof(struct aggr_item) * count);
int pos = 0;
+
for (const struct aggr_item_node *item = $3; item; item = item->next) {
if (item->i.type == AGGR_ITEM_DYNAMIC_ATTR)
AGGREGATOR_CFG->aggr_on_da_count++;
$4->args++;
AGGREGATOR_CFG->merge_by = $4;
}
+ | LOG ALL { AGGREGATOR_CFG->logging = 1; }
;
aggregator_proto_opts: /* empty */ | aggregator_proto_opts aggregator_proto_item ';' ;
-aggregator_proto: aggregator_proto_start proto_name '{' aggregator_proto_opts '}' ;
+aggregator_proto: aggregator_proto_start proto_name '{' aggregator_proto_opts '}' {
+ if (AGGREGATOR_CFG->src->table->addr_type != AGGREGATOR_CFG->dst->table->addr_type)
+ cf_error("Both rtables in aggregator must have the same network type");
+ if (PREFIX_AGGR == AGGREGATOR_CFG->aggr_mode)
+ if (AGGREGATOR_CFG->src->table->addr_type != NET_IP4 && AGGREGATOR_CFG->src->table->addr_type != NET_IP6)
+ cf_error("Trie aggregation is available only for IP4 or IPv6 networks");
+};
aggr_list:
aggr_item
--- /dev/null
#!/bin/bash

# Build BIRD and populate a standalone test directory with the bird/birdc
# binaries and the aggregator test configuration.
# Fixes: abort on any failed command (set -e), check cd results (SC2164),
# and test for a directory (-d) rather than mere existence (-e) so a stray
# file with the same name is caught by mkdir.

set -euo pipefail

dir_name="aggregator-test-case"

cd ../.. || exit 1
make all
echo '------------------------'

if [[ ! -d "$dir_name" ]]; then
  echo "creating directory 'bird/$dir_name'"
  mkdir "$dir_name"
fi

echo 'copying bird executable'
cp ./bird "./$dir_name"

echo 'copying birdc executable'
cp ./birdc "./$dir_name"

echo 'copying bird.conf'
cp ./proto/aggregator/bird.conf "./$dir_name"

cd "$dir_name" || exit 1

echo 'done'
echo
echo 'expected result:'
echo '====THIRD PASS===='
echo '0.0.0.0/0'
echo '10.100.0.0/16'
--- /dev/null
+/*
+ * BIRD Internet Routing Daemon -- Prefix aggregation
+ *
+ * (c) 2023--2025 Igor Putovny <igor.putovny@nic.cz>
+ * (c) 2025 CZ.NIC, z.s.p.o.
+ *
+ * Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+/**
+ * DOC: Aggregator protocol trie
+ *
+ * Prefix aggregation implements the ORTC (Optimal Route Table Construction)
+ * algorithm [1].
 * (Sources and further literature are listed in the References section below.)
+ *
+ * This algorithm uses a binary tree representation of the routing table.
+ * An edge from the parent node to its left child represents bit 0, and
+ * an edge from the parent node to its right child represents bit 1 as the
+ * prefix is traversed from the most to the least significant bit. Last node
+ * of every prefix contains original bucket where the route for this prefix
+ * belongs.
+ *
+ * Prefixes are therefore represented as a path through the trie, beginning at
+ * the root node. The last node on this path is called prefix node.
+ *
 * TODO: clearly distinguish the description of the original algorithm from what this implementation actually does
+ *
+ * ORTC algorithm as described in the original paper consists of three passes
+ * through the trie.
+ *
+ * The first pass adds new nodes to the trie so that every node has either two
+ * or zero children. During this pass, routing information is propagated to the
+ * leaves.
+ *
+ * The second pass finds the most prevalent buckets by pushing information from
+ * the leaves up towards the root. Each node is assigned a set of potential
+ * buckets. If there are any common buckets among the node's children, they
+ * are carried to the parent node. Otherwise, all of children's buckets are
+ * carried to the parent node.
+ *
+ * The third pass moves down the trie while deciding which prefixes will be
+ * exported to the FIB. The node inherits a bucket from its closest ancestor
+ * that has a bucket. If the inherited bucket is one of potential buckets of
+ * this node, then this node does not need a bucket and its prefix will not
+ * be in FIB. Otherwise the node does need a bucket and any of its potential
+ * buckets can be chosen. We always choose the bucket with the lowest ID.
+ * This prefix will go to the FIB.
+ *
+ * Algorithm works with the assumption that there is a default route.
+ *
+ * The following is a description of this implementation.
+ *
+ * The trie contains three different kinds of nodes: original, aggregated and
+ * fillers. Original nodes represent prefixes from the original (import)
+ * routing table. Aggregated nodes represent prefixes that do not exist in the
+ * original table but exist in the aggregated (export) table, as they are result
+ * of the aggregation. Filler nodes are neither, they exist in the trie but do
+ * not represent any prefixes in original or aggregated table.
+ *
+ * Each node has a FIB status flag signalling whether this prefix was exported
+ * to the FIB (IN_FIB) or not (NON_FIB). It is clear that IN_FIB nodes can be
+ * either original or aggregated, whereas NON_FIB nodes can be either original
+ * or fillers.
+ *
+ * Every node contains pointer to its closest IN_FIB ancestor.
+ *
+ * After every aggregation, following invariants are always satisfied:
+ * 1. No original bucket can be null.
+ * 2. No ancestor pointer can be null.
+ * 3. If a node is IN_FIB, then
+ * a) its selected bucket must not be null,
+ * b) its ancestor pointer must point to itself,
+ * c) its origin must be ORIGINAL or AGGREGATED.
+ * 4. If a node is NON_FIB, then
+ * a) its selected bucket must be null,
+ * b) its ancestor pointer must point to the nearest IN_FIB ancestor,
+ * c) its origin must be ORIGINAL or FILLER.
+ *
+ * Our implementation differs from the algorithm as described in the original
+ * paper in several aspects. First, we do not normalize the trie by adding new
+ * nodes so that every node has either zero or two children. Second, propagation
+ * of original buckets, which was formerly done during first pass, is now done
+ * in the second pass. First pass is completely omitted.
+ * The two phases of aggregation are named propagate_and_merge() for first and
+ * second pass and group_prefixes() for third pass.
+ *
+ * Aggregator is capable of processing incremental updates. After receiving
+ * an update, which can be either announce or withdraw, corresponding node
+ * is found in the trie and its original bucket is updated. Trie now needs to
+ * be recomputed to reflect this update.
+ * Trie is traversed from the updated node upwards until its closest IN_FIB
+ * ancestor is found. This is the prefix node that covers an address space which
+ * is affected by received update. This is followed by propagate_and_merge(),
+ * which propagates potential buckets from the leaves upwards. Merging of sets
+ * of potential buckets continues upwards until the node's set is not changed by
+ * this operation. Finally, the third pass runs from this node, finishing the
+ * aggregation. During the third pass, changes in prefix FIB status are detected
+ * and routes are exported or removed from the routing table accordingly. All
 * new routes are exported immediately, whereas routes that are to be
 * withdrawn are pushed on the stack and removed after recomputing the trie.
+ *
+ * References:
+ * [1] R. P. Draves, C. King, S. Venkatachary and B. D. Zill. Constructing
+ * Optimal IP Routing Tables. In Proceedings of IEEE INFOCOM, volume 1,
+ * pages 88-97, 1999.
+ * [2] Z. A. Uzmi, M. Nebel, A. Tariq, S. Jawad, R. Chen, A. Shaikh, J. Wang,
+ * P. Francis. Practical and Near-Optimal FIB Aggregation using SMALTA.
+ * In Proceedings of CoNEXT, 2011.
+ * [3] Y. Liu, B. Zhang, L. Wang. FIFA: Fast Incremental FIB Aggregation.
+ * In Proceedings of IEEE INFOCOM, 2013.
+ * [4] Y. Liu, X. Zhao, K. Nam, L. Wang, B. Zhang. Incremental Forwarding
+ * Table Aggregation. In Proceedings of IEEE GLOBECOM, 2010.
+ * [5] X. Zhao, Y. Liu, L. Wang, B. Zhang. On the Aggregatability of Router
+ * Forwarding Tables. In Proceedings of IEEE INFOCOM, 2010.
+ *
+ */
+
+#undef LOCAL_DEBUG
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include "nest/bird.h"
+#include "filter/filter.h"
+#include "proto/aggregator/aggregator.h"
+
+#include <stdbool.h>
+
/*
 * Human-readable names for enum prefix_origin values, used when dumping
 * the contents of the trie. Nodes are either original (representing input
 * prefixes), aggregated (prefixes created by aggregation), or fillers,
 * which are neither.
 */
static const char *px_origin_str[] = {
  [FILLER] = "filler",
  [ORIGINAL] = "original",
  [AGGREGATED] = "aggregated",
};
+
/*
 * We use ip6_addr (under its alias ip_addr) to hold both IPv4 and IPv6
 * addresses. When indexing individual bits of these addresses, an offset
 * of 96 must be added for IPv4, because an IPv4 address is stored in the
 * lowest 32 bits of ip6_addr, whereas IPv6 occupies all 128 bits.
 */
static const u32 ipa_shift[] = {
  [NET_IP4] = IP6_MAX_PREFIX_LENGTH - IP4_MAX_PREFIX_LENGTH,
  [NET_IP6] = 0,
};
+
+/*
+ * Allocate and initialize root node
+ */
+struct trie_node *
+aggregator_root_init(struct aggregator_bucket *bucket, struct slab *trie_slab)
+{
+ struct trie_node *root = sl_allocz(trie_slab);
+
+ *root = (struct trie_node) {
+ .original_bucket = bucket,
+ .status = NON_FIB,
+ .px_origin = ORIGINAL,
+ .depth = 0,
+ };
+
+ return root;
+}
+
+static inline int
+aggregator_is_leaf(const struct trie_node *node)
+{
+ ASSERT_DIE(node != NULL);
+ return !node->child[0] && !node->child[1];
+}
+
+/*
+ * Unlink node from the trie by setting appropriate child of parent node to NULL
+ * and free memory.
+ */
+static inline void
+aggregator_remove_node(struct trie_node *node)
+{
+ ASSERT_DIE(node != NULL);
+ ASSERT_DIE(node->child[0] == NULL && node->child[1] == NULL);
+
+ if (!node->parent)
+ ;
+ else
+ {
+ if (node->parent->child[0] == node)
+ {
+ node->parent->child[0] = NULL;
+ ASSERT_DIE(node->parent->child[1] != node);
+ }
+ else if (node->parent->child[1] == node)
+ {
+ node->parent->child[1] = NULL;
+ ASSERT_DIE(node->parent->child[0] != node);
+ }
+ else
+ bug("Corrupted memory (node is not its parent's child)");
+ }
+
+ sl_free(node);
+}
+
+/*
+ * Insert @bucket to the set of potential buckets in @node
+ */
+static inline void
+aggregator_node_add_potential_bucket(struct trie_node *node, const struct aggregator_bucket *bucket)
+{
+ ASSERT_DIE(node->potential_buckets_count < MAX_POTENTIAL_BUCKETS_COUNT);
+
+ if (BIT32R_TEST(node->potential_buckets, bucket->id))
+ return;
+
+ BIT32R_SET(node->potential_buckets, bucket->id);
+ node->potential_buckets_count++;
+}
+
/*
 * Check whether the bucket with ID @id is one of the potential buckets
 * of @node.
 */
static inline int
aggregator_is_bucket_potential(const struct trie_node *node, u32 id)
{
  /* TODO(review): callers mix passing bucket pointers and bare IDs around
   * these helpers -- consider passing just the bucket ID everywhere */

  ASSERT_DIE(node != NULL);

  ASSERT_DIE(id < MAX_POTENTIAL_BUCKETS_COUNT);
  return BIT32R_TEST(node->potential_buckets, id);
}
+
+/*
+ * Return pointer to bucket with ID @id.
+ * Protocol contains list of pointers to all buckets. Every pointer
+ * lies at position equal to bucket ID to enable fast lookup.
+ */
+static inline struct aggregator_bucket *
+aggregator_get_bucket_from_id(const struct aggregator_proto *p, u32 id)
+{
+ ASSERT_DIE(id < p->bucket_list_size);
+ ASSERT_DIE(p->bucket_list[id] != NULL);
+ ASSERT_DIE(p->bucket_list[id]->id == id);
+ return p->bucket_list[id];
+}
+
+/*
+ * Select bucket with the lowest ID from the set of node's potential buckets
+ */
+static inline struct aggregator_bucket *
+aggregator_select_lowest_id_bucket(const struct aggregator_proto *p, const struct trie_node *node)
+{
+ ASSERT_DIE(p != NULL);
+ ASSERT_DIE(node != NULL);
+
+ for (int i = 0; i < POTENTIAL_BUCKETS_BITMAP_SIZE; i++)
+ {
+ if (node->potential_buckets[i] == 0)
+ continue;
+
+ /*
+ * Use CLZ -- Count Leading Zeroes to find first set bit.
+ * Compute its position from the beginning of the array.
+ */
+ u32 id = u32_clz(node->potential_buckets[i]) + i * 32;
+
+ struct aggregator_bucket *bucket = aggregator_get_bucket_from_id(p, id);
+ ASSERT_DIE(bucket != NULL);
+ ASSERT_DIE(bucket->id == id);
+
+ return bucket;
+ }
+
+ bug("No potential buckets to choose from");
+}
+
+/*
+ * @target: node we are computing set of potential buckets for
+ * @left, @right: left and right children of @target
+ *
+ * The resulting set is an intersection of sets of @left and @right. If this
+ * intersection is empty, resulting set is an union of @left and @right sets.
+ *
+ * Returns: whether the set of potential buckets in the target node has changed.
+ */
+static bool
+aggregator_merge_potential_buckets(struct trie_node *target, const struct trie_node *left, const struct trie_node *right)
+{
+ ASSERT_DIE(target != NULL);
+ ASSERT_DIE(left != NULL);
+ ASSERT_DIE(right != NULL);
+
+ bool has_intersection = false;
+ bool has_changed = false;
+
+ u32 before[ARRAY_SIZE(target->potential_buckets)] = { 0 };
+
+ target->potential_buckets_count = 0;
+
+ /* First we try to compute intersection. If it exists, we want to keep it. */
+ for (int i = 0; i < POTENTIAL_BUCKETS_BITMAP_SIZE; i++)
+ {
+ /* Save current bitmap values */
+ before[i] = target->potential_buckets[i];
+
+ /* Compute intersection */
+ target->potential_buckets[i] = left->potential_buckets[i] & right->potential_buckets[i];
+ target->potential_buckets_count += u32_popcount(target->potential_buckets[i]);
+
+ if (target->potential_buckets[i] != 0)
+ has_intersection = true;
+
+ if (before[i] != target->potential_buckets[i])
+ has_changed = true;
+ }
+
+ /* Intersection found */
+ if (has_intersection)
+ return has_changed;
+
+ /* Sets have an empty intersection, compute their union instead */
+ target->potential_buckets_count = 0;
+ has_changed = false;
+
+ for (int i = 0; i < POTENTIAL_BUCKETS_BITMAP_SIZE; i++)
+ {
+ target->potential_buckets[i] = left->potential_buckets[i] | right->potential_buckets[i];
+ target->potential_buckets_count += u32_popcount(target->potential_buckets[i]);
+
+ if (before[i] != target->potential_buckets[i])
+ has_changed = true;
+ }
+
+ return has_changed;
+}
+
/*
 * Recursively dump the subtrie rooted at @node into the log, one node per
 * line. @prefix and @pxlen describe the address of @node; bits of @prefix
 * are set and cleared in place while descending. @buf is a scratch log
 * buffer reused for every line.
 */
static void
aggregator_dump_trie_helper(const struct aggregator_proto *p, const struct trie_node *node, ip_addr *prefix, u32 pxlen, struct buffer *buf)
{
  ASSERT_DIE(p != NULL);
  ASSERT_DIE(node != NULL);
  ASSERT_DIE(prefix != NULL);

  /* Reset the scratch buffer before formatting this node's line */
  memset(buf->start, 0, buf->pos - buf->start);
  buf->pos = buf->start;

  struct net_addr addr = { 0 };
  net_fill_ipa(&addr, *prefix, pxlen);

  /* Indent by depth; nodes exported to FIB are marked with '@' */
  buffer_print(buf, "%*s%s%N ", 2 * node->depth, "", (node->status == IN_FIB) ? "@" : " ", &addr);

  /* Original bucket ID in single brackets; empty brackets for fillers */
  if (node->original_bucket)
    buffer_print(buf, "[%u] ", node->original_bucket->id);
  else
    buffer_print(buf, "[] ");

  buffer_print(buf, "{");

  /* List the IDs of all potential buckets by walking set bits of the bitmap */
  for (int i = 0, j = 0; i < POTENTIAL_BUCKETS_BITMAP_SIZE; i++)
  {
    if (node->potential_buckets[i] == 0)
      continue;

    u32 item = node->potential_buckets[i];

    while (item != 0)
    {
      /* Find first set bit (CLZ -- Count Leading Zeroes) */
      int bitpos = u32_clz(item);

      /* Compute ID as offset from the beginning of array */
      u32 id = i * 32 + (u32)bitpos;

      buffer_print(buf, "%u", id);
      j++;

      /* Comma-separate all but the last ID */
      if (j < node->potential_buckets_count)
        buffer_print(buf, ", ");

      /* Clear first set bit and continue */
      u32 mask = 1U << (32 - bitpos - 1);
      item &= ~mask;
    }
  }

  buffer_print(buf, "}");

  /* Selected (exported) bucket ID in double brackets, if any */
  if (node->selected_bucket)
    buffer_print(buf, " -> [[%u]]", node->selected_bucket->id);

  buffer_print(buf, " %p %s", node, px_origin_str[node->px_origin]);
  log("%s", buf->start);

  /* Recurse into children, extending the prefix by one bit (0 left, 1 right) */
  if (node->child[0])
  {
    ASSERT_DIE((u32)node->depth == pxlen);
    ip6_clrbit(prefix, node->depth + ipa_shift[p->addr_type]);
    aggregator_dump_trie_helper(p, node->child[0], prefix, pxlen + 1, buf);
  }

  if (node->child[1])
  {
    ASSERT_DIE((u32)node->depth == pxlen);
    ip6_setbit(prefix, node->depth + ipa_shift[p->addr_type]);
    aggregator_dump_trie_helper(p, node->child[1], prefix, pxlen + 1, buf);
    ip6_clrbit(prefix, node->depth + ipa_shift[p->addr_type]);
  }
}
+
+static void
+aggregator_dump_trie(const struct aggregator_proto *p)
+{
+ ip_addr prefix = (p->addr_type == NET_IP4) ? ipa_from_ip4(IP4_NONE) : ipa_from_ip6(IP6_NONE);
+
+ struct buffer buf = { 0 };
+ LOG_BUFFER_INIT(buf);
+
+ log("==== TRIE BEGIN ====");
+ aggregator_dump_trie_helper(p, p->root, &prefix, 0, &buf);
+ log("==== TRIE END ====");
+}
+
/*
 * Create and export a route for @prefix/@pxlen originating in @bucket.
 * NOTE(review): allocz() appears to allocate the temporary network
 * structure with automatic lifetime -- confirm aggregator_bucket_update()
 * does not retain the pointer past this call.
 */
static inline void
aggregator_create_route(struct aggregator_proto *p, ip_addr prefix, u32 pxlen, struct aggregator_bucket *bucket)
{
  struct net_addr addr = { 0 };
  net_fill_ipa(&addr, prefix, pxlen);

  /* Temporary network structure carrying the address to the exporter */
  struct network *n = allocz(sizeof(*n) + sizeof(addr));
  net_copy(n->n.addr, &addr);

  aggregator_bucket_update(p, bucket, n);
}
+
+/*
+ * Prepare to withdraw route for @prefix
+ */
+static void
+aggregator_prepare_rte_withdrawal(struct aggregator_proto *p, ip_addr prefix, u32 pxlen, struct aggregator_bucket *bucket)
+{
+ ASSERT_DIE(p != NULL);
+ ASSERT_DIE(bucket != NULL);
+
+ /* Allocate the item */
+ struct rte_withdrawal_item *item = lp_allocz(p->rte_withdrawal_pool, sizeof(*item));
+
+ /* Fill in net and bucket */
+ struct net_addr addr = { 0 };
+ net_fill_ipa(&addr, prefix, pxlen);
+ net_copy(&item->addr, &addr);
+
+ item->bucket = bucket;
+
+ /* Push item onto stack */
+ item->next = p->rte_withdrawal_stack,
+ p->rte_withdrawal_stack = item;
+ p->rte_withdrawal_count++;
+}
+
+/*
+ * Insert @prefix to the trie and assign @bucket to this prefix. If the prefix
+ * is already in the trie, update its bucket to @bucket and return updated node.
+ */
+static struct trie_node *
+aggregator_trie_insert_prefix(struct aggregator_proto *p, ip_addr prefix, u32 pxlen, struct aggregator_bucket *bucket)
+{
+ ASSERT_DIE(p != NULL);
+ ASSERT_DIE(bucket != NULL);
+
+ struct trie_node *node = p->root;
+
+ for (u32 i = 0; i < pxlen; i++)
+ {
+ u32 bit = ip6_getbit(prefix, i + ipa_shift[p->addr_type]);
+
+ /* Add filler nodes onto the path to the actual prefix node */
+ if (!node->child[bit])
+ {
+ struct trie_node *new = sl_allocz(p->trie_slab);
+
+ *new = (struct trie_node) {
+ .parent = node,
+ .status = NON_FIB,
+ .px_origin = FILLER,
+ .depth = node->depth + 1,
+ };
+
+ node->child[bit] = new;
+ }
+
+ node = node->child[bit];
+ }
+
+ /* Assign bucket to the last node */
+ node->original_bucket = bucket;
+ node->px_origin = ORIGINAL;
+
+ return node;
+}
+
/*
 * Remove @prefix from the trie and return the last affected node.
 * The node is only downgraded to a filler here, not deleted -- see the
 * comment below for why upward cleanup must wait.
 */
static struct trie_node *
aggregator_trie_remove_prefix(struct aggregator_proto *p, ip_addr prefix, u32 pxlen)
{
  struct trie_node *node = p->root;

  /* Walk down the prefix path; every node on it must already exist */
  for (u32 i = 0; i < pxlen; i++)
  {
    u32 bit = ip6_getbit(prefix, i + ipa_shift[p->addr_type]);
    node = node->child[bit];
    ASSERT_DIE(node != NULL);
  }

  ASSERT_DIE(node->px_origin == ORIGINAL);
  ASSERT_DIE((u32)node->depth == pxlen);

  /*
   * Even though this function is called to remove prefix from the trie, we
   * can only change its origin from original to filler. Node itself cannot be
   * removed just yet. If it was removed, we would lose information about the
   * input data which are used by the algorithm. This information is essential
   * for correctly recomputing the trie. If the algorithm decides the node is
   * no longer needed, it will be removed later.
   */
  node->px_origin = FILLER;
  node->ancestor = NULL;
  node->original_bucket = NULL;
  node->potential_buckets_count = 0;
  memset(node->potential_buckets, 0, sizeof(node->potential_buckets));

  return node;
}
+
/*
 * Reconstruct the prefix corresponding to the position of @target in the
 * trie; the result is written into @prefix (bits set/cleared in place) and
 * @pxlen. @type is the network type (NET_IP4/NET_IP6) selecting the bit
 * offset into the ip6_addr representation.
 */
static void
aggregator_find_subtree_prefix(const struct trie_node *target, ip_addr *prefix, u32 *pxlen, u32 type)
{
  ASSERT_DIE(target != NULL);
  ASSERT_DIE(prefix != NULL);
  ASSERT_DIE(pxlen != NULL);

  /* Bits of the path recorded bottom-up, then replayed top-down */
  int path[IP6_MAX_PREFIX_LENGTH] = { 0 };
  int pos = 0;
  u32 len = 0;

  const struct trie_node *node = target;
  const struct trie_node *parent = node->parent;

  /* Ascend to the root node, recording which child each node was */
  while (parent)
  {
    if (node == node->parent->child[0])
      path[pos++] = 0;
    else if (node == node->parent->child[1])
      path[pos++] = 1;
    else
      bug("Corrupted memory (node is not its parent's child)");

    ASSERT_DIE(pos < IP6_MAX_PREFIX_LENGTH);
    node = parent;
    parent = node->parent;
  }

  ASSERT_DIE(node->parent == NULL);

  /* Descend back to the target node, applying the recorded bits in reverse */
  for (int i = pos - 1; i >= 0; i--)
  {
    if (path[i] == 0)
      ip6_clrbit(prefix, node->depth + ipa_shift[type]);
    else if (path[i] == 1)
      ip6_setbit(prefix, node->depth + ipa_shift[type]);

    ASSERT_DIE(node->child[path[i]] != NULL);
    node = node->child[path[i]];

    len++;
    ASSERT_DIE((u32)node->depth == len);
  }

  ASSERT_DIE(node == target);
  *pxlen = len;
}
+
/*
 * First and second pass of Optimal Route Table Construction (ORTC) algorithm
 *
 * This function performs two tasks. First, it propagates original buckets from
 * target node to the leaves. Original bucket from prefix node is assigned to
 * all his descendants in a downward direction until another original node is
 * reached. Second, it merges sets of potential buckets from leaves upward to
 * the target node.
 *
 * TODO(review): add more commentary and consider renaming
 * (aggregator_propagate_update?)
 */
static void
aggregator_propagate_and_merge(struct trie_node *node)
{
  ASSERT_DIE(node != NULL);
  ASSERT_DIE(node->status != UNASSIGNED_FIB);
  ASSERT_DIE(node->potential_buckets_count <= MAX_POTENTIAL_BUCKETS_COUNT);

  /* Propagate original buckets from original nodes to their descendants */
  if (node->px_origin != ORIGINAL)
  {
    node->original_bucket = node->parent->original_bucket;

    /*
     * During initial aggregation, there are only original and filler nodes,
     * thus this statement has no effect. When recomputing, aggregated nodes
     * become fillers.
     */
    node->px_origin = FILLER;
  }

  ASSERT_DIE(node->original_bucket != NULL);

  if (aggregator_is_leaf(node))
  {
    /*
     * When running aggregation for the first time, erasing sets is not
     * necessary, because they are empty. However, when recomputing, sets
     * of the leaf nodes must be cleared. Sets in internal nodes don't have
     * to, because they will be overwritten by merging operation.
     */
    node->potential_buckets_count = 0;
    memset(node->potential_buckets, 0, sizeof(node->potential_buckets));

    ASSERT_DIE(node->potential_buckets_count == 0);

    /* Original bucket of leaf nodes is their potential bucket */
    aggregator_node_add_potential_bucket(node, node->original_bucket);
    return;
  }

  struct trie_node *left = node->child[0];
  struct trie_node *right = node->child[1];

  /* Postorder traversal */
  if (left)
    aggregator_propagate_and_merge(left);

  if (right)
    aggregator_propagate_and_merge(right);

  /*
   * Merging sets of potential buckets obviously require node's two children as
   * arguments. Since our implementation doesn't normalize the trie and therefore
   * some nodes may have only one child, we simulate missing node by creating
   * temporary node on stack and using it as an argument for merging.
   */
  struct trie_node imaginary_node = { 0 };

  /* Imaginary node inherits potential bucket from its parent */
  aggregator_node_add_potential_bucket(&imaginary_node, node->original_bucket);

  /* Nodes with only one child */
  if (left && !right)
    right = &imaginary_node;
  else if (!left && right)
    left = &imaginary_node;

  ASSERT_DIE(left != NULL && right != NULL);

  /*
   * If there are no common buckets among children's buckets, parent's
   * buckets are computed as union of its children's buckets.
   * Otherwise, parent's buckets are computed as intersection of its
   * children's buckets.
   */
  aggregator_merge_potential_buckets(node, left, right);
}
+
/*
 * @node: a node with exactly one child
 * @inherited_bucket: selected bucket of the closest ancestor of the target
 * node which is in FIB and thus has a non-null bucket
 * @trie_slab: allocator for newly created nodes
 *
 * Process nodes that have only one child during grouping of prefixes and add
 * new nodes if necessary.
 *
 * Because our implementation doesn't normalize the trie (by adding new nodes
 * so that every node has either two or zero children) during first stage of
 * aggregation, we need to decide if these missing nodes are indeed needed in
 * the trie.
 */
static void
aggregator_process_one_child_nodes(struct trie_node *node, const struct aggregator_bucket *inherited_bucket, struct slab *trie_slab)
{
  ASSERT_DIE(node != NULL);

  /* Imaginary node that would have been added during normalization of the trie */
  struct trie_node imaginary_node = {
    .parent = node,
    .original_bucket = node->original_bucket,
    .status = NON_FIB,
    .px_origin = AGGREGATED,
    .depth = node->depth + 1,
  };

  /* Imaginary node inherits bucket from its parent - current node */
  aggregator_node_add_potential_bucket(&imaginary_node, node->original_bucket);

  /*
   * If the current node (parent of the imaginary node) has a bucket, then
   * the imaginary node inherits this bucket. Otherwise it inherits bucket
   * from the closest ancestor which is IN_FIB and thus has a non-null bucket.
   */
  const struct aggregator_bucket * const imaginary_node_inherited_bucket = (node->status == IN_FIB)
    ? node->selected_bucket
    : inherited_bucket;

  ASSERT_DIE(imaginary_node_inherited_bucket != NULL);

  /*
   * Since this implementation doesn't normalize the trie during first stage
   * of aggregation, we need to know if these nodes are needed in the trie.
   * These nodes are simulated by @imaginary_node. If the bucket that imaginary
   * node inherits from its IN_FIB ancestor is NOT one of its potential buckets,
   * imaginary node needs to be added to the trie because it's not covered
   * by its ancestor.
   */
  if (!aggregator_is_bucket_potential(&imaginary_node, imaginary_node_inherited_bucket->id))
  {
    /* Allocate new node and copy imaginary node into it */
    struct trie_node *new = sl_allocz(trie_slab);
    *new = imaginary_node;

    const struct trie_node * const left = node->child[0];
    const struct trie_node * const right = node->child[1];

    /* Connect new node to the trie on the vacant side */
    if (left && !right)
      node->child[1] = new;
    else
      node->child[0] = new;
  }
}
+
+/*
+ * Export prefix of the current node to FIB and mark node as IN_FIB
+ */
+static void
+aggregator_export_node_prefix(struct aggregator_proto *p, struct trie_node *node, ip_addr prefix, u32 pxlen)
+{
+ ASSERT_DIE(node->potential_buckets_count > 0);
+
+ /* Save old bucket before assigning new */
+ struct aggregator_bucket * const old_bucket = node->selected_bucket;
+
+ /* Select bucket with the lowest ID */
+ node->selected_bucket = aggregator_select_lowest_id_bucket(p, node);
+ ASSERT_DIE(node->selected_bucket != NULL);
+
+ /* Node status is changing from NON_FIB to IN_FIB, export its route */
+ if (node->status != IN_FIB)
+ {
+ aggregator_create_route(p, prefix, pxlen, node->selected_bucket);
+ }
+ else /* Prefix is already in FIB */
+ {
+ ASSERT_DIE(old_bucket != NULL);
+
+ /* Node's bucket has changed, remove old route */
+ if (old_bucket && old_bucket != node->selected_bucket)
+ {
+ aggregator_prepare_rte_withdrawal(p, prefix, pxlen, old_bucket);
+ aggregator_create_route(p, prefix, pxlen, node->selected_bucket);
+ }
+ }
+
+ node->status = IN_FIB;
+ node->ancestor = node;
+
+ /* Original prefix stays original, otherwise it becomes aggregated */
+ node->px_origin = (node->px_origin == ORIGINAL) ? ORIGINAL : AGGREGATED;
+}
+
/*
 * Remove prefix of the current node from FIB and mark node as NON_FIB.
 * If the node was exported, its route is queued for withdrawal.
 * Must not be called on the root (the parent pointer is dereferenced).
 */
static void
aggregator_remove_node_prefix(struct aggregator_proto *p, struct trie_node *node, ip_addr prefix, u32 pxlen)
{
  /* Node status is changing from IN_FIB to NON_FIB, withdraw its route */
  if (node->status == IN_FIB)
  {
    ASSERT_DIE(node->selected_bucket != NULL);
    aggregator_prepare_rte_withdrawal(p, prefix, pxlen, node->selected_bucket);
  }

  node->selected_bucket = NULL;
  node->status = NON_FIB;
  /* Inherit the nearest IN_FIB ancestor from the parent */
  node->ancestor = node->parent->ancestor;

  /*
   * Original prefix stays original, otherwise it was aggregated and becomes
   * a filler
   */
  node->px_origin = (node->px_origin == ORIGINAL) ? ORIGINAL : FILLER;
}
+
/*
 * This function moves from the target node downwards to the leaves and
 * decides which prefixes are the result of the aggregation and will be
 * exported to the FIB.
 * Each node (except root node) is covered by one of its ancestors. We can say
 * that each node "inherits" selected bucket from one of its ancestors. If this
 * inherited bucket is one of the node's potential buckets, then this prefix
 * will not go to the FIB, because its address space is already covered by some
 * shorter prefix. However, if the inherited bucket is not one of the node's
 * potential buckets, then a bucket for this node is chosen from its set and
 * the prefix is exported to the FIB.
 */
static void
aggregator_group_prefixes_helper(struct aggregator_proto *p, struct trie_node *node, ip_addr *prefix, u32 pxlen)
{
  ASSERT_DIE(node != NULL);
  ASSERT_DIE(node->status != UNASSIGNED_FIB);
  ASSERT_DIE(node->potential_buckets_count <= MAX_POTENTIAL_BUCKETS_COUNT);

  ASSERT_DIE(node->original_bucket != NULL);
  ASSERT_DIE(node->parent->ancestor != NULL);
  ASSERT_DIE(node->parent->ancestor->selected_bucket != NULL);

  /* Bucket inherited from the closest ancestor with a non-null selected bucket */
  const struct aggregator_bucket * const inherited_bucket = node->parent->ancestor->selected_bucket;

  /*
   * If the bucket inherited from the ancestor is one of potential buckets
   * of the current node, then this node doesn't need a bucket because it
   * inherits one, and its prefix is thus not needed in FIB.
   */
  if (aggregator_is_bucket_potential(node, inherited_bucket->id))
    aggregator_remove_node_prefix(p, node, *prefix, pxlen);
  else
    aggregator_export_node_prefix(p, node, *prefix, pxlen);

  /* FIB status and selected bucket must agree after the decision above */
  ASSERT_DIE((node->selected_bucket != NULL && node->status == IN_FIB) || (node->selected_bucket == NULL && node->status == NON_FIB));
  ASSERT_DIE(node->ancestor != NULL);
  ASSERT_DIE(node->ancestor->original_bucket != NULL);
  ASSERT_DIE(node->ancestor->selected_bucket != NULL);

  const struct trie_node * const left = node->child[0];
  const struct trie_node * const right = node->child[1];

  /* Process nodes with only one child */
  if ((left && !right) || (!left && right))
    aggregator_process_one_child_nodes(node, inherited_bucket, p->trie_slab);

  /* Preorder traversal */
  if (node->child[0])
  {
    ASSERT_DIE((u32)node->depth == pxlen);
    ip6_clrbit(prefix, node->depth + ipa_shift[p->addr_type]);
    aggregator_group_prefixes_helper(p, node->child[0], prefix, pxlen + 1);
  }

  if (node->child[1])
  {
    ASSERT_DIE((u32)node->depth == pxlen);
    ip6_setbit(prefix, node->depth + ipa_shift[p->addr_type]);
    aggregator_group_prefixes_helper(p, node->child[1], prefix, pxlen + 1);
    ip6_clrbit(prefix, node->depth + ipa_shift[p->addr_type]);
  }

  /* Prune the trie: drop filler leaves that ended up outside the FIB */
  if (node->status == NON_FIB && node->px_origin != ORIGINAL && aggregator_is_leaf(node))
  {
    ASSERT_DIE(node->selected_bucket == NULL);
    aggregator_remove_node(node);
  }
}
+
+/*
+ * Third pass of Optimal Route Table Construction (ORTC) algorithm
+ *
+ * This function represents the final stage of aggregation. It decides which
+ * prefixes will be exported into the FIB. In that case, it selects a bucket
+ * for the target node from the set of its potential buckets and creates a new
+ * route for this prefix. Recursively group prefixes in the subtree rooted
+ * at @node.
+ */
+static void
+aggregator_group_prefixes(struct aggregator_proto *p, struct trie_node *node)
+{
+  ASSERT_DIE(node != NULL);
+  ASSERT_DIE(node->potential_buckets_count <= MAX_POTENTIAL_BUCKETS_COUNT);
+  ASSERT_DIE(node->potential_buckets_count > 0);
+
+  /* Scratch prefix shared by the whole traversal, mutated bit by bit */
+  ip_addr px = (p->addr_type == NET_IP4) ? ipa_from_ip4(IP4_NONE) : ipa_from_ip6(IP6_NONE);
+  u32 plen = 0;
+
+  /*
+   * If this function runs on a subtree and not the whole trie,
+   * find the prefix that covers this subtree.
+   */
+  aggregator_find_subtree_prefix(node, &px, &plen, p->addr_type);
+
+  /* The root of the (sub)trie is always exported */
+  aggregator_export_node_prefix(p, node, px, plen);
+
+  /* Descend into both subtrees, extending the prefix by one bit */
+  for (int i = 0; i < 2; i++)
+  {
+    if (!node->child[i])
+      continue;
+
+    ASSERT_DIE((u32)node->depth == plen);
+
+    if (i)
+      ip6_setbit(&px, node->depth + ipa_shift[p->addr_type]);
+    else
+      ip6_clrbit(&px, node->depth + ipa_shift[p->addr_type]);
+
+    aggregator_group_prefixes_helper(p, node->child[i], &px, plen + 1);
+
+    /* Clear the bit back after the right subtree */
+    if (i)
+      ip6_clrbit(&px, node->depth + ipa_shift[p->addr_type]);
+  }
+}
+
+/*
+ * Check trie consistency and invariants after aggregation.
+ *
+ * An IN_FIB node must carry a selected bucket and be its own ancestor;
+ * a NON_FIB node must carry no selected bucket and share its ancestor
+ * with its parent. Any other status is a bug. Checks the whole subtree
+ * rooted at @node recursively.
+ */
+static void
+check_trie_after_aggregation(const struct trie_node *node)
+{
+  ASSERT_DIE(node != NULL);
+  ASSERT_DIE(node->ancestor != NULL);
+
+  switch (node->status)
+  {
+  case IN_FIB:
+    /* Exported nodes select their own bucket and anchor the ancestor chain */
+    ASSERT_DIE(node->px_origin == ORIGINAL || node->px_origin == AGGREGATED);
+    ASSERT_DIE(node->selected_bucket != NULL);
+    ASSERT_DIE(node->ancestor == node);
+    break;
+
+  case NON_FIB:
+    /* Non-exported nodes inherit the ancestor from their parent */
+    ASSERT_DIE(node->px_origin == ORIGINAL || node->px_origin == FILLER);
+    ASSERT_DIE(node->selected_bucket == NULL);
+    ASSERT_DIE(node->ancestor != node);
+    ASSERT_DIE(node->ancestor == node->parent->ancestor);
+    break;
+
+  default:
+    bug("Unknown node status");
+  }
+
+  if (node->child[0])
+    check_trie_after_aggregation(node->child[0]);
+
+  if (node->child[1])
+    check_trie_after_aggregation(node->child[1]);
+}
+
+/*
+ * Merge sets of potential buckets of node's children going from @node upwards.
+ * Stop when the node's set doesn't change and return the last updated node.
+ */
+static struct trie_node *
+aggregator_merge_buckets_above(struct trie_node *node)
+{
+  ASSERT_DIE(node != NULL);
+
+  for (struct trie_node *up = node->parent; up; node = up, up = up->parent)
+  {
+    const struct trie_node *lc = up->child[0];
+    const struct trie_node *rc = up->child[1];
+    ASSERT_DIE(lc == node || rc == node);
+
+    /* Stand-in child holding just the parent's original bucket */
+    struct trie_node phantom = { 0 };
+    aggregator_node_add_potential_bucket(&phantom, up->original_bucket);
+
+    /* Substitute the phantom for a missing child */
+    if (!rc)
+      rc = &phantom;
+    else if (!lc)
+      lc = &phantom;
+
+    ASSERT_DIE(lc != NULL && rc != NULL);
+
+    /* The parent's set didn't change by merging, stop here */
+    if (!aggregator_merge_potential_buckets(up, lc, rc))
+      return node;
+  }
+
+  return node;
+}
+
+/*
+ * Build the aggregation trie: walk all buckets and insert the prefix of
+ * every route they hold, remembering the bucket it came from.
+ */
+static void
+aggregator_construct_trie(struct aggregator_proto *p)
+{
+  HASH_WALK(p->buckets, next_hash, b)
+  {
+    for (const struct rte *e = b->rte; e; e = e->next)
+    {
+      const struct net_addr *n = e->net->n.addr;
+      aggregator_trie_insert_prefix(p, net_prefix(n), net_pxlen(n), b);
+    }
+  }
+  HASH_WALK_END;
+}
+
+/*
+ * Run Optimal Routing Table Constructor (ORTC) algorithm on the whole trie
+ * and verify the resulting invariants.
+ */
+static void
+aggregator_compute_trie(struct aggregator_proto *p)
+{
+  ASSERT_DIE(p->addr_type == NET_IP4 || p->addr_type == NET_IP6);
+
+  struct trie_node *root = p->root;
+
+  /* Propagate buckets downwards and merge potential sets upwards */
+  aggregator_propagate_and_merge(root);
+
+  /* Select buckets and export the resulting prefixes */
+  aggregator_group_prefixes(p, root);
+
+  check_trie_after_aggregation(root);
+}
+
+/*
+ * Build the trie from all buckets and run the full aggregation on it.
+ */
+void
+aggregator_aggregate(struct aggregator_proto *p)
+{
+  ASSERT_DIE(p->root != NULL);
+
+  aggregator_construct_trie(p);
+  aggregator_compute_trie(p);
+}
+
+/*
+ * Incorporate a single prefix change into the trie and reaggregate
+ * the affected subtree.
+ *
+ * @old: previous route for the prefix, or NULL on a fresh announce
+ * @new: new route for the prefix, or NULL on a withdraw
+ *
+ * At least one of @old and @new must be non-NULL.
+ */
+void
+aggregator_recompute(struct aggregator_proto *p, struct aggregator_route *old, struct aggregator_route *new)
+{
+  struct trie_node *updated_node = NULL;
+
+  if (old && !new)
+  {
+    /* Withdraw */
+    const struct net_addr *addr = old->rte.net->n.addr;
+
+    updated_node = aggregator_trie_remove_prefix(p, net_prefix(addr), net_pxlen(addr));
+    ASSERT_DIE(updated_node != NULL);
+  }
+  else
+  {
+    /* Announce or update -- @new must exist here, fail loudly if it doesn't */
+    ASSERT_DIE(new != NULL);
+
+    const struct net_addr *addr = new->rte.net->n.addr;
+
+    updated_node = aggregator_trie_insert_prefix(p, net_prefix(addr), net_pxlen(addr), new->bucket);
+
+    ASSERT_DIE(updated_node != NULL);
+    ASSERT_DIE(updated_node->px_origin == ORIGINAL);
+    ASSERT_DIE(updated_node->original_bucket != NULL);
+  }
+
+  struct trie_node *ancestor = updated_node;
+
+  /* Find the closest IN_FIB ancestor of the updated node */
+  // TODO: use node ancestor pointer instead of traversing
+  while ((ancestor = ancestor->parent))
+  {
+    ASSERT_DIE(ancestor != updated_node);
+
+    /* Stop when an IN_FIB ancestor is found or when we reached the root */
+    if (ancestor->status == IN_FIB || !ancestor->parent)
+      break;
+  }
+
+  /* NOTE(review): this relies on the root always being IN_FIB;
+   * the assert below enforces it */
+  ASSERT_DIE(ancestor != NULL);
+  ASSERT_DIE(ancestor != updated_node);
+  ASSERT_DIE(ancestor->status == IN_FIB);
+
+  /* Reaggregate trie with incorporated update */
+  aggregator_propagate_and_merge(ancestor);
+
+  /* Merge buckets upwards until they change, return last updated node */
+  struct trie_node *highest_node = aggregator_merge_buckets_above(ancestor);
+  ASSERT_DIE(highest_node != NULL);
+
+  aggregator_group_prefixes(p, highest_node);
+  check_trie_after_aggregation(highest_node);
+}