struct nft_array __rcu *array;
struct nft_array *array_next;
unsigned long last_gc;
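+ /* expired elements unlinked from the tree, pending release */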
+ struct list_head expired;
};
struct nft_rbtree_elem {
struct nft_elem_priv priv;
- struct rb_node node;
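+ /* an element is linked either in the tree or, once expired
+ * and unlinked by GC, in the expired list, never in both.
+ */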
+ union {
+ struct rb_node node;
+ struct list_head list;
+ };
struct nft_set_ext ext;
};
return &rbe->priv;
}
-static void nft_rbtree_gc_elem_remove(struct net *net, struct nft_set *set,
- struct nft_rbtree *priv,
- struct nft_rbtree_elem *rbe)
+static void nft_rbtree_gc_elem_move(struct net *net, struct nft_set *set,
+ struct nft_rbtree *priv,
+ struct nft_rbtree_elem *rbe)
{
lockdep_assert_held_write(&priv->lock);
nft_setelem_data_deactivate(net, set, &rbe->priv);
rb_erase(&rbe->node, &priv->root);
+
+ /* collected later on, from the commit callback */
+ list_add(&rbe->list, &priv->expired);
}
static const struct nft_rbtree_elem *
struct rb_node *prev = rb_prev(&rbe->node);
struct net *net = read_pnet(&set->net);
struct nft_rbtree_elem *rbe_prev;
- struct nft_trans_gc *gc;
-
- gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
- if (!gc)
- return ERR_PTR(-ENOMEM);
/* search for end interval coming before this element.
* end intervals don't carry a timeout extension, they
* are coupled with the interval start element.
*/
rbe_prev = NULL;
if (prev) {
rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
- nft_rbtree_gc_elem_remove(net, set, priv, rbe_prev);
-
- /* There is always room in this trans gc for this element,
- * memory allocation never actually happens, hence, the warning
- * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT,
- * this is synchronous gc which never fails.
- */
- gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
- if (WARN_ON_ONCE(!gc))
- return ERR_PTR(-ENOMEM);
-
- nft_trans_gc_elem_add(gc, rbe_prev);
+ nft_rbtree_gc_elem_move(net, set, priv, rbe_prev);
}
- nft_rbtree_gc_elem_remove(net, set, priv, rbe);
- gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
- if (WARN_ON_ONCE(!gc))
- return ERR_PTR(-ENOMEM);
-
- nft_trans_gc_elem_add(gc, rbe);
-
- nft_trans_gc_queue_sync_done(gc);
+ nft_rbtree_gc_elem_move(net, set, priv, rbe);
return rbe_prev;
}
}
}
-static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
- struct nft_rbtree *priv,
- struct nft_rbtree_elem *rbe)
-{
- nft_setelem_data_deactivate(net, set, &rbe->priv);
- nft_rbtree_erase(priv, rbe);
-}
-
-static void nft_rbtree_gc(struct nft_set *set)
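+/* First GC phase: unlink expired elements from the tree and
+ * park them on the expired list.
+ */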
+static void nft_rbtree_gc_scan(struct nft_set *set)
{
struct nft_rbtree *priv = nft_set_priv(set);
struct nft_rbtree_elem *rbe, *rbe_end = NULL;
struct net *net = read_pnet(&set->net);
u64 tstamp = nft_net_tstamp(net);
struct rb_node *node, *next;
- struct nft_trans_gc *gc;
-
- set = nft_set_container_of(priv);
- net = read_pnet(&set->net);
-
- gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
- if (!gc)
- return;
for (node = rb_first(&priv->root); node ; node = next) {
next = rb_next(node);
if (!__nft_set_elem_expired(&rbe->ext, tstamp))
continue;
- gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
- if (!gc)
- goto try_later;
-
/* end element needs to be removed first, it has
* no timeout extension.
*/
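+ /* unlinking from the tree requires the write lock, see the
+ * lockdep assertion in nft_rbtree_gc_elem_move().
+ */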
+ write_lock_bh(&priv->lock);
if (rbe_end) {
- nft_rbtree_gc_remove(net, set, priv, rbe_end);
- nft_trans_gc_elem_add(gc, rbe_end);
+ nft_rbtree_gc_elem_move(net, set, priv, rbe_end);
rbe_end = NULL;
}
- gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
- if (!gc)
- goto try_later;
-
- nft_rbtree_gc_remove(net, set, priv, rbe);
- nft_trans_gc_elem_add(gc, rbe);
+ nft_rbtree_gc_elem_move(net, set, priv, rbe);
+ write_unlock_bh(&priv->lock);
}
-try_later:
+ priv->last_gc = jiffies;
+}
+
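+/* Second GC phase: move elements collected by nft_rbtree_gc_scan()
+ * into a GC transaction, they are released once concurrent readers
+ * are done with them.
+ */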
+static void nft_rbtree_gc_queue(struct nft_set *set)
+{
+ struct nft_rbtree *priv = nft_set_priv(set);
+ struct nft_rbtree_elem *rbe, *rbe_next;
+ struct nft_trans_gc *gc;
+
- if (gc) {
- gc = nft_trans_gc_catchall_sync(gc);
- nft_trans_gc_queue_sync_done(gc);
- priv->last_gc = jiffies;
+ if (list_empty(&priv->expired))
+ return;
+
+ gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
+ if (!gc)
+ return;
+
+ list_for_each_entry_safe(rbe, rbe_next, &priv->expired, list) {
+ list_del(&rbe->list);
+ nft_trans_gc_elem_add(gc, rbe);
+
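+ /* hands over full batches, if a new batch cannot be allocated,
+ * bail out: elements left on the expired list are collected by
+ * a later commit or released when the set is destroyed.
+ */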
+ gc = nft_trans_gc_queue_sync(gc, GFP_KERNEL);
+ if (!gc)
+ return;
}
+
+ gc = nft_trans_gc_catchall_sync(gc);
+ nft_trans_gc_queue_sync_done(gc);
}
static u64 nft_rbtree_privsize(const struct nlattr * const nla[],
rwlock_init(&priv->lock);
priv->root = RB_ROOT;
+ INIT_LIST_HEAD(&priv->expired);
priv->array = NULL;
priv->array_next = NULL;
const struct nft_set *set)
{
struct nft_rbtree *priv = nft_set_priv(set);
- struct nft_rbtree_elem *rbe;
+ struct nft_rbtree_elem *rbe, *next;
struct nft_array *array;
struct rb_node *node;
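+ /* expired elements were already unlinked from the tree by GC,
+ * release them before tearing down the tree itself.
+ */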
+ list_for_each_entry_safe(rbe, next, &priv->expired, list) {
+ list_del(&rbe->list);
+ nf_tables_set_elem_destroy(ctx, set, &rbe->priv);
+ }
+
while ((node = priv->root.rb_node) != NULL) {
rb_erase(node, &priv->root);
rbe = rb_entry(node, struct nft_rbtree_elem, node);
u32 num_intervals = 0;
struct rb_node *node;
- if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
- nft_rbtree_gc(set);
-
/* No changes, skip, eg. elements updates only. */
if (!priv->array_next)
return;
+ /* GC can be performed when the binary search blob is about
+ * to be rebuilt. It has to be done in two phases: first, scan
+ * the tree and move all expired elements to the expired list.
+ *
+ * Then, after the blob has been rebuilt and published to other
+ * CPUs, queue the collected entries for freeing.
+ */
+ if (time_after_eq(jiffies, priv->last_gc + nft_set_gc_interval(set)))
+ nft_rbtree_gc_scan(set);
+
/* Reverse walk to create an array from smaller to largest interval. */
node = rb_last(&priv->root);
if (node)
num_intervals++;
err_out:
priv->array_next->num_intervals = num_intervals;
- old = rcu_replace_pointer(priv->array, priv->array_next, true);
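+ /* commit callbacks run with the per-netns commit mutex held */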
+ old = rcu_replace_pointer(priv->array, priv->array_next,
+ lockdep_is_held(&nft_pernet(read_pnet(&set->net))->commit_mutex));
priv->array_next = NULL;
if (old)
call_rcu(&old->rcu_head, nft_array_free_rcu);
+
+ /* The new blob is public now, queue the collected entries for
+ * freeing. call_rcu ensures the elements stay around until
+ * concurrent readers are done.
+ */
+ nft_rbtree_gc_queue(set);
}
static void nft_rbtree_abort(const struct nft_set *set)