From: Sasha Levin Date: Sat, 23 Sep 2023 12:50:27 +0000 (-0400) Subject: Fixes for 5.10 X-Git-Tag: v6.5.6~105 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=d23049039d1354f6c765e33c4d8a78954e4f6251;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 5.10 Signed-off-by: Sasha Levin --- diff --git a/queue-5.10/netfilter-nf_tables-adapt-set-backend-to-use-gc-tran.patch b/queue-5.10/netfilter-nf_tables-adapt-set-backend-to-use-gc-tran.patch new file mode 100644 index 00000000000..74672ef1b52 --- /dev/null +++ b/queue-5.10/netfilter-nf_tables-adapt-set-backend-to-use-gc-tran.patch @@ -0,0 +1,525 @@ +From 17a7c30e1281a067fe8e1063849311667f2d3746 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:05 +0200 +Subject: netfilter: nf_tables: adapt set backend to use GC transaction API + +From: Pablo Neira Ayuso + +commit f6c383b8c31a93752a52697f8430a71dcbc46adf upstream. + +Use the GC transaction API to replace the old and buggy gc API and the +busy mark approach. + +No set elements are removed from async garbage collection anymore, +instead the _DEAD bit is set so the set element is not visible from +lookup path anymore. Async GC enqueues transaction work that might be +aborted and retried later. + +rbtree and pipapo set backends do not set the _DEAD bit from the +sync GC path since this runs in control plane path where mutex is held. +In this case, set elements are deactivated, removed and then released +via RCU callback, sync GC never fails. 
+ +Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") +Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") +Fixes: 9d0982927e79 ("netfilter: nft_hash: add support for timeouts") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_set_hash.c | 77 +++++++++++------- + net/netfilter/nft_set_pipapo.c | 43 +++++++--- + net/netfilter/nft_set_rbtree.c | 138 +++++++++++++++++++++------------ + 3 files changed, 172 insertions(+), 86 deletions(-) + +diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c +index ea7bd8549bea8..6ae99b3107bc9 100644 +--- a/net/netfilter/nft_set_hash.c ++++ b/net/netfilter/nft_set_hash.c +@@ -17,6 +17,9 @@ + #include + #include + #include ++#include ++ ++extern unsigned int nf_tables_net_id; + + /* We target a hash table size of 4, element hint is 75% of final size */ + #define NFT_RHASH_ELEMENT_HINT 3 +@@ -59,6 +62,8 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg, + + if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) + return 1; ++ if (nft_set_elem_is_dead(&he->ext)) ++ return 1; + if (nft_set_elem_expired(&he->ext)) + return 1; + if (!nft_set_elem_active(&he->ext, x->genmask)) +@@ -187,7 +192,6 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, + struct nft_rhash_elem *he = elem->priv; + + nft_set_elem_change_active(net, set, &he->ext); +- nft_set_elem_clear_busy(&he->ext); + } + + static bool nft_rhash_flush(const struct net *net, +@@ -195,12 +199,9 @@ static bool nft_rhash_flush(const struct net *net, + { + struct nft_rhash_elem *he = priv; + +- if (!nft_set_elem_mark_busy(&he->ext) || +- !nft_is_active(net, &he->ext)) { +- nft_set_elem_change_active(net, set, &he->ext); +- return true; +- } +- return false; ++ nft_set_elem_change_active(net, set, &he->ext); ++ ++ return true; + } + + static void *nft_rhash_deactivate(const struct net *net, +@@ -217,9 +218,8 @@ static 
void *nft_rhash_deactivate(const struct net *net, + + rcu_read_lock(); + he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); +- if (he != NULL && +- !nft_rhash_flush(net, set, he)) +- he = NULL; ++ if (he) ++ nft_set_elem_change_active(net, set, &he->ext); + + rcu_read_unlock(); + +@@ -295,49 +295,75 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, + + static void nft_rhash_gc(struct work_struct *work) + { ++ struct nftables_pernet *nft_net; + struct nft_set *set; + struct nft_rhash_elem *he; + struct nft_rhash *priv; +- struct nft_set_gc_batch *gcb = NULL; + struct rhashtable_iter hti; ++ struct nft_trans_gc *gc; ++ struct net *net; ++ u32 gc_seq; + + priv = container_of(work, struct nft_rhash, gc_work.work); + set = nft_set_container_of(priv); ++ net = read_pnet(&set->net); ++ nft_net = net_generic(net, nf_tables_net_id); ++ gc_seq = READ_ONCE(nft_net->gc_seq); ++ ++ gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); ++ if (!gc) ++ goto done; + + rhashtable_walk_enter(&priv->ht, &hti); + rhashtable_walk_start(&hti); + + while ((he = rhashtable_walk_next(&hti))) { + if (IS_ERR(he)) { +- if (PTR_ERR(he) != -EAGAIN) +- break; ++ if (PTR_ERR(he) != -EAGAIN) { ++ nft_trans_gc_destroy(gc); ++ gc = NULL; ++ goto try_later; ++ } + continue; + } + ++ /* Ruleset has been updated, try later. 
*/ ++ if (READ_ONCE(nft_net->gc_seq) != gc_seq) { ++ nft_trans_gc_destroy(gc); ++ gc = NULL; ++ goto try_later; ++ } ++ ++ if (nft_set_elem_is_dead(&he->ext)) ++ goto dead_elem; ++ + if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPR)) { + struct nft_expr *expr = nft_set_ext_expr(&he->ext); + + if (expr->ops->gc && + expr->ops->gc(read_pnet(&set->net), expr)) +- goto gc; ++ goto needs_gc_run; + } ++ + if (!nft_set_elem_expired(&he->ext)) + continue; +-gc: +- if (nft_set_elem_mark_busy(&he->ext)) +- continue; +- +- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); +- if (gcb == NULL) +- break; +- rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); +- atomic_dec(&set->nelems); +- nft_set_gc_batch_add(gcb, he); ++needs_gc_run: ++ nft_set_elem_dead(&he->ext); ++dead_elem: ++ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); ++ if (!gc) ++ goto try_later; ++ ++ nft_trans_gc_elem_add(gc, he); + } ++ ++try_later: + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); + +- nft_set_gc_batch_complete(gcb); ++ if (gc) ++ nft_trans_gc_queue_async_done(gc); ++done: + queue_delayed_work(system_power_efficient_wq, &priv->gc_work, + nft_set_gc_interval(set)); + } +@@ -400,7 +426,6 @@ static void nft_rhash_destroy(const struct nft_ctx *ctx, + }; + + cancel_delayed_work_sync(&priv->gc_work); +- rcu_barrier(); + rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, + (void *)&rhash_ctx); + } +diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c +index 89fa1fedadf7c..63d0723950d32 100644 +--- a/net/netfilter/nft_set_pipapo.c ++++ b/net/netfilter/nft_set_pipapo.c +@@ -1544,15 +1544,32 @@ static void pipapo_drop(struct nft_pipapo_match *m, + } + } + ++static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, ++ struct nft_pipapo_elem *e) ++{ ++ struct nft_set_elem elem = { ++ .priv = e, ++ }; ++ ++ nft_setelem_data_deactivate(net, set, &elem); ++} ++ + /** + * pipapo_gc() - Drop expired entries from set, destroy 
start and end elements +- * @set: nftables API set representation ++ * @_set: nftables API set representation + * @m: Matching data + */ +-static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) ++static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) + { ++ struct nft_set *set = (struct nft_set *) _set; + struct nft_pipapo *priv = nft_set_priv(set); ++ struct net *net = read_pnet(&set->net); + int rules_f0, first_rule = 0; ++ struct nft_trans_gc *gc; ++ ++ gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); ++ if (!gc) ++ return; + + while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { + union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; +@@ -1577,13 +1594,19 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) + f--; + i--; + e = f->mt[rulemap[i].to].e; +- if (nft_set_elem_expired(&e->ext) && +- !nft_set_elem_mark_busy(&e->ext)) { ++ /* synchronous gc never fails, there is no need to set on ++ * NFT_SET_ELEM_DEAD_BIT. ++ */ ++ if (nft_set_elem_expired(&e->ext)) { + priv->dirty = true; +- pipapo_drop(m, rulemap); + +- rcu_barrier(); +- nft_set_elem_destroy(set, e, true); ++ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); ++ if (!gc) ++ break; ++ ++ nft_pipapo_gc_deactivate(net, set, e); ++ pipapo_drop(m, rulemap); ++ nft_trans_gc_elem_add(gc, e); + + /* And check again current first rule, which is now the + * first we haven't checked. 
+@@ -1593,7 +1616,10 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) + } + } + +- priv->last_gc = jiffies; ++ if (gc) { ++ nft_trans_gc_queue_sync_done(gc); ++ priv->last_gc = jiffies; ++ } + } + + /** +@@ -1718,7 +1744,6 @@ static void nft_pipapo_activate(const struct net *net, + return; + + nft_set_elem_change_active(net, set, &e->ext); +- nft_set_elem_clear_busy(&e->ext); + } + + /** +diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c +index 2aa3776c5fbb7..ed14849aa47f4 100644 +--- a/net/netfilter/nft_set_rbtree.c ++++ b/net/netfilter/nft_set_rbtree.c +@@ -14,6 +14,9 @@ + #include + #include + #include ++#include ++ ++extern unsigned int nf_tables_net_id; + + struct nft_rbtree { + struct rb_root root; +@@ -46,6 +49,12 @@ static int nft_rbtree_cmp(const struct nft_set *set, + set->klen); + } + ++static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) ++{ ++ return nft_set_elem_expired(&rbe->ext) || ++ nft_set_elem_is_dead(&rbe->ext); ++} ++ + static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext, + unsigned int seq) +@@ -80,7 +89,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set + continue; + } + +- if (nft_set_elem_expired(&rbe->ext)) ++ if (nft_rbtree_elem_expired(rbe)) + return false; + + if (nft_rbtree_interval_end(rbe)) { +@@ -98,7 +107,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set + + if (set->flags & NFT_SET_INTERVAL && interval != NULL && + nft_set_elem_active(&interval->ext, genmask) && +- !nft_set_elem_expired(&interval->ext) && ++ !nft_rbtree_elem_expired(interval) && + nft_rbtree_interval_start(interval)) { + *ext = &interval->ext; + return true; +@@ -214,6 +223,18 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, + return rbe; + } + ++static void nft_rbtree_gc_remove(struct net *net, struct nft_set 
*set, ++ struct nft_rbtree *priv, ++ struct nft_rbtree_elem *rbe) ++{ ++ struct nft_set_elem elem = { ++ .priv = rbe, ++ }; ++ ++ nft_setelem_data_deactivate(net, set, &elem); ++ rb_erase(&rbe->node, &priv->root); ++} ++ + static int nft_rbtree_gc_elem(const struct nft_set *__set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe, +@@ -221,11 +242,12 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, + { + struct nft_set *set = (struct nft_set *)__set; + struct rb_node *prev = rb_prev(&rbe->node); ++ struct net *net = read_pnet(&set->net); + struct nft_rbtree_elem *rbe_prev; +- struct nft_set_gc_batch *gcb; ++ struct nft_trans_gc *gc; + +- gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); +- if (!gcb) ++ gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); ++ if (!gc) + return -ENOMEM; + + /* search for end interval coming before this element. +@@ -243,17 +265,28 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, + + if (prev) { + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); ++ nft_rbtree_gc_remove(net, set, priv, rbe_prev); + +- rb_erase(&rbe_prev->node, &priv->root); +- atomic_dec(&set->nelems); +- nft_set_gc_batch_add(gcb, rbe_prev); ++ /* There is always room in this trans gc for this element, ++ * memory allocation never actually happens, hence, the warning ++ * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT, ++ * this is synchronous gc which never fails. 
++ */ ++ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); ++ if (WARN_ON_ONCE(!gc)) ++ return -ENOMEM; ++ ++ nft_trans_gc_elem_add(gc, rbe_prev); + } + +- rb_erase(&rbe->node, &priv->root); +- atomic_dec(&set->nelems); ++ nft_rbtree_gc_remove(net, set, priv, rbe); ++ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); ++ if (WARN_ON_ONCE(!gc)) ++ return -ENOMEM; ++ ++ nft_trans_gc_elem_add(gc, rbe); + +- nft_set_gc_batch_add(gcb, rbe); +- nft_set_gc_batch_complete(gcb); ++ nft_trans_gc_queue_sync_done(gc); + + return 0; + } +@@ -481,7 +514,6 @@ static void nft_rbtree_activate(const struct net *net, + struct nft_rbtree_elem *rbe = elem->priv; + + nft_set_elem_change_active(net, set, &rbe->ext); +- nft_set_elem_clear_busy(&rbe->ext); + } + + static bool nft_rbtree_flush(const struct net *net, +@@ -489,12 +521,9 @@ static bool nft_rbtree_flush(const struct net *net, + { + struct nft_rbtree_elem *rbe = priv; + +- if (!nft_set_elem_mark_busy(&rbe->ext) || +- !nft_is_active(net, &rbe->ext)) { +- nft_set_elem_change_active(net, set, &rbe->ext); +- return true; +- } +- return false; ++ nft_set_elem_change_active(net, set, &rbe->ext); ++ ++ return true; + } + + static void *nft_rbtree_deactivate(const struct net *net, +@@ -569,26 +598,40 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, + + static void nft_rbtree_gc(struct work_struct *work) + { +- struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; +- struct nft_set_gc_batch *gcb = NULL; ++ struct nft_rbtree_elem *rbe, *rbe_end = NULL; ++ struct nftables_pernet *nft_net; + struct nft_rbtree *priv; ++ struct nft_trans_gc *gc; + struct rb_node *node; + struct nft_set *set; ++ unsigned int gc_seq; + struct net *net; +- u8 genmask; + + priv = container_of(work, struct nft_rbtree, gc_work.work); + set = nft_set_container_of(priv); + net = read_pnet(&set->net); +- genmask = nft_genmask_cur(net); ++ nft_net = net_generic(net, nf_tables_net_id); ++ gc_seq = READ_ONCE(nft_net->gc_seq); ++ ++ gc = nft_trans_gc_alloc(set, 
gc_seq, GFP_KERNEL); ++ if (!gc) ++ goto done; + + write_lock_bh(&priv->lock); + write_seqcount_begin(&priv->count); + for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { ++ ++ /* Ruleset has been updated, try later. */ ++ if (READ_ONCE(nft_net->gc_seq) != gc_seq) { ++ nft_trans_gc_destroy(gc); ++ gc = NULL; ++ goto try_later; ++ } ++ + rbe = rb_entry(node, struct nft_rbtree_elem, node); + +- if (!nft_set_elem_active(&rbe->ext, genmask)) +- continue; ++ if (nft_set_elem_is_dead(&rbe->ext)) ++ goto dead_elem; + + /* elements are reversed in the rbtree for historical reasons, + * from highest to lowest value, that is why end element is +@@ -601,40 +644,33 @@ static void nft_rbtree_gc(struct work_struct *work) + if (!nft_set_elem_expired(&rbe->ext)) + continue; + +- if (nft_set_elem_mark_busy(&rbe->ext)) { +- rbe_end = NULL; ++ nft_set_elem_dead(&rbe->ext); ++ ++ if (!rbe_end) + continue; +- } + +- if (rbe_prev) { +- rb_erase(&rbe_prev->node, &priv->root); +- rbe_prev = NULL; +- } +- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); +- if (!gcb) +- break; ++ nft_set_elem_dead(&rbe_end->ext); + +- atomic_dec(&set->nelems); +- nft_set_gc_batch_add(gcb, rbe); +- rbe_prev = rbe; ++ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); ++ if (!gc) ++ goto try_later; + +- if (rbe_end) { +- atomic_dec(&set->nelems); +- nft_set_gc_batch_add(gcb, rbe_end); +- rb_erase(&rbe_end->node, &priv->root); +- rbe_end = NULL; +- } +- node = rb_next(node); +- if (!node) +- break; ++ nft_trans_gc_elem_add(gc, rbe_end); ++ rbe_end = NULL; ++dead_elem: ++ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); ++ if (!gc) ++ goto try_later; ++ ++ nft_trans_gc_elem_add(gc, rbe); + } +- if (rbe_prev) +- rb_erase(&rbe_prev->node, &priv->root); ++try_later: + write_seqcount_end(&priv->count); + write_unlock_bh(&priv->lock); + +- nft_set_gc_batch_complete(gcb); +- ++ if (gc) ++ nft_trans_gc_queue_async_done(gc); ++done: + queue_delayed_work(system_power_efficient_wq, 
&priv->gc_work, + nft_set_gc_interval(set)); + } +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nf_tables-defer-gc-run-if-previous-batch-i.patch b/queue-5.10/netfilter-nf_tables-defer-gc-run-if-previous-batch-i.patch new file mode 100644 index 00000000000..9f102cd439a --- /dev/null +++ b/queue-5.10/netfilter-nf_tables-defer-gc-run-if-previous-batch-i.patch @@ -0,0 +1,80 @@ +From 39ff40482cf2bb0984686ed1f8d8414da1f0e394 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:13 +0200 +Subject: netfilter: nf_tables: defer gc run if previous batch is still pending + +From: Florian Westphal + +commit 8e51830e29e12670b4c10df070a4ea4c9593e961 upstream. + +Don't queue more gc work, else we may queue the same elements multiple +times. + +If an element is flagged as dead, this can mean that either the previous +gc request was invalidated/discarded by a transaction or that the previous +request is still pending in the system work queue. + +The latter will happen if the gc interval is set to a very low value, +e.g. 1ms, and system work queue is backlogged. + +The set's refcount is 1 if no previous gc requests are queued, so add +a helper for this and skip gc run if old requests are pending. + +Add a helper for this and skip the gc run in this case. 
+ +Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") +Signed-off-by: Florian Westphal +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_tables.h | 5 +++++ + net/netfilter/nft_set_hash.c | 3 +++ + net/netfilter/nft_set_rbtree.c | 3 +++ + 3 files changed, 11 insertions(+) + +diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h +index 9182b583d4297..bbe472c07d07e 100644 +--- a/include/net/netfilter/nf_tables.h ++++ b/include/net/netfilter/nf_tables.h +@@ -479,6 +479,11 @@ static inline void *nft_set_priv(const struct nft_set *set) + return (void *)set->data; + } + ++static inline bool nft_set_gc_is_pending(const struct nft_set *s) ++{ ++ return refcount_read(&s->refs) != 1; ++} ++ + static inline struct nft_set *nft_set_container_of(const void *priv) + { + return (void *)priv - offsetof(struct nft_set, data); +diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c +index 9cdf348b048a4..68a16ee37b3d0 100644 +--- a/net/netfilter/nft_set_hash.c ++++ b/net/netfilter/nft_set_hash.c +@@ -312,6 +312,9 @@ static void nft_rhash_gc(struct work_struct *work) + nft_net = net_generic(net, nf_tables_net_id); + gc_seq = READ_ONCE(nft_net->gc_seq); + ++ if (nft_set_gc_is_pending(set)) ++ goto done; ++ + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; +diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c +index ed14849aa47f4..9b0bdd4216152 100644 +--- a/net/netfilter/nft_set_rbtree.c ++++ b/net/netfilter/nft_set_rbtree.c +@@ -613,6 +613,9 @@ static void nft_rbtree_gc(struct work_struct *work) + nft_net = net_generic(net, nf_tables_net_id); + gc_seq = READ_ONCE(nft_net->gc_seq); + ++ if (nft_set_gc_is_pending(set)) ++ goto done; ++ + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nf_tables-don-t-fail-inserts-if-duplicate-.patch 
b/queue-5.10/netfilter-nf_tables-don-t-fail-inserts-if-duplicate-.patch new file mode 100644 index 00000000000..e899f70a0ff --- /dev/null +++ b/queue-5.10/netfilter-nf_tables-don-t-fail-inserts-if-duplicate-.patch @@ -0,0 +1,104 @@ +From 266dbf1ff129403ffe607b86c33574cc3ee2508e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:08 +0200 +Subject: netfilter: nf_tables: don't fail inserts if duplicate has expired + +From: Florian Westphal + +commit 7845914f45f066497ac75b30c50dbc735e84e884 upstream. + +nftables selftests fail: +run-tests.sh testcases/sets/0044interval_overlap_0 +Expected: 0-2 . 0-3, got: +W: [FAILED] ./testcases/sets/0044interval_overlap_0: got 1 + +Insertion must ignore duplicate but expired entries. + +Moreover, there is a strange asymmetry in nft_pipapo_activate: + +It refetches the current element, whereas the other ->activate callbacks +(bitmap, hash, rhash, rbtree) use elem->priv. +Same for .remove: other set implementations take elem->priv, +nft_pipapo_remove fetches elem->priv, then does a relookup, +remove this. + +I suspect this was the reason for the change that prompted the +removal of the expired check in pipapo_get() in the first place, +but skipping expired elements there makes no sense to me, this helper +is used for normal get requests, insertions (duplicate check) +and deactivate callback. + +In first two cases expired elements must be skipped. + +For ->deactivate(), this gets called for DELSETELEM, so it +seems to me that expired elements should be skipped as well, i.e. +delete request should fail with -ENOENT error. 
+ +Fixes: 24138933b97b ("netfilter: nf_tables: don't skip expired elements during walk") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_set_pipapo.c | 25 +++++-------------------- + 1 file changed, 5 insertions(+), 20 deletions(-) + +diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c +index 63d0723950d32..80440ac5d44c6 100644 +--- a/net/netfilter/nft_set_pipapo.c ++++ b/net/netfilter/nft_set_pipapo.c +@@ -566,6 +566,8 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, + goto out; + + if (last) { ++ if (nft_set_elem_expired(&f->mt[b].e->ext)) ++ goto next_match; + if ((genmask && + !nft_set_elem_active(&f->mt[b].e->ext, genmask))) + goto next_match; +@@ -600,17 +602,8 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, + static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) + { +- struct nft_pipapo_elem *ret; +- +- ret = pipapo_get(net, set, (const u8 *)elem->key.val.data, +- nft_genmask_cur(net)); +- if (IS_ERR(ret)) +- return ret; +- +- if (nft_set_elem_expired(&ret->ext)) +- return ERR_PTR(-ENOENT); +- +- return ret; ++ return pipapo_get(net, set, (const u8 *)elem->key.val.data, ++ nft_genmask_cur(net)); + } + + /** +@@ -1737,11 +1730,7 @@ static void nft_pipapo_activate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) + { +- struct nft_pipapo_elem *e; +- +- e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); +- if (IS_ERR(e)) +- return; ++ struct nft_pipapo_elem *e = elem->priv; + + nft_set_elem_change_active(net, set, &e->ext); + } +@@ -1955,10 +1944,6 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, + + data = (const u8 *)nft_set_ext_key(&e->ext); + +- e = pipapo_get(net, set, data, 0); +- if (IS_ERR(e)) +- return; +- + while ((rules_f0 = pipapo_rules_same_key(m->f, 
first_rule))) { + union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; + const u8 *match_start, *match_end; +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nf_tables-don-t-skip-expired-elements-duri.patch b/queue-5.10/netfilter-nf_tables-don-t-skip-expired-elements-duri.patch new file mode 100644 index 00000000000..59fb5d70718 --- /dev/null +++ b/queue-5.10/netfilter-nf_tables-don-t-skip-expired-elements-duri.patch @@ -0,0 +1,140 @@ +From 455cb7d5c164d5aa2acaf9b9fc6153ea1017afda Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:03 +0200 +Subject: netfilter: nf_tables: don't skip expired elements during walk + +From: Florian Westphal + +commit 24138933b97b055d486e8064b4a1721702442a9b upstream. + +There is an asymmetry between commit/abort and preparation phase if the +following conditions are met: + +1. set is a verdict map ("1.2.3.4 : jump foo") +2. timeouts are enabled + +In this case, following sequence is problematic: + +1. element E in set S refers to chain C +2. userspace requests removal of set S +3. kernel does a set walk to decrement chain->use count for all elements + from preparation phase +4. kernel does another set walk to remove elements from the commit phase + (or another walk to do a chain->use increment for all elements from + abort phase) + +If E has already expired in 1), it will be ignored during list walk, so its use count +won't have been changed. + +Then, when set is culled, ->destroy callback will zap the element via +nf_tables_set_elem_destroy(), but this function is only safe for +elements that have been deactivated earlier from the preparation phase: +lack of earlier deactivate removes the element but leaks the chain use +count, which results in a WARN splat when the chain gets removed later, +plus a leak of the nft_chain structure. + +Update pipapo_get() not to skip expired elements, otherwise flush +command reports bogus ENOENT errors. 
+ +Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") +Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") +Fixes: 9d0982927e79 ("netfilter: nft_hash: add support for timeouts") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_tables_api.c | 4 ++++ + net/netfilter/nft_set_hash.c | 2 -- + net/netfilter/nft_set_pipapo.c | 18 ++++++++++++------ + net/netfilter/nft_set_rbtree.c | 2 -- + 4 files changed, 16 insertions(+), 10 deletions(-) + +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 430dcd0f6c3b2..5eef671578a25 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -4929,8 +4929,12 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) + { ++ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + struct nft_set_dump_args *args; + ++ if (nft_set_elem_expired(ext)) ++ return 0; ++ + args = container_of(iter, struct nft_set_dump_args, iter); + return nf_tables_fill_setelem(args->skb, set, elem); + } +diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c +index 51d3e6f0934a9..ea7bd8549bea8 100644 +--- a/net/netfilter/nft_set_hash.c ++++ b/net/netfilter/nft_set_hash.c +@@ -277,8 +277,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, + + if (iter->count < iter->skip) + goto cont; +- if (nft_set_elem_expired(&he->ext)) +- goto cont; + if (!nft_set_elem_active(&he->ext, iter->genmask)) + goto cont; + +diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c +index ce6c07ea7244b..89fa1fedadf7c 100644 +--- a/net/netfilter/nft_set_pipapo.c ++++ b/net/netfilter/nft_set_pipapo.c +@@ -566,8 +566,7 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, + goto out; + + if (last) { +- if (nft_set_elem_expired(&f->mt[b].e->ext) || 
+- (genmask && ++ if ((genmask && + !nft_set_elem_active(&f->mt[b].e->ext, genmask))) + goto next_match; + +@@ -601,8 +600,17 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, + static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) + { +- return pipapo_get(net, set, (const u8 *)elem->key.val.data, +- nft_genmask_cur(net)); ++ struct nft_pipapo_elem *ret; ++ ++ ret = pipapo_get(net, set, (const u8 *)elem->key.val.data, ++ nft_genmask_cur(net)); ++ if (IS_ERR(ret)) ++ return ret; ++ ++ if (nft_set_elem_expired(&ret->ext)) ++ return ERR_PTR(-ENOENT); ++ ++ return ret; + } + + /** +@@ -2009,8 +2017,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, + goto cont; + + e = f->mt[r].e; +- if (nft_set_elem_expired(&e->ext)) +- goto cont; + + elem.priv = e; + +diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c +index eae760adae4d5..2aa3776c5fbb7 100644 +--- a/net/netfilter/nft_set_rbtree.c ++++ b/net/netfilter/nft_set_rbtree.c +@@ -551,8 +551,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, + + if (iter->count < iter->skip) + goto cont; +- if (nft_set_elem_expired(&rbe->ext)) +- goto cont; + if (!nft_set_elem_active(&rbe->ext, iter->genmask)) + goto cont; + +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nf_tables-fix-gc-transaction-races-with-ne.patch b/queue-5.10/netfilter-nf_tables-fix-gc-transaction-races-with-ne.patch new file mode 100644 index 00000000000..6e16e8182f6 --- /dev/null +++ b/queue-5.10/netfilter-nf_tables-fix-gc-transaction-races-with-ne.patch @@ -0,0 +1,91 @@ +From 81427877dbac5be0b4539cf8df85e618ee668843 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:09 +0200 +Subject: netfilter: nf_tables: fix GC transaction races with netns and netlink + event exit path + +From: Pablo Neira Ayuso + +commit 6a33d8b73dfac0a41f3877894b38082bd0c9a5bc upstream. 
+ +Netlink event path is missing a synchronization point with GC +transactions. Add GC sequence number update to netns release path and +netlink event path, any GC transaction losing race will be discarded. + +Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Florian Westphal +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_tables_api.c | 29 +++++++++++++++++++++++++---- + 1 file changed, 25 insertions(+), 4 deletions(-) + +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 206755eb35f3a..43da2f0a52623 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -8320,6 +8320,22 @@ static void nft_set_commit_update(struct list_head *set_update_list) + } + } + ++static unsigned int nft_gc_seq_begin(struct nftables_pernet *nft_net) ++{ ++ unsigned int gc_seq; ++ ++ /* Bump gc counter, it becomes odd, this is the busy mark. */ ++ gc_seq = READ_ONCE(nft_net->gc_seq); ++ WRITE_ONCE(nft_net->gc_seq, ++gc_seq); ++ ++ return gc_seq; ++} ++ ++static void nft_gc_seq_end(struct nftables_pernet *nft_net, unsigned int gc_seq) ++{ ++ WRITE_ONCE(nft_net->gc_seq, ++gc_seq); ++} ++ + static int nf_tables_commit(struct net *net, struct sk_buff *skb) + { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); +@@ -8401,9 +8417,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) + while (++nft_net->base_seq == 0) + ; + +- /* Bump gc counter, it becomes odd, this is the busy mark. */ +- gc_seq = READ_ONCE(nft_net->gc_seq); +- WRITE_ONCE(nft_net->gc_seq, ++gc_seq); ++ gc_seq = nft_gc_seq_begin(nft_net); + + /* step 3. Start new generation, rules_gen_X now in use. 
*/ + net->nft.gencursor = nft_gencursor_next(net); +@@ -8583,7 +8597,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) + nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + nf_tables_commit_audit_log(&adl, nft_net->base_seq); + +- WRITE_ONCE(nft_net->gc_seq, ++gc_seq); ++ nft_gc_seq_end(nft_net, gc_seq); + nf_tables_commit_release(net); + + return 0; +@@ -9538,11 +9552,18 @@ static void __net_exit nf_tables_pre_exit_net(struct net *net) + static void __net_exit nf_tables_exit_net(struct net *net) + { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); ++ unsigned int gc_seq; + + mutex_lock(&nft_net->commit_mutex); ++ ++ gc_seq = nft_gc_seq_begin(nft_net); ++ + if (!list_empty(&nft_net->commit_list)) + __nf_tables_abort(net, NFNL_ABORT_NONE); + __nft_release_tables(net); ++ ++ nft_gc_seq_end(nft_net, gc_seq); ++ + mutex_unlock(&nft_net->commit_mutex); + WARN_ON_ONCE(!list_empty(&nft_net->tables)); + WARN_ON_ONCE(!list_empty(&nft_net->module_list)); +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nf_tables-fix-memleak-when-more-than-255-e.patch b/queue-5.10/netfilter-nf_tables-fix-memleak-when-more-than-255-e.patch new file mode 100644 index 00000000000..0351bedb687 --- /dev/null +++ b/queue-5.10/netfilter-nf_tables-fix-memleak-when-more-than-255-e.patch @@ -0,0 +1,87 @@ +From 6fce37d2dc639bc529817dd22c8a96a0850f183f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:18 +0200 +Subject: netfilter: nf_tables: fix memleak when more than 255 elements expired + +From: Florian Westphal + +commit cf5000a7787cbc10341091d37245a42c119d26c5 upstream. + +When more than 255 elements expired we're supposed to switch to a new gc +container structure. + +This never happens: u8 type will wrap before reaching the boundary +and nft_trans_gc_space() always returns true. + +This means we recycle the initial gc container structure and +lose track of the elements that came before. 
+ +While at it, don't deref 'gc' after we've passed it to call_rcu. + +Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") +Reported-by: Pablo Neira Ayuso +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_tables.h | 2 +- + net/netfilter/nf_tables_api.c | 10 ++++++++-- + 2 files changed, 9 insertions(+), 3 deletions(-) + +diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h +index bbe472c07d07e..5619642b9ad47 100644 +--- a/include/net/netfilter/nf_tables.h ++++ b/include/net/netfilter/nf_tables.h +@@ -1525,7 +1525,7 @@ struct nft_trans_gc { + struct net *net; + struct nft_set *set; + u32 seq; +- u8 count; ++ u16 count; + void *priv[NFT_TRANS_GC_BATCHCOUNT]; + struct rcu_head rcu; + }; +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 9fc302a6836ba..32c97cc87ddc2 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -8124,12 +8124,15 @@ static int nft_trans_gc_space(struct nft_trans_gc *trans) + struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp) + { ++ struct nft_set *set; ++ + if (nft_trans_gc_space(gc)) + return gc; + ++ set = gc->set; + nft_trans_gc_queue_work(gc); + +- return nft_trans_gc_alloc(gc->set, gc_seq, gfp); ++ return nft_trans_gc_alloc(set, gc_seq, gfp); + } + + void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) +@@ -8144,15 +8147,18 @@ void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) + + struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) + { ++ struct nft_set *set; ++ + if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) + return NULL; + + if (nft_trans_gc_space(gc)) + return gc; + ++ set = gc->set; + call_rcu(&gc->rcu, nft_trans_gc_trans_free); + +- return nft_trans_gc_alloc(gc->set, 0, gfp); ++ return nft_trans_gc_alloc(set, 
0, gfp); + } + + void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nf_tables-gc-transaction-api-to-avoid-race.patch b/queue-5.10/netfilter-nf_tables-gc-transaction-api-to-avoid-race.patch new file mode 100644 index 00000000000..175dfa68126 --- /dev/null +++ b/queue-5.10/netfilter-nf_tables-gc-transaction-api-to-avoid-race.patch @@ -0,0 +1,528 @@ +From 38c4f00adaa96be8c8b8ccfc1bc593f7350a4706 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:04 +0200 +Subject: netfilter: nf_tables: GC transaction API to avoid race with control + plane + +From: Pablo Neira Ayuso + +commit 5f68718b34a531a556f2f50300ead2862278da26 upstream. + +The set types rhashtable and rbtree use a GC worker to reclaim memory. +From system work queue, in periodic intervals, a scan of the table is +done. + +The major caveat here is that the nft transaction mutex is not held. +This causes a race between control plane and GC when they attempt to +delete the same element. + +We cannot grab the netlink mutex from the work queue, because the +control plane has to wait for the GC work queue in case the set is to be +removed, so we get following deadlock: + + cpu 1 cpu2 + GC work transaction comes in , lock nft mutex + `acquire nft mutex // BLOCKS + transaction asks to remove the set + set destruction calls cancel_work_sync() + +cancel_work_sync will now block forever, because it is waiting for the +mutex the caller already owns. + +This patch adds a new API that deals with garbage collection in two +steps: + +1) Lockless GC of expired elements sets on the NFT_SET_ELEM_DEAD_BIT + so they are not visible via lookup. Annotate current GC sequence in + the GC transaction. Enqueue GC transaction work as soon as it is + full. If ruleset is updated, then GC transaction is aborted and + retried later. + +2) GC work grabs the mutex. 
If GC sequence has changed then this GC + transaction lost race with control plane, abort it as it contains + stale references to objects and let GC try again later. If the + ruleset is intact, then this GC transaction deactivates and removes + the elements and it uses call_rcu() to destroy elements. + +Note that no elements are removed from GC lockless path, the _DEAD bit +is set and pointers are collected. GC catchall no longer removes the +elements either. There is a new set->dead flag that is set to +abort the GC transaction to deal with set->ops->destroy() path which +removes the remaining elements in the set from commit_release, where no +mutex is held. + +To deal with GC when mutex is held, which allows safe deactivate and +removal, add sync GC API which releases the set element object via +call_rcu(). This is used by rbtree and pipapo backends which also +perform garbage collection from control plane path. + +Since element removal from sets can happen from control plane and +element garbage collection/timeout, it is necessary to keep the set +structure alive until all elements have been deactivated and destroyed. + +We cannot do a cancel_work_sync or flush_work in nft_set_destroy because +it's called with the transaction mutex held, but the aforementioned async +work queue might be blocked on the very mutex that nft_set_destroy() +callchain is sitting on. + +This gives us the choice of ABBA deadlock or UaF. + +To avoid both, add set->refs refcount_t member. The GC API can then +increment the set refcount and release it once the elements have been +free'd. + +Set backends are adapted to use the GC transaction API in a follow-up +patch entitled: + + ("netfilter: nf_tables: use gc transaction API in set backends") + +This is joint work with Florian Westphal. 
+ +Fixes: cfed7e1b1f8e ("netfilter: nf_tables: add set garbage collection helpers") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_tables.h | 61 ++++++++- + net/netfilter/nf_tables_api.c | 216 ++++++++++++++++++++++++++++-- + 2 files changed, 267 insertions(+), 10 deletions(-) + +diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h +index a3068ed0f3169..39a0b37e8a1a2 100644 +--- a/include/net/netfilter/nf_tables.h ++++ b/include/net/netfilter/nf_tables.h +@@ -407,6 +407,7 @@ struct nft_set_type { + * + * @list: table set list node + * @bindings: list of set bindings ++ * @refs: internal refcounting for async set destruction + * @table: table this set belongs to + * @net: netnamespace this set belongs to + * @name: name of the set +@@ -436,6 +437,7 @@ struct nft_set_type { + struct nft_set { + struct list_head list; + struct list_head bindings; ++ refcount_t refs; + struct nft_table *table; + possible_net_t net; + char *name; +@@ -458,7 +460,8 @@ struct nft_set { + struct list_head pending_update; + /* runtime data below here */ + const struct nft_set_ops *ops ____cacheline_aligned; +- u16 flags:14, ++ u16 flags:13, ++ dead:1, + genmask:2; + u8 klen; + u8 dlen; +@@ -1450,6 +1453,32 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) + clear_bit(NFT_SET_ELEM_BUSY_BIT, word); + } + ++#define NFT_SET_ELEM_DEAD_MASK (1 << 3) ++ ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++#define NFT_SET_ELEM_DEAD_BIT 3 ++#elif defined(__BIG_ENDIAN_BITFIELD) ++#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3) ++#else ++#error ++#endif ++ ++static inline void nft_set_elem_dead(struct nft_set_ext *ext) ++{ ++ unsigned long *word = (unsigned long *)ext; ++ ++ BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); ++ set_bit(NFT_SET_ELEM_DEAD_BIT, word); ++} ++ ++static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) ++{ ++ unsigned long *word = (unsigned long 
*)ext; ++ ++ BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); ++ return test_bit(NFT_SET_ELEM_DEAD_BIT, word); ++} ++ + /** + * struct nft_trans - nf_tables object update in transaction + * +@@ -1575,6 +1604,35 @@ struct nft_trans_flowtable { + #define nft_trans_flowtable_flags(trans) \ + (((struct nft_trans_flowtable *)trans->data)->flags) + ++#define NFT_TRANS_GC_BATCHCOUNT 256 ++ ++struct nft_trans_gc { ++ struct list_head list; ++ struct net *net; ++ struct nft_set *set; ++ u32 seq; ++ u8 count; ++ void *priv[NFT_TRANS_GC_BATCHCOUNT]; ++ struct rcu_head rcu; ++}; ++ ++struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, ++ unsigned int gc_seq, gfp_t gfp); ++void nft_trans_gc_destroy(struct nft_trans_gc *trans); ++ ++struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, ++ unsigned int gc_seq, gfp_t gfp); ++void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); ++ ++struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); ++void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); ++ ++void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); ++ ++void nft_setelem_data_deactivate(const struct net *net, ++ const struct nft_set *set, ++ struct nft_set_elem *elem); ++ + int __init nft_chain_filter_init(void); + void nft_chain_filter_fini(void); + +@@ -1595,6 +1653,7 @@ struct nftables_pernet { + struct mutex commit_mutex; + unsigned int base_seq; + u8 validate_state; ++ unsigned int gc_seq; + }; + + #endif /* _NET_NF_TABLES_H */ +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 5eef671578a25..1f06dd065d75e 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -32,7 +32,9 @@ static LIST_HEAD(nf_tables_expressions); + static LIST_HEAD(nf_tables_objects); + static LIST_HEAD(nf_tables_flowtables); + static LIST_HEAD(nf_tables_destroy_list); ++static LIST_HEAD(nf_tables_gc_list); + static 
DEFINE_SPINLOCK(nf_tables_destroy_list_lock); ++static DEFINE_SPINLOCK(nf_tables_gc_list_lock); + static u64 table_handle; + + enum { +@@ -124,6 +126,9 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state) + static void nf_tables_trans_destroy_work(struct work_struct *w); + static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); + ++static void nft_trans_gc_work(struct work_struct *work); ++static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); ++ + static void nft_ctx_init(struct nft_ctx *ctx, + struct net *net, + const struct sk_buff *skb, +@@ -559,10 +564,6 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, + return 0; + } + +-static void nft_setelem_data_deactivate(const struct net *net, +- const struct nft_set *set, +- struct nft_set_elem *elem); +- + static int nft_mapelem_deactivate(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, +@@ -4474,6 +4475,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, + } + + INIT_LIST_HEAD(&set->bindings); ++ refcount_set(&set->refs, 1); + set->table = table; + write_pnet(&set->net, net); + set->ops = ops; +@@ -4534,6 +4536,14 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, + return err; + } + ++static void nft_set_put(struct nft_set *set) ++{ ++ if (refcount_dec_and_test(&set->refs)) { ++ kfree(set->name); ++ kvfree(set); ++ } ++} ++ + static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) + { + if (WARN_ON(set->use > 0)) +@@ -4543,8 +4553,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) + nft_expr_destroy(ctx, set->expr); + + set->ops->destroy(ctx, set); +- kfree(set->name); +- kvfree(set); ++ nft_set_put(set); + } + + static int nf_tables_delset(struct net *net, struct sock *nlsk, +@@ -5768,9 +5777,9 @@ static void nft_setelem_data_activate(const struct net *net, + nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use); + } + +-static void 
nft_setelem_data_deactivate(const struct net *net, +- const struct nft_set *set, +- struct nft_set_elem *elem) ++void nft_setelem_data_deactivate(const struct net *net, ++ const struct nft_set *set, ++ struct nft_set_elem *elem) + { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + +@@ -8002,6 +8011,179 @@ void nft_chain_del(struct nft_chain *chain) + list_del_rcu(&chain->list); + } + ++static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx, ++ struct nft_trans_gc *trans) ++{ ++ void **priv = trans->priv; ++ unsigned int i; ++ ++ for (i = 0; i < trans->count; i++) { ++ struct nft_set_elem elem = { ++ .priv = priv[i], ++ }; ++ ++ nft_setelem_data_deactivate(ctx->net, trans->set, &elem); ++ trans->set->ops->remove(trans->net, trans->set, &elem); ++ } ++} ++ ++void nft_trans_gc_destroy(struct nft_trans_gc *trans) ++{ ++ nft_set_put(trans->set); ++ put_net(trans->net); ++ kfree(trans); ++} ++ ++static void nft_trans_gc_trans_free(struct rcu_head *rcu) ++{ ++ struct nft_set_elem elem = {}; ++ struct nft_trans_gc *trans; ++ struct nft_ctx ctx = {}; ++ unsigned int i; ++ ++ trans = container_of(rcu, struct nft_trans_gc, rcu); ++ ctx.net = read_pnet(&trans->set->net); ++ ++ for (i = 0; i < trans->count; i++) { ++ elem.priv = trans->priv[i]; ++ atomic_dec(&trans->set->nelems); ++ ++ nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv); ++ } ++ ++ nft_trans_gc_destroy(trans); ++} ++ ++static bool nft_trans_gc_work_done(struct nft_trans_gc *trans) ++{ ++ struct nftables_pernet *nft_net; ++ struct nft_ctx ctx = {}; ++ ++ nft_net = net_generic(trans->net, nf_tables_net_id); ++ ++ mutex_lock(&nft_net->commit_mutex); ++ ++ /* Check for race with transaction, otherwise this batch refers to ++ * stale objects that might not be there anymore. Skip transaction if ++ * set has been destroyed from control plane transaction in case gc ++ * worker loses race. 
++ */ ++ if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) { ++ mutex_unlock(&nft_net->commit_mutex); ++ return false; ++ } ++ ++ ctx.net = trans->net; ++ ctx.table = trans->set->table; ++ ++ nft_trans_gc_setelem_remove(&ctx, trans); ++ mutex_unlock(&nft_net->commit_mutex); ++ ++ return true; ++} ++ ++static void nft_trans_gc_work(struct work_struct *work) ++{ ++ struct nft_trans_gc *trans, *next; ++ LIST_HEAD(trans_gc_list); ++ ++ spin_lock(&nf_tables_destroy_list_lock); ++ list_splice_init(&nf_tables_gc_list, &trans_gc_list); ++ spin_unlock(&nf_tables_destroy_list_lock); ++ ++ list_for_each_entry_safe(trans, next, &trans_gc_list, list) { ++ list_del(&trans->list); ++ if (!nft_trans_gc_work_done(trans)) { ++ nft_trans_gc_destroy(trans); ++ continue; ++ } ++ call_rcu(&trans->rcu, nft_trans_gc_trans_free); ++ } ++} ++ ++struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, ++ unsigned int gc_seq, gfp_t gfp) ++{ ++ struct net *net = read_pnet(&set->net); ++ struct nft_trans_gc *trans; ++ ++ trans = kzalloc(sizeof(*trans), gfp); ++ if (!trans) ++ return NULL; ++ ++ refcount_inc(&set->refs); ++ trans->set = set; ++ trans->net = get_net(net); ++ trans->seq = gc_seq; ++ ++ return trans; ++} ++ ++void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv) ++{ ++ trans->priv[trans->count++] = priv; ++} ++ ++static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) ++{ ++ spin_lock(&nf_tables_gc_list_lock); ++ list_add_tail(&trans->list, &nf_tables_gc_list); ++ spin_unlock(&nf_tables_gc_list_lock); ++ ++ schedule_work(&trans_gc_work); ++} ++ ++static int nft_trans_gc_space(struct nft_trans_gc *trans) ++{ ++ return NFT_TRANS_GC_BATCHCOUNT - trans->count; ++} ++ ++struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, ++ unsigned int gc_seq, gfp_t gfp) ++{ ++ if (nft_trans_gc_space(gc)) ++ return gc; ++ ++ nft_trans_gc_queue_work(gc); ++ ++ return nft_trans_gc_alloc(gc->set, gc_seq, gfp); ++} ++ ++void 
nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) ++{ ++ if (trans->count == 0) { ++ nft_trans_gc_destroy(trans); ++ return; ++ } ++ ++ nft_trans_gc_queue_work(trans); ++} ++ ++struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) ++{ ++ if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) ++ return NULL; ++ ++ if (nft_trans_gc_space(gc)) ++ return gc; ++ ++ call_rcu(&gc->rcu, nft_trans_gc_trans_free); ++ ++ return nft_trans_gc_alloc(gc->set, 0, gfp); ++} ++ ++void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) ++{ ++ WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net)); ++ ++ if (trans->count == 0) { ++ nft_trans_gc_destroy(trans); ++ return; ++ } ++ ++ call_rcu(&trans->rcu, nft_trans_gc_trans_free); ++} ++ + static void nf_tables_module_autoload_cleanup(struct net *net) + { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); +@@ -8168,6 +8350,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) + struct nft_trans_elem *te; + struct nft_chain *chain; + struct nft_table *table; ++ unsigned int gc_seq; + LIST_HEAD(adl); + int err; + +@@ -8240,6 +8423,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) + while (++nft_net->base_seq == 0) + ; + ++ /* Bump gc counter, it becomes odd, this is the busy mark. */ ++ gc_seq = READ_ONCE(nft_net->gc_seq); ++ WRITE_ONCE(nft_net->gc_seq, ++gc_seq); ++ + /* step 3. Start new generation, rules_gen_X now in use. 
*/ + net->nft.gencursor = nft_gencursor_next(net); + +@@ -8319,6 +8506,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) + nft_trans_destroy(trans); + break; + case NFT_MSG_DELSET: ++ nft_trans_set(trans)->dead = 1; + list_del_rcu(&nft_trans_set(trans)->list); + nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), + NFT_MSG_DELSET, GFP_KERNEL); +@@ -8416,6 +8604,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) + nft_commit_notify(net, NETLINK_CB(skb).portid); + nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + nf_tables_commit_audit_log(&adl, nft_net->base_seq); ++ ++ WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + nf_tables_commit_release(net); + + return 0; +@@ -9353,6 +9543,7 @@ static int __net_init nf_tables_init_net(struct net *net) + mutex_init(&nft_net->commit_mutex); + nft_net->base_seq = 1; + nft_net->validate_state = NFT_VALIDATE_SKIP; ++ nft_net->gc_seq = 0; + + return 0; + } +@@ -9380,10 +9571,16 @@ static void __net_exit nf_tables_exit_net(struct net *net) + WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); + } + ++static void nf_tables_exit_batch(struct list_head *net_exit_list) ++{ ++ flush_work(&trans_gc_work); ++} ++ + static struct pernet_operations nf_tables_net_ops = { + .init = nf_tables_init_net, + .pre_exit = nf_tables_pre_exit_net, + .exit = nf_tables_exit_net, ++ .exit_batch = nf_tables_exit_batch, + .id = &nf_tables_net_id, + .size = sizeof(struct nftables_pernet), + }; +@@ -9448,6 +9645,7 @@ static void __exit nf_tables_module_exit(void) + nft_chain_filter_fini(); + nft_chain_route_fini(); + unregister_pernet_subsys(&nf_tables_net_ops); ++ cancel_work_sync(&trans_gc_work); + cancel_work_sync(&trans_destroy_work); + rcu_barrier(); + rhltable_destroy(&nft_objname_ht); +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nf_tables-gc-transaction-race-with-abort-p.patch b/queue-5.10/netfilter-nf_tables-gc-transaction-race-with-abort-p.patch new file mode 100644 index 00000000000..444914519f3 --- /dev/null 
+++ b/queue-5.10/netfilter-nf_tables-gc-transaction-race-with-abort-p.patch @@ -0,0 +1,41 @@ +From 0a32bcedfeaf611c24b5f3b227aaec6670db517d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:11 +0200 +Subject: netfilter: nf_tables: GC transaction race with abort path + +From: Pablo Neira Ayuso + +commit 720344340fb9be2765bbaab7b292ece0a4570eae upstream. + +Abort path is missing a synchronization point with GC transactions. Add +GC sequence number hence any GC transaction losing race will be +discarded. + +Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_tables_api.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 78bf82f89ecd8..1f67931b86d8e 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -8844,7 +8844,12 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, + enum nfnl_abort_action action) + { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); +- int ret = __nf_tables_abort(net, action); ++ unsigned int gc_seq; ++ int ret; ++ ++ gc_seq = nft_gc_seq_begin(nft_net); ++ ret = __nf_tables_abort(net, action); ++ nft_gc_seq_end(nft_net, gc_seq); + + mutex_unlock(&nft_net->commit_mutex); + +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nf_tables-gc-transaction-race-with-netns-d.patch b/queue-5.10/netfilter-nf_tables-gc-transaction-race-with-netns-d.patch new file mode 100644 index 00000000000..b127e3e3e38 --- /dev/null +++ b/queue-5.10/netfilter-nf_tables-gc-transaction-race-with-netns-d.patch @@ -0,0 +1,42 @@ +From bb1c9a36337d403251b692bd474df25abde91129 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:10 +0200 +Subject: netfilter: nf_tables: GC transaction race with netns dismantle + +From: Pablo Neira Ayuso + +commit 
02c6c24402bf1c1e986899c14ba22a10b510916b upstream. + +Use maybe_get_net() since GC workqueue might race with netns exit path. + +Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Florian Westphal +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_tables_api.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 43da2f0a52623..78bf82f89ecd8 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -8089,9 +8089,14 @@ struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + if (!trans) + return NULL; + ++ trans->net = maybe_get_net(net); ++ if (!trans->net) { ++ kfree(trans); ++ return NULL; ++ } ++ + refcount_inc(&set->refs); + trans->set = set; +- trans->net = get_net(net); + trans->seq = gc_seq; + + return trans; +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nf_tables-integrate-pipapo-into-commit-pro.patch b/queue-5.10/netfilter-nf_tables-integrate-pipapo-into-commit-pro.patch new file mode 100644 index 00000000000..c921cc57631 --- /dev/null +++ b/queue-5.10/netfilter-nf_tables-integrate-pipapo-into-commit-pro.patch @@ -0,0 +1,316 @@ +From f8617e22dc6cf135765c8c6e1efc5d2cc6c852da Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:02 +0200 +Subject: netfilter: nf_tables: integrate pipapo into commit protocol + +From: Pablo Neira Ayuso + +commit 212ed75dc5fb9d1423b3942c8f872a868cda3466 upstream. + +The pipapo set backend follows copy-on-update approach, maintaining one +clone of the existing datastructure that is being updated. The clone +and current datastructures are swapped via rcu from the commit step. + +The existing integration with the commit protocol is flawed because +there is no operation to clean up the clone if the transaction is +aborted. Moreover, the datastructure swap happens on set element +activation. 
+ +This patch adds two new operations for sets: commit and abort, these new +operations are invoked from the commit and abort steps, after the +transactions have been digested, and it updates the pipapo set backend +to use it. + +This patch adds a new ->pending_update field to sets to maintain a list +of sets that require this new commit and abort operations. + +Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_tables.h | 4 ++- + net/netfilter/nf_tables_api.c | 56 +++++++++++++++++++++++++++++++ + net/netfilter/nft_set_pipapo.c | 55 +++++++++++++++++++++--------- + 3 files changed, 99 insertions(+), 16 deletions(-) + +diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h +index eec29dd6681ca..a3068ed0f3169 100644 +--- a/include/net/netfilter/nf_tables.h ++++ b/include/net/netfilter/nf_tables.h +@@ -373,7 +373,8 @@ struct nft_set_ops { + const struct nft_set *set, + const struct nft_set_elem *elem, + unsigned int flags); +- ++ void (*commit)(const struct nft_set *set); ++ void (*abort)(const struct nft_set *set); + u64 (*privsize)(const struct nlattr * const nla[], + const struct nft_set_desc *desc); + bool (*estimate)(const struct nft_set_desc *desc, +@@ -454,6 +455,7 @@ struct nft_set { + u16 udlen; + unsigned char *udata; + struct nft_expr *expr; ++ struct list_head pending_update; + /* runtime data below here */ + const struct nft_set_ops *ops ____cacheline_aligned; + u16 flags:14, +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 2669999d1bc9c..430dcd0f6c3b2 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -4509,6 +4509,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, + } + + set->handle = nf_tables_alloc_handle(table); ++ INIT_LIST_HEAD(&set->pending_update); + + err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); + 
if (err < 0) +@@ -8141,10 +8142,25 @@ static void nf_tables_commit_audit_log(struct list_head *adl, u32 generation) + } + } + ++static void nft_set_commit_update(struct list_head *set_update_list) ++{ ++ struct nft_set *set, *next; ++ ++ list_for_each_entry_safe(set, next, set_update_list, pending_update) { ++ list_del_init(&set->pending_update); ++ ++ if (!set->ops->commit) ++ continue; ++ ++ set->ops->commit(set); ++ } ++} ++ + static int nf_tables_commit(struct net *net, struct sk_buff *skb) + { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nft_trans *trans, *next; ++ LIST_HEAD(set_update_list); + struct nft_trans_elem *te; + struct nft_chain *chain; + struct nft_table *table; +@@ -8310,6 +8326,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) + nf_tables_setelem_notify(&trans->ctx, te->set, + &te->elem, + NFT_MSG_NEWSETELEM, 0); ++ if (te->set->ops->commit && ++ list_empty(&te->set->pending_update)) { ++ list_add_tail(&te->set->pending_update, ++ &set_update_list); ++ } + nft_trans_destroy(trans); + break; + case NFT_MSG_DELSETELEM: +@@ -8321,6 +8342,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) + te->set->ops->remove(net, te->set, &te->elem); + atomic_dec(&te->set->nelems); + te->set->ndeact--; ++ if (te->set->ops->commit && ++ list_empty(&te->set->pending_update)) { ++ list_add_tail(&te->set->pending_update, ++ &set_update_list); ++ } + break; + case NFT_MSG_NEWOBJ: + if (nft_trans_obj_update(trans)) { +@@ -8381,6 +8407,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) + } + } + ++ nft_set_commit_update(&set_update_list); ++ + nft_commit_notify(net, NETLINK_CB(skb).portid); + nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + nf_tables_commit_audit_log(&adl, nft_net->base_seq); +@@ -8437,10 +8465,25 @@ static void nf_tables_abort_release(struct nft_trans *trans) + kfree(trans); + } + ++static void nft_set_abort_update(struct list_head *set_update_list) ++{ 
++ struct nft_set *set, *next; ++ ++ list_for_each_entry_safe(set, next, set_update_list, pending_update) { ++ list_del_init(&set->pending_update); ++ ++ if (!set->ops->abort) ++ continue; ++ ++ set->ops->abort(set); ++ } ++} ++ + static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) + { + struct nftables_pernet *nft_net = net_generic(net, nf_tables_net_id); + struct nft_trans *trans, *next; ++ LIST_HEAD(set_update_list); + struct nft_trans_elem *te; + + if (action == NFNL_ABORT_VALIDATE && +@@ -8529,6 +8572,12 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) + te = (struct nft_trans_elem *)trans->data; + te->set->ops->remove(net, te->set, &te->elem); + atomic_dec(&te->set->nelems); ++ ++ if (te->set->ops->abort && ++ list_empty(&te->set->pending_update)) { ++ list_add_tail(&te->set->pending_update, ++ &set_update_list); ++ } + break; + case NFT_MSG_DELSETELEM: + te = (struct nft_trans_elem *)trans->data; +@@ -8537,6 +8586,11 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) + te->set->ops->activate(net, te->set, &te->elem); + te->set->ndeact--; + ++ if (te->set->ops->abort && ++ list_empty(&te->set->pending_update)) { ++ list_add_tail(&te->set->pending_update, ++ &set_update_list); ++ } + nft_trans_destroy(trans); + break; + case NFT_MSG_NEWOBJ: +@@ -8577,6 +8631,8 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) + } + } + ++ nft_set_abort_update(&set_update_list); ++ + synchronize_rcu(); + + list_for_each_entry_safe_reverse(trans, next, +diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c +index 50f840e312b03..ce6c07ea7244b 100644 +--- a/net/netfilter/nft_set_pipapo.c ++++ b/net/netfilter/nft_set_pipapo.c +@@ -1603,17 +1603,10 @@ static void pipapo_free_fields(struct nft_pipapo_match *m) + } + } + +-/** +- * pipapo_reclaim_match - RCU callback to free fields from old matching data +- * @rcu: RCU head +- */ +-static void 
pipapo_reclaim_match(struct rcu_head *rcu) ++static void pipapo_free_match(struct nft_pipapo_match *m) + { +- struct nft_pipapo_match *m; + int i; + +- m = container_of(rcu, struct nft_pipapo_match, rcu); +- + for_each_possible_cpu(i) + kfree(*per_cpu_ptr(m->scratch, i)); + +@@ -1628,7 +1621,19 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) + } + + /** +- * pipapo_commit() - Replace lookup data with current working copy ++ * pipapo_reclaim_match - RCU callback to free fields from old matching data ++ * @rcu: RCU head ++ */ ++static void pipapo_reclaim_match(struct rcu_head *rcu) ++{ ++ struct nft_pipapo_match *m; ++ ++ m = container_of(rcu, struct nft_pipapo_match, rcu); ++ pipapo_free_match(m); ++} ++ ++/** ++ * nft_pipapo_commit() - Replace lookup data with current working copy + * @set: nftables API set representation + * + * While at it, check if we should perform garbage collection on the working +@@ -1638,7 +1643,7 @@ static void pipapo_reclaim_match(struct rcu_head *rcu) + * We also need to create a new working copy for subsequent insertions and + * deletions. 
+ */ +-static void pipapo_commit(const struct nft_set *set) ++static void nft_pipapo_commit(const struct nft_set *set) + { + struct nft_pipapo *priv = nft_set_priv(set); + struct nft_pipapo_match *new_clone, *old; +@@ -1663,6 +1668,26 @@ static void pipapo_commit(const struct nft_set *set) + priv->clone = new_clone; + } + ++static void nft_pipapo_abort(const struct nft_set *set) ++{ ++ struct nft_pipapo *priv = nft_set_priv(set); ++ struct nft_pipapo_match *new_clone, *m; ++ ++ if (!priv->dirty) ++ return; ++ ++ m = rcu_dereference(priv->match); ++ ++ new_clone = pipapo_clone(m); ++ if (IS_ERR(new_clone)) ++ return; ++ ++ priv->dirty = false; ++ ++ pipapo_free_match(priv->clone); ++ priv->clone = new_clone; ++} ++ + /** + * nft_pipapo_activate() - Mark element reference as active given key, commit + * @net: Network namespace +@@ -1670,8 +1695,7 @@ static void pipapo_commit(const struct nft_set *set) + * @elem: nftables API element representation containing key data + * + * On insertion, elements are added to a copy of the matching data currently +- * in use for lookups, and not directly inserted into current lookup data, so +- * we'll take care of that by calling pipapo_commit() here. Both ++ * in use for lookups, and not directly inserted into current lookup data. Both + * nft_pipapo_insert() and nft_pipapo_activate() are called once for each + * element, hence we can't purpose either one as a real commit operation. 
+ */ +@@ -1687,8 +1711,6 @@ static void nft_pipapo_activate(const struct net *net, + + nft_set_elem_change_active(net, set, &e->ext); + nft_set_elem_clear_busy(&e->ext); +- +- pipapo_commit(set); + } + + /** +@@ -1938,7 +1960,6 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, + if (i == m->field_count) { + priv->dirty = true; + pipapo_drop(m, rulemap); +- pipapo_commit(set); + return; + } + +@@ -2245,6 +2266,8 @@ const struct nft_set_type nft_set_pipapo_type = { + .init = nft_pipapo_init, + .destroy = nft_pipapo_destroy, + .gc_init = nft_pipapo_gc_init, ++ .commit = nft_pipapo_commit, ++ .abort = nft_pipapo_abort, + .elemsize = offsetof(struct nft_pipapo_elem, ext), + }, + }; +@@ -2267,6 +2290,8 @@ const struct nft_set_type nft_set_pipapo_avx2_type = { + .init = nft_pipapo_init, + .destroy = nft_pipapo_destroy, + .gc_init = nft_pipapo_gc_init, ++ .commit = nft_pipapo_commit, ++ .abort = nft_pipapo_abort, + .elemsize = offsetof(struct nft_pipapo_elem, ext), + }, + }; +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nf_tables-remove-busy-mark-and-gc-batch-ap.patch b/queue-5.10/netfilter-nf_tables-remove-busy-mark-and-gc-batch-ap.patch new file mode 100644 index 00000000000..f1a65ab3aa5 --- /dev/null +++ b/queue-5.10/netfilter-nf_tables-remove-busy-mark-and-gc-batch-ap.patch @@ -0,0 +1,184 @@ +From 344587286eea4cd70d285f25be6e9b2bc108a737 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:07 +0200 +Subject: netfilter: nf_tables: remove busy mark and gc batch API + +From: Pablo Neira Ayuso + +commit a2dd0233cbc4d8a0abb5f64487487ffc9265beb5 upstream. + +Ditch it, it has been replace it by the GC transaction API and it has no +clients anymore. 
+ +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_tables.h | 97 +------------------------------ + net/netfilter/nf_tables_api.c | 26 +-------- + 2 files changed, 5 insertions(+), 118 deletions(-) + +diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h +index 39a0b37e8a1a2..9182b583d4297 100644 +--- a/include/net/netfilter/nf_tables.h ++++ b/include/net/netfilter/nf_tables.h +@@ -695,62 +695,6 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, + void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, void *elem); + +-/** +- * struct nft_set_gc_batch_head - nf_tables set garbage collection batch +- * +- * @rcu: rcu head +- * @set: set the elements belong to +- * @cnt: count of elements +- */ +-struct nft_set_gc_batch_head { +- struct rcu_head rcu; +- const struct nft_set *set; +- unsigned int cnt; +-}; +- +-#define NFT_SET_GC_BATCH_SIZE ((PAGE_SIZE - \ +- sizeof(struct nft_set_gc_batch_head)) / \ +- sizeof(void *)) +- +-/** +- * struct nft_set_gc_batch - nf_tables set garbage collection batch +- * +- * @head: GC batch head +- * @elems: garbage collection elements +- */ +-struct nft_set_gc_batch { +- struct nft_set_gc_batch_head head; +- void *elems[NFT_SET_GC_BATCH_SIZE]; +-}; +- +-struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, +- gfp_t gfp); +-void nft_set_gc_batch_release(struct rcu_head *rcu); +- +-static inline void nft_set_gc_batch_complete(struct nft_set_gc_batch *gcb) +-{ +- if (gcb != NULL) +- call_rcu(&gcb->head.rcu, nft_set_gc_batch_release); +-} +- +-static inline struct nft_set_gc_batch * +-nft_set_gc_batch_check(const struct nft_set *set, struct nft_set_gc_batch *gcb, +- gfp_t gfp) +-{ +- if (gcb != NULL) { +- if (gcb->head.cnt + 1 < ARRAY_SIZE(gcb->elems)) +- return gcb; +- nft_set_gc_batch_complete(gcb); +- } +- return nft_set_gc_batch_alloc(set, gfp); +-} +- +-static inline void 
nft_set_gc_batch_add(struct nft_set_gc_batch *gcb, +- void *elem) +-{ +- gcb->elems[gcb->head.cnt++] = elem; +-} +- + struct nft_expr_ops; + /** + * struct nft_expr_type - nf_tables expression type +@@ -1418,47 +1362,12 @@ static inline void nft_set_elem_change_active(const struct net *net, + + #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ + +-/* +- * We use a free bit in the genmask field to indicate the element +- * is busy, meaning it is currently being processed either by +- * the netlink API or GC. +- * +- * Even though the genmask is only a single byte wide, this works +- * because the extension structure if fully constant once initialized, +- * so there are no non-atomic write accesses unless it is already +- * marked busy. +- */ +-#define NFT_SET_ELEM_BUSY_MASK (1 << 2) +- +-#if defined(__LITTLE_ENDIAN_BITFIELD) +-#define NFT_SET_ELEM_BUSY_BIT 2 +-#elif defined(__BIG_ENDIAN_BITFIELD) +-#define NFT_SET_ELEM_BUSY_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) +-#else +-#error +-#endif +- +-static inline int nft_set_elem_mark_busy(struct nft_set_ext *ext) +-{ +- unsigned long *word = (unsigned long *)ext; +- +- BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); +- return test_and_set_bit(NFT_SET_ELEM_BUSY_BIT, word); +-} +- +-static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) +-{ +- unsigned long *word = (unsigned long *)ext; +- +- clear_bit(NFT_SET_ELEM_BUSY_BIT, word); +-} +- +-#define NFT_SET_ELEM_DEAD_MASK (1 << 3) ++#define NFT_SET_ELEM_DEAD_MASK (1 << 2) + + #if defined(__LITTLE_ENDIAN_BITFIELD) +-#define NFT_SET_ELEM_DEAD_BIT 3 ++#define NFT_SET_ELEM_DEAD_BIT 2 + #elif defined(__BIG_ENDIAN_BITFIELD) +-#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3) ++#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) + #else + #error + #endif +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 1f06dd065d75e..206755eb35f3a 100644 +--- a/net/netfilter/nf_tables_api.c ++++ 
b/net/netfilter/nf_tables_api.c +@@ -5637,7 +5637,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, + goto err_elem_expr; + } + +- ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; ++ ext->genmask = nft_genmask_cur(ctx->net); ++ + err = set->ops->insert(ctx->net, set, &elem, &ext2); + if (err) { + if (err == -EEXIST) { +@@ -5945,29 +5946,6 @@ static int nf_tables_delsetelem(struct net *net, struct sock *nlsk, + return err; + } + +-void nft_set_gc_batch_release(struct rcu_head *rcu) +-{ +- struct nft_set_gc_batch *gcb; +- unsigned int i; +- +- gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu); +- for (i = 0; i < gcb->head.cnt; i++) +- nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true); +- kfree(gcb); +-} +- +-struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, +- gfp_t gfp) +-{ +- struct nft_set_gc_batch *gcb; +- +- gcb = kzalloc(sizeof(*gcb), gfp); +- if (gcb == NULL) +- return gcb; +- gcb->head.set = set; +- return gcb; +-} +- + /* + * Stateful objects + */ +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nf_tables-use-correct-lock-to-protect-gc_l.patch b/queue-5.10/netfilter-nf_tables-use-correct-lock-to-protect-gc_l.patch new file mode 100644 index 00000000000..ab181da1542 --- /dev/null +++ b/queue-5.10/netfilter-nf_tables-use-correct-lock-to-protect-gc_l.patch @@ -0,0 +1,38 @@ +From 881dd729cbf8d7a87b68450c70ab9d930c39b393 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:12 +0200 +Subject: netfilter: nf_tables: use correct lock to protect gc_list + +From: Pablo Neira Ayuso + +commit 8357bc946a2abc2a10ca40e5a2105d2b4c57515e upstream. + +Use nf_tables_gc_list_lock spinlock, not nf_tables_destroy_list_lock to +protect the gc_list. 
+ +Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_tables_api.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 1f67931b86d8e..9fc302a6836ba 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -8065,9 +8065,9 @@ static void nft_trans_gc_work(struct work_struct *work) + struct nft_trans_gc *trans, *next; + LIST_HEAD(trans_gc_list); + +- spin_lock(&nf_tables_destroy_list_lock); ++ spin_lock(&nf_tables_gc_list_lock); + list_splice_init(&nf_tables_gc_list, &trans_gc_list); +- spin_unlock(&nf_tables_destroy_list_lock); ++ spin_unlock(&nf_tables_gc_list_lock); + + list_for_each_entry_safe(trans, next, &trans_gc_list, list) { + list_del(&trans->list); +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nft_set_hash-mark-set-element-as-dead-when.patch b/queue-5.10/netfilter-nft_set_hash-mark-set-element-as-dead-when.patch new file mode 100644 index 00000000000..0e3c034cfe5 --- /dev/null +++ b/queue-5.10/netfilter-nft_set_hash-mark-set-element-as-dead-when.patch @@ -0,0 +1,49 @@ +From aa5597dda62c3f544aad122b646afc5b979526e8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:06 +0200 +Subject: netfilter: nft_set_hash: mark set element as dead when deleting from + packet path + +From: Pablo Neira Ayuso + +commit c92db3030492b8ad1d0faace7a93bbcf53850d0c upstream. + +Set on the NFT_SET_ELEM_DEAD_BIT flag on this element, instead of +performing element removal which might race with an ongoing transaction. +Enable gc when dynamic flag is set on since dynset deletion requires +garbage collection after this patch. 
+ +Fixes: d0a8d877da97 ("netfilter: nft_dynset: support for element deletion") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_set_hash.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c +index 6ae99b3107bc9..9cdf348b048a4 100644 +--- a/net/netfilter/nft_set_hash.c ++++ b/net/netfilter/nft_set_hash.c +@@ -251,7 +251,9 @@ static bool nft_rhash_delete(const struct nft_set *set, + if (he == NULL) + return false; + +- return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0; ++ nft_set_elem_dead(&he->ext); ++ ++ return true; + } + + static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, +@@ -398,7 +400,7 @@ static int nft_rhash_init(const struct nft_set *set, + return err; + + INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc); +- if (set->flags & NFT_SET_TIMEOUT) ++ if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL)) + nft_rhash_gc_init(set); + + return 0; +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nft_set_hash-try-later-when-gc-hits-eagain.patch b/queue-5.10/netfilter-nft_set_hash-try-later-when-gc-hits-eagain.patch new file mode 100644 index 00000000000..978954bd4d5 --- /dev/null +++ b/queue-5.10/netfilter-nft_set_hash-try-later-when-gc-hits-eagain.patch @@ -0,0 +1,42 @@ +From 9d1209c57ca27a0a606d525bf0e1f3781c1559ec Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:17 +0200 +Subject: netfilter: nft_set_hash: try later when GC hits EAGAIN on iteration + +From: Pablo Neira Ayuso + +commit b079155faae94e9b3ab9337e82100a914ebb4e8d upstream. + +Skip GC run if iterator rewinds to the beginning with EAGAIN, otherwise GC +might collect the same element more than once. 
+ +Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_set_hash.c | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c +index 68a16ee37b3d0..f0a9ad1c4ea44 100644 +--- a/net/netfilter/nft_set_hash.c ++++ b/net/netfilter/nft_set_hash.c +@@ -324,12 +324,9 @@ static void nft_rhash_gc(struct work_struct *work) + + while ((he = rhashtable_walk_next(&hti))) { + if (IS_ERR(he)) { +- if (PTR_ERR(he) != -EAGAIN) { +- nft_trans_gc_destroy(gc); +- gc = NULL; +- goto try_later; +- } +- continue; ++ nft_trans_gc_destroy(gc); ++ gc = NULL; ++ goto try_later; + } + + /* Ruleset has been updated, try later. */ +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nft_set_pipapo-stop-gc-iteration-if-gc-tra.patch b/queue-5.10/netfilter-nft_set_pipapo-stop-gc-iteration-if-gc-tra.patch new file mode 100644 index 00000000000..79f5ecee58b --- /dev/null +++ b/queue-5.10/netfilter-nft_set_pipapo-stop-gc-iteration-if-gc-tra.patch @@ -0,0 +1,37 @@ +From 833ed97574f6983f9fec97128f55d361f60227af Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:16 +0200 +Subject: netfilter: nft_set_pipapo: stop GC iteration if GC transaction + allocation fails + +From: Pablo Neira Ayuso + +commit 6d365eabce3c018a80f6e0379b17df2abb17405e upstream. + +nft_trans_gc_queue_sync() enqueues the GC transaction and it allocates a +new one. If this allocation fails, then stop this GC sync run and retry +later. 
+ +Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_set_pipapo.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c +index 80440ac5d44c6..fbfcc3275cadf 100644 +--- a/net/netfilter/nft_set_pipapo.c ++++ b/net/netfilter/nft_set_pipapo.c +@@ -1595,7 +1595,7 @@ static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) + + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (!gc) +- break; ++ return; + + nft_pipapo_gc_deactivate(net, set, e); + pipapo_drop(m, rulemap); +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nft_set_rbtree-skip-sync-gc-for-new-elemen.patch b/queue-5.10/netfilter-nft_set_rbtree-skip-sync-gc-for-new-elemen.patch new file mode 100644 index 00000000000..a6b77734ff0 --- /dev/null +++ b/queue-5.10/netfilter-nft_set_rbtree-skip-sync-gc-for-new-elemen.patch @@ -0,0 +1,53 @@ +From 1ae5807f869d5f00a1099af2812b69cbcc525c00 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:14 +0200 +Subject: netfilter: nft_set_rbtree: skip sync GC for new elements in this + transaction + +From: Pablo Neira Ayuso + +commit 2ee52ae94baabf7ee09cf2a8d854b990dac5d0e4 upstream. + +New elements in this transaction might expire before such transaction +ends. Skip sync GC for such elements otherwise commit path might walk +over an already released object. Once transaction is finished, async GC +will collect such expired element.
+ +Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_set_rbtree.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c +index 9b0bdd4216152..535076b4de53d 100644 +--- a/net/netfilter/nft_set_rbtree.c ++++ b/net/netfilter/nft_set_rbtree.c +@@ -314,6 +314,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, + struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; + struct rb_node *node, *next, *parent, **p, *first = NULL; + struct nft_rbtree *priv = nft_set_priv(set); ++ u8 cur_genmask = nft_genmask_cur(net); + u8 genmask = nft_genmask_next(net); + int d, err; + +@@ -359,8 +360,11 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, + if (!nft_set_elem_active(&rbe->ext, genmask)) + continue; + +- /* perform garbage collection to avoid bogus overlap reports. */ +- if (nft_set_elem_expired(&rbe->ext)) { ++ /* perform garbage collection to avoid bogus overlap reports ++ * but skip new elements in this transaction. 
++ */ ++ if (nft_set_elem_expired(&rbe->ext) && ++ nft_set_elem_active(&rbe->ext, cur_genmask)) { + err = nft_rbtree_gc_elem(set, priv, rbe, genmask); + if (err < 0) + return err; +-- +2.40.1 + diff --git a/queue-5.10/netfilter-nft_set_rbtree-use-read-spinlock-to-avoid-.patch b/queue-5.10/netfilter-nft_set_rbtree-use-read-spinlock-to-avoid-.patch new file mode 100644 index 00000000000..9537bcdd692 --- /dev/null +++ b/queue-5.10/netfilter-nft_set_rbtree-use-read-spinlock-to-avoid-.patch @@ -0,0 +1,48 @@ +From c9a026771d8a46e978f8d2f48792fea2dde20b27 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 19:01:15 +0200 +Subject: netfilter: nft_set_rbtree: use read spinlock to avoid datapath + contention + +From: Pablo Neira Ayuso + +commit 96b33300fba880ec0eafcf3d82486f3463b4b6da upstream. + +rbtree GC does not modify the datastructure, instead it collects expired +elements and it enqueues a GC transaction. Use a read spinlock instead +to avoid data contention while GC worker is running. + +Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_set_rbtree.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c +index 535076b4de53d..cc32e19b4041a 100644 +--- a/net/netfilter/nft_set_rbtree.c ++++ b/net/netfilter/nft_set_rbtree.c +@@ -624,8 +624,7 @@ static void nft_rbtree_gc(struct work_struct *work) + if (!gc) + goto done; + +- write_lock_bh(&priv->lock); +- write_seqcount_begin(&priv->count); ++ read_lock_bh(&priv->lock); + for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { + + /* Ruleset has been updated, try later. 
*/ +@@ -672,8 +671,7 @@ static void nft_rbtree_gc(struct work_struct *work) + nft_trans_gc_elem_add(gc, rbe); + } + try_later: +- write_seqcount_end(&priv->count); +- write_unlock_bh(&priv->lock); ++ read_unlock_bh(&priv->lock); + + if (gc) + nft_trans_gc_queue_async_done(gc); +-- +2.40.1 + diff --git a/queue-5.10/series b/queue-5.10/series index e5f1b481266..73bdad8c4d8 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -13,3 +13,20 @@ ext4-replace-the-traditional-ternary-conditional-ope.patch ext4-move-setting-of-trimmed-bit-into-ext4_try_to_tr.patch ext4-do-not-let-fstrim-block-system-suspend.patch tracing-have-event-inject-files-inc-the-trace-array-.patch +netfilter-nf_tables-integrate-pipapo-into-commit-pro.patch +netfilter-nf_tables-don-t-skip-expired-elements-duri.patch +netfilter-nf_tables-gc-transaction-api-to-avoid-race.patch +netfilter-nf_tables-adapt-set-backend-to-use-gc-tran.patch +netfilter-nft_set_hash-mark-set-element-as-dead-when.patch +netfilter-nf_tables-remove-busy-mark-and-gc-batch-ap.patch +netfilter-nf_tables-don-t-fail-inserts-if-duplicate-.patch +netfilter-nf_tables-fix-gc-transaction-races-with-ne.patch +netfilter-nf_tables-gc-transaction-race-with-netns-d.patch +netfilter-nf_tables-gc-transaction-race-with-abort-p.patch +netfilter-nf_tables-use-correct-lock-to-protect-gc_l.patch +netfilter-nf_tables-defer-gc-run-if-previous-batch-i.patch +netfilter-nft_set_rbtree-skip-sync-gc-for-new-elemen.patch +netfilter-nft_set_rbtree-use-read-spinlock-to-avoid-.patch +netfilter-nft_set_pipapo-stop-gc-iteration-if-gc-tra.patch +netfilter-nft_set_hash-try-later-when-gc-hits-eagain.patch +netfilter-nf_tables-fix-memleak-when-more-than-255-e.patch