From: Sasha Levin Date: Sat, 23 Sep 2023 12:50:26 +0000 (-0400) Subject: Fixes for 5.15 X-Git-Tag: v6.5.6~106 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=debb252eebf944bffc1c76403bda62406d60e4d5;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 5.15 Signed-off-by: Sasha Levin --- diff --git a/queue-5.15/netfilter-nf_tables-adapt-set-backend-to-use-gc-tran.patch b/queue-5.15/netfilter-nf_tables-adapt-set-backend-to-use-gc-tran.patch new file mode 100644 index 00000000000..812b93091dd --- /dev/null +++ b/queue-5.15/netfilter-nf_tables-adapt-set-backend-to-use-gc-tran.patch @@ -0,0 +1,557 @@ +From f45e1f5f796e4c04a69525708ef596c738f7dd92 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:42:59 +0200 +Subject: netfilter: nf_tables: adapt set backend to use GC transaction API + +From: Pablo Neira Ayuso + +commit f6c383b8c31a93752a52697f8430a71dcbc46adf upstream. + +Use the GC transaction API to replace the old and buggy gc API and the +busy mark approach. + +No set elements are removed from async garbage collection anymore, +instead the _DEAD bit is set on so the set element is not visible from +lookup path anymore. Async GC enqueues transaction work that might be +aborted and retried later. + +rbtree and pipapo set backends does not set on the _DEAD bit from the +sync GC path since this runs in control plane path where mutex is held. +In this case, set elements are deactivated, removed and then released +via RCU callback, sync GC never fails. 
+ +Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") +Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") +Fixes: 9d0982927e79 ("netfilter: nft_hash: add support for timeouts") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_tables_api.c | 7 +- + net/netfilter/nft_set_hash.c | 77 +++++++++++------- + net/netfilter/nft_set_pipapo.c | 48 ++++++++--- + net/netfilter/nft_set_rbtree.c | 144 ++++++++++++++++++++------------- + 4 files changed, 173 insertions(+), 103 deletions(-) + +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index a389240a67f32..3749cbd7c1fdd 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -6008,7 +6008,6 @@ static void nft_setelem_activate(struct net *net, struct nft_set *set, + + if (nft_setelem_is_catchall(set, elem)) { + nft_set_elem_change_active(net, set, ext); +- nft_set_elem_clear_busy(ext); + } else { + set->ops->activate(net, set, elem); + } +@@ -6023,8 +6022,7 @@ static int nft_setelem_catchall_deactivate(const struct net *net, + + list_for_each_entry(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); +- if (!nft_is_active(net, ext) || +- nft_set_elem_mark_busy(ext)) ++ if (!nft_is_active(net, ext)) + continue; + + kfree(elem->priv); +@@ -6725,8 +6723,7 @@ static int nft_set_catchall_flush(const struct nft_ctx *ctx, + + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); +- if (!nft_set_elem_active(ext, genmask) || +- nft_set_elem_mark_busy(ext)) ++ if (!nft_set_elem_active(ext, genmask)) + continue; + + elem.priv = catchall->elem; +diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c +index 24caa31fa2310..2f067e4596b02 100644 +--- a/net/netfilter/nft_set_hash.c ++++ b/net/netfilter/nft_set_hash.c +@@ -59,6 +59,8 @@ static inline int nft_rhash_cmp(struct 
rhashtable_compare_arg *arg, + + if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) + return 1; ++ if (nft_set_elem_is_dead(&he->ext)) ++ return 1; + if (nft_set_elem_expired(&he->ext)) + return 1; + if (!nft_set_elem_active(&he->ext, x->genmask)) +@@ -188,7 +190,6 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, + struct nft_rhash_elem *he = elem->priv; + + nft_set_elem_change_active(net, set, &he->ext); +- nft_set_elem_clear_busy(&he->ext); + } + + static bool nft_rhash_flush(const struct net *net, +@@ -196,12 +197,9 @@ static bool nft_rhash_flush(const struct net *net, + { + struct nft_rhash_elem *he = priv; + +- if (!nft_set_elem_mark_busy(&he->ext) || +- !nft_is_active(net, &he->ext)) { +- nft_set_elem_change_active(net, set, &he->ext); +- return true; +- } +- return false; ++ nft_set_elem_change_active(net, set, &he->ext); ++ ++ return true; + } + + static void *nft_rhash_deactivate(const struct net *net, +@@ -218,9 +216,8 @@ static void *nft_rhash_deactivate(const struct net *net, + + rcu_read_lock(); + he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); +- if (he != NULL && +- !nft_rhash_flush(net, set, he)) +- he = NULL; ++ if (he) ++ nft_set_elem_change_active(net, set, &he->ext); + + rcu_read_unlock(); + +@@ -312,25 +309,48 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, + + static void nft_rhash_gc(struct work_struct *work) + { ++ struct nftables_pernet *nft_net; + struct nft_set *set; + struct nft_rhash_elem *he; + struct nft_rhash *priv; +- struct nft_set_gc_batch *gcb = NULL; + struct rhashtable_iter hti; ++ struct nft_trans_gc *gc; ++ struct net *net; ++ u32 gc_seq; + + priv = container_of(work, struct nft_rhash, gc_work.work); + set = nft_set_container_of(priv); ++ net = read_pnet(&set->net); ++ nft_net = nft_pernet(net); ++ gc_seq = READ_ONCE(nft_net->gc_seq); ++ ++ gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); ++ if (!gc) ++ goto done; + + 
rhashtable_walk_enter(&priv->ht, &hti); + rhashtable_walk_start(&hti); + + while ((he = rhashtable_walk_next(&hti))) { + if (IS_ERR(he)) { +- if (PTR_ERR(he) != -EAGAIN) +- break; ++ if (PTR_ERR(he) != -EAGAIN) { ++ nft_trans_gc_destroy(gc); ++ gc = NULL; ++ goto try_later; ++ } + continue; + } + ++ /* Ruleset has been updated, try later. */ ++ if (READ_ONCE(nft_net->gc_seq) != gc_seq) { ++ nft_trans_gc_destroy(gc); ++ gc = NULL; ++ goto try_later; ++ } ++ ++ if (nft_set_elem_is_dead(&he->ext)) ++ goto dead_elem; ++ + if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) && + nft_rhash_expr_needs_gc_run(set, &he->ext)) + goto needs_gc_run; +@@ -338,26 +358,26 @@ static void nft_rhash_gc(struct work_struct *work) + if (!nft_set_elem_expired(&he->ext)) + continue; + needs_gc_run: +- if (nft_set_elem_mark_busy(&he->ext)) +- continue; ++ nft_set_elem_dead(&he->ext); ++dead_elem: ++ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); ++ if (!gc) ++ goto try_later; + +- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); +- if (gcb == NULL) +- break; +- rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params); +- atomic_dec(&set->nelems); +- nft_set_gc_batch_add(gcb, he); ++ nft_trans_gc_elem_add(gc, he); + } ++ ++ gc = nft_trans_gc_catchall(gc, gc_seq); ++ ++try_later: ++ /* catchall list iteration requires rcu read side lock. 
*/ + rhashtable_walk_stop(&hti); + rhashtable_walk_exit(&hti); + +- he = nft_set_catchall_gc(set); +- if (he) { +- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); +- if (gcb) +- nft_set_gc_batch_add(gcb, he); +- } +- nft_set_gc_batch_complete(gcb); ++ if (gc) ++ nft_trans_gc_queue_async_done(gc); ++ ++done: + queue_delayed_work(system_power_efficient_wq, &priv->gc_work, + nft_set_gc_interval(set)); + } +@@ -420,7 +440,6 @@ static void nft_rhash_destroy(const struct nft_ctx *ctx, + }; + + cancel_delayed_work_sync(&priv->gc_work); +- rcu_barrier(); + rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, + (void *)&rhash_ctx); + } +diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c +index b6a994ba72f31..a307a227d28db 100644 +--- a/net/netfilter/nft_set_pipapo.c ++++ b/net/netfilter/nft_set_pipapo.c +@@ -1544,16 +1544,34 @@ static void pipapo_drop(struct nft_pipapo_match *m, + } + } + ++static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set, ++ struct nft_pipapo_elem *e) ++ ++{ ++ struct nft_set_elem elem = { ++ .priv = e, ++ }; ++ ++ nft_setelem_data_deactivate(net, set, &elem); ++} ++ + /** + * pipapo_gc() - Drop expired entries from set, destroy start and end elements + * @set: nftables API set representation + * @m: Matching data + */ +-static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) ++static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) + { ++ struct nft_set *set = (struct nft_set *) _set; + struct nft_pipapo *priv = nft_set_priv(set); ++ struct net *net = read_pnet(&set->net); + int rules_f0, first_rule = 0; + struct nft_pipapo_elem *e; ++ struct nft_trans_gc *gc; ++ ++ gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL); ++ if (!gc) ++ return; + + while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { + union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; +@@ -1577,13 +1595,20 @@ static void pipapo_gc(const struct nft_set *set, struct 
nft_pipapo_match *m) + f--; + i--; + e = f->mt[rulemap[i].to].e; +- if (nft_set_elem_expired(&e->ext) && +- !nft_set_elem_mark_busy(&e->ext)) { ++ ++ /* synchronous gc never fails, there is no need to set on ++ * NFT_SET_ELEM_DEAD_BIT. ++ */ ++ if (nft_set_elem_expired(&e->ext)) { + priv->dirty = true; +- pipapo_drop(m, rulemap); + +- rcu_barrier(); +- nft_set_elem_destroy(set, e, true); ++ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); ++ if (!gc) ++ break; ++ ++ nft_pipapo_gc_deactivate(net, set, e); ++ pipapo_drop(m, rulemap); ++ nft_trans_gc_elem_add(gc, e); + + /* And check again current first rule, which is now the + * first we haven't checked. +@@ -1593,11 +1618,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) + } + } + +- e = nft_set_catchall_gc(set); +- if (e) +- nft_set_elem_destroy(set, e, true); +- +- priv->last_gc = jiffies; ++ gc = nft_trans_gc_catchall(gc, 0); ++ if (gc) { ++ nft_trans_gc_queue_sync_done(gc); ++ priv->last_gc = jiffies; ++ } + } + + /** +@@ -1733,7 +1758,6 @@ static void nft_pipapo_activate(const struct net *net, + return; + + nft_set_elem_change_active(net, set, &e->ext); +- nft_set_elem_clear_busy(&e->ext); + } + + /** +diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c +index 39956e5341c9e..f9d4c8fcbbf82 100644 +--- a/net/netfilter/nft_set_rbtree.c ++++ b/net/netfilter/nft_set_rbtree.c +@@ -46,6 +46,12 @@ static int nft_rbtree_cmp(const struct nft_set *set, + set->klen); + } + ++static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe) ++{ ++ return nft_set_elem_expired(&rbe->ext) || ++ nft_set_elem_is_dead(&rbe->ext); ++} ++ + static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, + const u32 *key, const struct nft_set_ext **ext, + unsigned int seq) +@@ -80,7 +86,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set + continue; + } + +- if (nft_set_elem_expired(&rbe->ext)) ++ if 
(nft_rbtree_elem_expired(rbe)) + return false; + + if (nft_rbtree_interval_end(rbe)) { +@@ -98,7 +104,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set + + if (set->flags & NFT_SET_INTERVAL && interval != NULL && + nft_set_elem_active(&interval->ext, genmask) && +- !nft_set_elem_expired(&interval->ext) && ++ !nft_rbtree_elem_expired(interval) && + nft_rbtree_interval_start(interval)) { + *ext = &interval->ext; + return true; +@@ -215,6 +221,18 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, + return rbe; + } + ++static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set, ++ struct nft_rbtree *priv, ++ struct nft_rbtree_elem *rbe) ++{ ++ struct nft_set_elem elem = { ++ .priv = rbe, ++ }; ++ ++ nft_setelem_data_deactivate(net, set, &elem); ++ rb_erase(&rbe->node, &priv->root); ++} ++ + static int nft_rbtree_gc_elem(const struct nft_set *__set, + struct nft_rbtree *priv, + struct nft_rbtree_elem *rbe, +@@ -222,11 +240,12 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, + { + struct nft_set *set = (struct nft_set *)__set; + struct rb_node *prev = rb_prev(&rbe->node); ++ struct net *net = read_pnet(&set->net); + struct nft_rbtree_elem *rbe_prev; +- struct nft_set_gc_batch *gcb; ++ struct nft_trans_gc *gc; + +- gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); +- if (!gcb) ++ gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC); ++ if (!gc) + return -ENOMEM; + + /* search for end interval coming before this element. +@@ -244,17 +263,28 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, + + if (prev) { + rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); ++ nft_rbtree_gc_remove(net, set, priv, rbe_prev); + +- rb_erase(&rbe_prev->node, &priv->root); +- atomic_dec(&set->nelems); +- nft_set_gc_batch_add(gcb, rbe_prev); ++ /* There is always room in this trans gc for this element, ++ * memory allocation never actually happens, hence, the warning ++ * splat in such case. 
No need to set NFT_SET_ELEM_DEAD_BIT, ++ * this is synchronous gc which never fails. ++ */ ++ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); ++ if (WARN_ON_ONCE(!gc)) ++ return -ENOMEM; ++ ++ nft_trans_gc_elem_add(gc, rbe_prev); + } + +- rb_erase(&rbe->node, &priv->root); +- atomic_dec(&set->nelems); ++ nft_rbtree_gc_remove(net, set, priv, rbe); ++ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); ++ if (WARN_ON_ONCE(!gc)) ++ return -ENOMEM; ++ ++ nft_trans_gc_elem_add(gc, rbe); + +- nft_set_gc_batch_add(gcb, rbe); +- nft_set_gc_batch_complete(gcb); ++ nft_trans_gc_queue_sync_done(gc); + + return 0; + } +@@ -482,7 +512,6 @@ static void nft_rbtree_activate(const struct net *net, + struct nft_rbtree_elem *rbe = elem->priv; + + nft_set_elem_change_active(net, set, &rbe->ext); +- nft_set_elem_clear_busy(&rbe->ext); + } + + static bool nft_rbtree_flush(const struct net *net, +@@ -490,12 +519,9 @@ static bool nft_rbtree_flush(const struct net *net, + { + struct nft_rbtree_elem *rbe = priv; + +- if (!nft_set_elem_mark_busy(&rbe->ext) || +- !nft_is_active(net, &rbe->ext)) { +- nft_set_elem_change_active(net, set, &rbe->ext); +- return true; +- } +- return false; ++ nft_set_elem_change_active(net, set, &rbe->ext); ++ ++ return true; + } + + static void *nft_rbtree_deactivate(const struct net *net, +@@ -570,26 +596,40 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, + + static void nft_rbtree_gc(struct work_struct *work) + { +- struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; +- struct nft_set_gc_batch *gcb = NULL; ++ struct nft_rbtree_elem *rbe, *rbe_end = NULL; ++ struct nftables_pernet *nft_net; + struct nft_rbtree *priv; ++ struct nft_trans_gc *gc; + struct rb_node *node; + struct nft_set *set; ++ unsigned int gc_seq; + struct net *net; +- u8 genmask; + + priv = container_of(work, struct nft_rbtree, gc_work.work); + set = nft_set_container_of(priv); + net = read_pnet(&set->net); +- genmask = nft_genmask_cur(net); ++ nft_net = nft_pernet(net); ++ gc_seq 
= READ_ONCE(nft_net->gc_seq); ++ ++ gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); ++ if (!gc) ++ goto done; + + write_lock_bh(&priv->lock); + write_seqcount_begin(&priv->count); + for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { ++ ++ /* Ruleset has been updated, try later. */ ++ if (READ_ONCE(nft_net->gc_seq) != gc_seq) { ++ nft_trans_gc_destroy(gc); ++ gc = NULL; ++ goto try_later; ++ } ++ + rbe = rb_entry(node, struct nft_rbtree_elem, node); + +- if (!nft_set_elem_active(&rbe->ext, genmask)) +- continue; ++ if (nft_set_elem_is_dead(&rbe->ext)) ++ goto dead_elem; + + /* elements are reversed in the rbtree for historical reasons, + * from highest to lowest value, that is why end element is +@@ -602,46 +642,36 @@ static void nft_rbtree_gc(struct work_struct *work) + if (!nft_set_elem_expired(&rbe->ext)) + continue; + +- if (nft_set_elem_mark_busy(&rbe->ext)) { +- rbe_end = NULL; ++ nft_set_elem_dead(&rbe->ext); ++ ++ if (!rbe_end) + continue; +- } + +- if (rbe_prev) { +- rb_erase(&rbe_prev->node, &priv->root); +- rbe_prev = NULL; +- } +- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); +- if (!gcb) +- break; ++ nft_set_elem_dead(&rbe_end->ext); + +- atomic_dec(&set->nelems); +- nft_set_gc_batch_add(gcb, rbe); +- rbe_prev = rbe; ++ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); ++ if (!gc) ++ goto try_later; + +- if (rbe_end) { +- atomic_dec(&set->nelems); +- nft_set_gc_batch_add(gcb, rbe_end); +- rb_erase(&rbe_end->node, &priv->root); +- rbe_end = NULL; +- } +- node = rb_next(node); +- if (!node) +- break; ++ nft_trans_gc_elem_add(gc, rbe_end); ++ rbe_end = NULL; ++dead_elem: ++ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); ++ if (!gc) ++ goto try_later; ++ ++ nft_trans_gc_elem_add(gc, rbe); + } +- if (rbe_prev) +- rb_erase(&rbe_prev->node, &priv->root); ++ ++ gc = nft_trans_gc_catchall(gc, gc_seq); ++ ++try_later: + write_seqcount_end(&priv->count); + write_unlock_bh(&priv->lock); + +- rbe = nft_set_catchall_gc(set); 
+- if (rbe) { +- gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); +- if (gcb) +- nft_set_gc_batch_add(gcb, rbe); +- } +- nft_set_gc_batch_complete(gcb); +- ++ if (gc) ++ nft_trans_gc_queue_async_done(gc); ++done: + queue_delayed_work(system_power_efficient_wq, &priv->gc_work, + nft_set_gc_interval(set)); + } +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nf_tables-defer-gc-run-if-previous-batch-i.patch b/queue-5.15/netfilter-nf_tables-defer-gc-run-if-previous-batch-i.patch new file mode 100644 index 00000000000..6cdf3c90636 --- /dev/null +++ b/queue-5.15/netfilter-nf_tables-defer-gc-run-if-previous-batch-i.patch @@ -0,0 +1,80 @@ +From 65dc70b68602575231d541279da92bfe4944594c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:43:07 +0200 +Subject: netfilter: nf_tables: defer gc run if previous batch is still pending + +From: Florian Westphal + +commit 8e51830e29e12670b4c10df070a4ea4c9593e961 upstream. + +Don't queue more gc work, else we may queue the same elements multiple +times. + +If an element is flagged as dead, this can mean that either the previous +gc request was invalidated/discarded by a transaction or that the previous +request is still pending in the system work queue. + +The latter will happen if the gc interval is set to a very low value, +e.g. 1ms, and system work queue is backlogged. + +The set's refcount is 1 if no previous gc requests are queued, so add +a helper for this and skip gc run if old requests are pending. + +Add a helper for this and skip the gc run in this case. 
+ +Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") +Signed-off-by: Florian Westphal +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_tables.h | 5 +++++ + net/netfilter/nft_set_hash.c | 3 +++ + net/netfilter/nft_set_rbtree.c | 3 +++ + 3 files changed, 11 insertions(+) + +diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h +index 5a0c854e9dc6e..af703d295f0cd 100644 +--- a/include/net/netfilter/nf_tables.h ++++ b/include/net/netfilter/nf_tables.h +@@ -551,6 +551,11 @@ static inline void *nft_set_priv(const struct nft_set *set) + return (void *)set->data; + } + ++static inline bool nft_set_gc_is_pending(const struct nft_set *s) ++{ ++ return refcount_read(&s->refs) != 1; ++} ++ + static inline struct nft_set *nft_set_container_of(const void *priv) + { + return (void *)priv - offsetof(struct nft_set, data); +diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c +index cef5df8460009..524763659f251 100644 +--- a/net/netfilter/nft_set_hash.c ++++ b/net/netfilter/nft_set_hash.c +@@ -326,6 +326,9 @@ static void nft_rhash_gc(struct work_struct *work) + nft_net = nft_pernet(net); + gc_seq = READ_ONCE(nft_net->gc_seq); + ++ if (nft_set_gc_is_pending(set)) ++ goto done; ++ + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; +diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c +index f9d4c8fcbbf82..c6435e7092319 100644 +--- a/net/netfilter/nft_set_rbtree.c ++++ b/net/netfilter/nft_set_rbtree.c +@@ -611,6 +611,9 @@ static void nft_rbtree_gc(struct work_struct *work) + nft_net = nft_pernet(net); + gc_seq = READ_ONCE(nft_net->gc_seq); + ++ if (nft_set_gc_is_pending(set)) ++ goto done; ++ + gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL); + if (!gc) + goto done; +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nf_tables-don-t-fail-inserts-if-duplicate-.patch b/queue-5.15/netfilter-nf_tables-don-t-fail-inserts-if-duplicate-.patch new 
file mode 100644 index 00000000000..ea3a92b631f --- /dev/null +++ b/queue-5.15/netfilter-nf_tables-don-t-fail-inserts-if-duplicate-.patch @@ -0,0 +1,102 @@ +From 505e27c97151aee7bee432521c557d1225ba39cc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:43:02 +0200 +Subject: netfilter: nf_tables: don't fail inserts if duplicate has expired + +From: Florian Westphal + +commit 7845914f45f066497ac75b30c50dbc735e84e884 upstream. + +nftables selftests fail: +run-tests.sh testcases/sets/0044interval_overlap_0 +Expected: 0-2 . 0-3, got: +W: [FAILED] ./testcases/sets/0044interval_overlap_0: got 1 + +Insertion must ignore duplicate but expired entries. + +Moreover, there is a strange asymmetry in nft_pipapo_activate: + +It refetches the current element, whereas the other ->activate callbacks +(bitmap, hash, rhash, rbtree) use elem->priv. +Same for .remove: other set implementations take elem->priv, +nft_pipapo_remove fetches elem->priv, then does a relookup, +remove this. + +I suspect this was the reason for the change that prompted the +removal of the expired check in pipapo_get() in the first place, +but skipping expired elements there makes no sense to me, this helper +is used for normal get requests, insertions (duplicate check) +and deactivate callback. + +In first two cases expired elements must be skipped. + +For ->deactivate(), this gets called for DELSETELEM, so it +seems to me that expired elements should be skipped as well, i.e. +delete request should fail with -ENOENT error. 
+ +Fixes: 24138933b97b ("netfilter: nf_tables: don't skip expired elements during walk") +Signed-off-by: Florian Westphal +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_set_pipapo.c | 23 ++++------------------- + 1 file changed, 4 insertions(+), 19 deletions(-) + +diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c +index a307a227d28db..58bd514260b90 100644 +--- a/net/netfilter/nft_set_pipapo.c ++++ b/net/netfilter/nft_set_pipapo.c +@@ -566,6 +566,8 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, + goto out; + + if (last) { ++ if (nft_set_elem_expired(&f->mt[b].e->ext)) ++ goto next_match; + if ((genmask && + !nft_set_elem_active(&f->mt[b].e->ext, genmask))) + goto next_match; +@@ -600,17 +602,8 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, + static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) + { +- struct nft_pipapo_elem *ret; +- +- ret = pipapo_get(net, set, (const u8 *)elem->key.val.data, ++ return pipapo_get(net, set, (const u8 *)elem->key.val.data, + nft_genmask_cur(net)); +- if (IS_ERR(ret)) +- return ret; +- +- if (nft_set_elem_expired(&ret->ext)) +- return ERR_PTR(-ENOENT); +- +- return ret; + } + + /** +@@ -1751,11 +1744,7 @@ static void nft_pipapo_activate(const struct net *net, + const struct nft_set *set, + const struct nft_set_elem *elem) + { +- struct nft_pipapo_elem *e; +- +- e = pipapo_get(net, set, (const u8 *)elem->key.val.data, 0); +- if (IS_ERR(e)) +- return; ++ struct nft_pipapo_elem *e = elem->priv; + + nft_set_elem_change_active(net, set, &e->ext); + } +@@ -1969,10 +1958,6 @@ static void nft_pipapo_remove(const struct net *net, const struct nft_set *set, + + data = (const u8 *)nft_set_ext_key(&e->ext); + +- e = pipapo_get(net, set, data, 0); +- if (IS_ERR(e)) +- return; +- + while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { + union nft_pipapo_map_bucket 
rulemap[NFT_PIPAPO_MAX_FIELDS]; + const u8 *match_start, *match_end; +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nf_tables-don-t-skip-expired-elements-duri.patch b/queue-5.15/netfilter-nf_tables-don-t-skip-expired-elements-duri.patch new file mode 100644 index 00000000000..cb217b86084 --- /dev/null +++ b/queue-5.15/netfilter-nf_tables-don-t-skip-expired-elements-duri.patch @@ -0,0 +1,140 @@ +From 328e163ba008e35148100ef92cbedbdb06c817e5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:42:57 +0200 +Subject: netfilter: nf_tables: don't skip expired elements during walk + +From: Florian Westphal + +commit 24138933b97b055d486e8064b4a1721702442a9b upstream. + +There is an asymmetry between commit/abort and preparation phase if the +following conditions are met: + +1. set is a verdict map ("1.2.3.4 : jump foo") +2. timeouts are enabled + +In this case, following sequence is problematic: + +1. element E in set S refers to chain C +2. userspace requests removal of set S +3. kernel does a set walk to decrement chain->use count for all elements + from preparation phase +4. kernel does another set walk to remove elements from the commit phase + (or another walk to do a chain->use increment for all elements from + abort phase) + +If E has already expired in 1), it will be ignored during list walk, so its use count +won't have been changed. + +Then, when set is culled, ->destroy callback will zap the element via +nf_tables_set_elem_destroy(), but this function is only safe for +elements that have been deactivated earlier from the preparation phase: +lack of earlier deactivate removes the element but leaks the chain use +count, which results in a WARN splat when the chain gets removed later, +plus a leak of the nft_chain structure. + +Update pipapo_get() not to skip expired elements, otherwise flush +command reports bogus ENOENT errors. 
+ +Fixes: 3c4287f62044 ("nf_tables: Add set type for arbitrary concatenation of ranges") +Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") +Fixes: 9d0982927e79 ("netfilter: nft_hash: add support for timeouts") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_tables_api.c | 4 ++++ + net/netfilter/nft_set_hash.c | 2 -- + net/netfilter/nft_set_pipapo.c | 18 ++++++++++++------ + net/netfilter/nft_set_rbtree.c | 2 -- + 4 files changed, 16 insertions(+), 10 deletions(-) + +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index d84da11aaee5c..9ac8b83b4a458 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -5289,8 +5289,12 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx, + const struct nft_set_iter *iter, + struct nft_set_elem *elem) + { ++ const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + struct nft_set_dump_args *args; + ++ if (nft_set_elem_expired(ext)) ++ return 0; ++ + args = container_of(iter, struct nft_set_dump_args, iter); + return nf_tables_fill_setelem(args->skb, set, elem); + } +diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c +index 0b73cb0e752f7..24caa31fa2310 100644 +--- a/net/netfilter/nft_set_hash.c ++++ b/net/netfilter/nft_set_hash.c +@@ -278,8 +278,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, + + if (iter->count < iter->skip) + goto cont; +- if (nft_set_elem_expired(&he->ext)) +- goto cont; + if (!nft_set_elem_active(&he->ext, iter->genmask)) + goto cont; + +diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c +index 8c16681884b7e..b6a994ba72f31 100644 +--- a/net/netfilter/nft_set_pipapo.c ++++ b/net/netfilter/nft_set_pipapo.c +@@ -566,8 +566,7 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, + goto out; + + if (last) { +- if (nft_set_elem_expired(&f->mt[b].e->ext) || 
+- (genmask && ++ if ((genmask && + !nft_set_elem_active(&f->mt[b].e->ext, genmask))) + goto next_match; + +@@ -601,8 +600,17 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, + static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, + const struct nft_set_elem *elem, unsigned int flags) + { +- return pipapo_get(net, set, (const u8 *)elem->key.val.data, +- nft_genmask_cur(net)); ++ struct nft_pipapo_elem *ret; ++ ++ ret = pipapo_get(net, set, (const u8 *)elem->key.val.data, ++ nft_genmask_cur(net)); ++ if (IS_ERR(ret)) ++ return ret; ++ ++ if (nft_set_elem_expired(&ret->ext)) ++ return ERR_PTR(-ENOENT); ++ ++ return ret; + } + + /** +@@ -2024,8 +2032,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, + goto cont; + + e = f->mt[r].e; +- if (nft_set_elem_expired(&e->ext)) +- goto cont; + + elem.priv = e; + +diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c +index 8d73fffd2d09d..39956e5341c9e 100644 +--- a/net/netfilter/nft_set_rbtree.c ++++ b/net/netfilter/nft_set_rbtree.c +@@ -552,8 +552,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, + + if (iter->count < iter->skip) + goto cont; +- if (nft_set_elem_expired(&rbe->ext)) +- goto cont; + if (!nft_set_elem_active(&rbe->ext, iter->genmask)) + goto cont; + +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nf_tables-fix-gc-transaction-races-with-ne.patch b/queue-5.15/netfilter-nf_tables-fix-gc-transaction-races-with-ne.patch new file mode 100644 index 00000000000..5c1d72248dc --- /dev/null +++ b/queue-5.15/netfilter-nf_tables-fix-gc-transaction-races-with-ne.patch @@ -0,0 +1,120 @@ +From 4cf119d14e4ae9b1753edfea43e7044b25b19f8f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:43:03 +0200 +Subject: netfilter: nf_tables: fix GC transaction races with netns and netlink + event exit path + +From: Pablo Neira Ayuso + +commit 6a33d8b73dfac0a41f3877894b38082bd0c9a5bc upstream. 
+ +Netlink event path is missing a synchronization point with GC +transactions. Add GC sequence number update to netns release path and +netlink event path, any GC transaction losing race will be discarded. + +Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Florian Westphal +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_tables_api.c | 36 +++++++++++++++++++++++++++++++---- + 1 file changed, 32 insertions(+), 4 deletions(-) + +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 194b78900bd33..aadcb2a5dc816 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -9207,6 +9207,22 @@ static void nft_set_commit_update(struct list_head *set_update_list) + } + } + ++static unsigned int nft_gc_seq_begin(struct nftables_pernet *nft_net) ++{ ++ unsigned int gc_seq; ++ ++ /* Bump gc counter, it becomes odd, this is the busy mark. */ ++ gc_seq = READ_ONCE(nft_net->gc_seq); ++ WRITE_ONCE(nft_net->gc_seq, ++gc_seq); ++ ++ return gc_seq; ++} ++ ++static void nft_gc_seq_end(struct nftables_pernet *nft_net, unsigned int gc_seq) ++{ ++ WRITE_ONCE(nft_net->gc_seq, ++gc_seq); ++} ++ + static int nf_tables_commit(struct net *net, struct sk_buff *skb) + { + struct nftables_pernet *nft_net = nft_pernet(net); +@@ -9292,9 +9308,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) + + WRITE_ONCE(nft_net->base_seq, base_seq); + +- /* Bump gc counter, it becomes odd, this is the busy mark. */ +- gc_seq = READ_ONCE(nft_net->gc_seq); +- WRITE_ONCE(nft_net->gc_seq, ++gc_seq); ++ gc_seq = nft_gc_seq_begin(nft_net); + + /* step 3. Start new generation, rules_gen_X now in use. 
*/ + net->nft.gencursor = nft_gencursor_next(net); +@@ -9485,7 +9499,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) + nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + nf_tables_commit_audit_log(&adl, nft_net->base_seq); + +- WRITE_ONCE(nft_net->gc_seq, ++gc_seq); ++ nft_gc_seq_end(nft_net, gc_seq); + nf_tables_commit_release(net); + + return 0; +@@ -10468,6 +10482,7 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, + struct net *net = n->net; + unsigned int deleted; + bool restart = false; ++ unsigned int gc_seq; + + if (event != NETLINK_URELEASE || n->protocol != NETLINK_NETFILTER) + return NOTIFY_DONE; +@@ -10475,6 +10490,9 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, + nft_net = nft_pernet(net); + deleted = 0; + mutex_lock(&nft_net->commit_mutex); ++ ++ gc_seq = nft_gc_seq_begin(nft_net); ++ + if (!list_empty(&nf_tables_destroy_list)) + nf_tables_trans_destroy_flush_work(); + again: +@@ -10497,6 +10515,8 @@ static int nft_rcv_nl_event(struct notifier_block *this, unsigned long event, + if (restart) + goto again; + } ++ nft_gc_seq_end(nft_net, gc_seq); ++ + mutex_unlock(&nft_net->commit_mutex); + + return NOTIFY_DONE; +@@ -10535,12 +10555,20 @@ static void __net_exit nf_tables_pre_exit_net(struct net *net) + static void __net_exit nf_tables_exit_net(struct net *net) + { + struct nftables_pernet *nft_net = nft_pernet(net); ++ unsigned int gc_seq; + + mutex_lock(&nft_net->commit_mutex); ++ ++ gc_seq = nft_gc_seq_begin(nft_net); ++ + if (!list_empty(&nft_net->commit_list) || + !list_empty(&nft_net->module_list)) + __nf_tables_abort(net, NFNL_ABORT_NONE); ++ + __nft_release_tables(net); ++ ++ nft_gc_seq_end(nft_net, gc_seq); ++ + mutex_unlock(&nft_net->commit_mutex); + WARN_ON_ONCE(!list_empty(&nft_net->tables)); + WARN_ON_ONCE(!list_empty(&nft_net->module_list)); +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nf_tables-fix-memleak-when-more-than-255-e.patch 
b/queue-5.15/netfilter-nf_tables-fix-memleak-when-more-than-255-e.patch new file mode 100644 index 00000000000..40d890bf73a --- /dev/null +++ b/queue-5.15/netfilter-nf_tables-fix-memleak-when-more-than-255-e.patch @@ -0,0 +1,87 @@ +From 9b6e0227303c345ebf3ff842bef6cb7d035fc644 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:43:13 +0200 +Subject: netfilter: nf_tables: fix memleak when more than 255 elements expired + +From: Florian Westphal + +commit cf5000a7787cbc10341091d37245a42c119d26c5 upstream. + +When more than 255 elements expired we're supposed to switch to a new gc +container structure. + +This never happens: u8 type will wrap before reaching the boundary +and nft_trans_gc_space() always returns true. + +This means we recycle the initial gc container structure and +lose track of the elements that came before. + +While at it, don't deref 'gc' after we've passed it to call_rcu. + +Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") +Reported-by: Pablo Neira Ayuso +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_tables.h | 2 +- + net/netfilter/nf_tables_api.c | 10 ++++++++-- + 2 files changed, 9 insertions(+), 3 deletions(-) + +diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h +index d543078c43f95..098e829fd762f 100644 +--- a/include/net/netfilter/nf_tables.h ++++ b/include/net/netfilter/nf_tables.h +@@ -1605,7 +1605,7 @@ struct nft_trans_gc { + struct net *net; + struct nft_set *set; + u32 seq; +- u8 count; ++ u16 count; + void *priv[NFT_TRANS_GC_BATCHCOUNT]; + struct rcu_head rcu; + }; +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 58fec8806ec53..4dadb0eebf614 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -8984,12 +8984,15 @@ static int nft_trans_gc_space(struct nft_trans_gc *trans) + struct nft_trans_gc 
*nft_trans_gc_queue_async(struct nft_trans_gc *gc, + unsigned int gc_seq, gfp_t gfp) + { ++ struct nft_set *set; ++ + if (nft_trans_gc_space(gc)) + return gc; + ++ set = gc->set; + nft_trans_gc_queue_work(gc); + +- return nft_trans_gc_alloc(gc->set, gc_seq, gfp); ++ return nft_trans_gc_alloc(set, gc_seq, gfp); + } + + void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) +@@ -9004,15 +9007,18 @@ void nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) + + struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) + { ++ struct nft_set *set; ++ + if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) + return NULL; + + if (nft_trans_gc_space(gc)) + return gc; + ++ set = gc->set; + call_rcu(&gc->rcu, nft_trans_gc_trans_free); + +- return nft_trans_gc_alloc(gc->set, 0, gfp); ++ return nft_trans_gc_alloc(set, 0, gfp); + } + + void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nf_tables-gc-transaction-api-to-avoid-race.patch b/queue-5.15/netfilter-nf_tables-gc-transaction-api-to-avoid-race.patch new file mode 100644 index 00000000000..d09eb45bb4d --- /dev/null +++ b/queue-5.15/netfilter-nf_tables-gc-transaction-api-to-avoid-race.patch @@ -0,0 +1,574 @@ +From ac432caba18b3c345ca5e76bf70163e034afb732 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:42:58 +0200 +Subject: netfilter: nf_tables: GC transaction API to avoid race with control + plane + +From: Pablo Neira Ayuso + +commit 5f68718b34a531a556f2f50300ead2862278da26 upstream. + +The set types rhashtable and rbtree use a GC worker to reclaim memory. +From system work queue, in periodic intervals, a scan of the table is +done. + +The major caveat here is that the nft transaction mutex is not held. +This causes a race between control plane and GC when they attempt to +delete the same element. 
+ +We cannot grab the netlink mutex from the work queue, because the +control plane has to wait for the GC work queue in case the set is to be +removed, so we get following deadlock: + + cpu 1 cpu2 + GC work transaction comes in , lock nft mutex + `acquire nft mutex // BLOCKS + transaction asks to remove the set + set destruction calls cancel_work_sync() + +cancel_work_sync will now block forever, because it is waiting for the +mutex the caller already owns. + +This patch adds a new API that deals with garbage collection in two +steps: + +1) Lockless GC of expired elements sets on the NFT_SET_ELEM_DEAD_BIT + so they are not visible via lookup. Annotate current GC sequence in + the GC transaction. Enqueue GC transaction work as soon as it is + full. If ruleset is updated, then GC transaction is aborted and + retried later. + +2) GC work grabs the mutex. If GC sequence has changed then this GC + transaction lost race with control plane, abort it as it contains + stale references to objects and let GC try again later. If the + ruleset is intact, then this GC transaction deactivates and removes + the elements and it uses call_rcu() to destroy elements. + +Note that no elements are removed from GC lockless path, the _DEAD bit +is set and pointers are collected. GC catchall does not remove the +elements anymore too. There is a new set->dead flag that is set on to +abort the GC transaction to deal with set->ops->destroy() path which +removes the remaining elements in the set from commit_release, where no +mutex is held. + +To deal with GC when mutex is held, which allows safe deactivate and +removal, add sync GC API which releases the set element object via +call_rcu(). This is used by rbtree and pipapo backends which also +perform garbage collection from control plane path. 
+ +Since element removal from sets can happen from control plane and +element garbage collection/timeout, it is necessary to keep the set +structure alive until all elements have been deactivated and destroyed. + +We cannot do a cancel_work_sync or flush_work in nft_set_destroy because +its called with the transaction mutex held, but the aforementioned async +work queue might be blocked on the very mutex that nft_set_destroy() +callchain is sitting on. + +This gives us the choice of ABBA deadlock or UaF. + +To avoid both, add set->refs refcount_t member. The GC API can then +increment the set refcount and release it once the elements have been +free'd. + +Set backends are adapted to use the GC transaction API in a follow up +patch entitled: + + ("netfilter: nf_tables: use gc transaction API in set backends") + +This is joint work with Florian Westphal. + +Fixes: cfed7e1b1f8e ("netfilter: nf_tables: add set garbage collection helpers") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_tables.h | 64 +++++++- + net/netfilter/nf_tables_api.c | 248 ++++++++++++++++++++++++++++-- + 2 files changed, 300 insertions(+), 12 deletions(-) + +diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h +index 1458b3eae8ada..a930fc2defeee 100644 +--- a/include/net/netfilter/nf_tables.h ++++ b/include/net/netfilter/nf_tables.h +@@ -477,6 +477,7 @@ struct nft_set_elem_expr { + * + * @list: table set list node + * @bindings: list of set bindings ++ * @refs: internal refcounting for async set destruction + * @table: table this set belongs to + * @net: netnamespace this set belongs to + * @name: name of the set +@@ -506,6 +507,7 @@ struct nft_set_elem_expr { + struct nft_set { + struct list_head list; + struct list_head bindings; ++ refcount_t refs; + struct nft_table *table; + possible_net_t net; + char *name; +@@ -527,7 +529,8 @@ struct nft_set { + struct list_head pending_update; + /* runtime data below here */ + 
const struct nft_set_ops *ops ____cacheline_aligned; +- u16 flags:14, ++ u16 flags:13, ++ dead:1, + genmask:2; + u8 klen; + u8 dlen; +@@ -1525,6 +1528,32 @@ static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) + clear_bit(NFT_SET_ELEM_BUSY_BIT, word); + } + ++#define NFT_SET_ELEM_DEAD_MASK (1 << 3) ++ ++#if defined(__LITTLE_ENDIAN_BITFIELD) ++#define NFT_SET_ELEM_DEAD_BIT 3 ++#elif defined(__BIG_ENDIAN_BITFIELD) ++#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3) ++#else ++#error ++#endif ++ ++static inline void nft_set_elem_dead(struct nft_set_ext *ext) ++{ ++ unsigned long *word = (unsigned long *)ext; ++ ++ BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); ++ set_bit(NFT_SET_ELEM_DEAD_BIT, word); ++} ++ ++static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext) ++{ ++ unsigned long *word = (unsigned long *)ext; ++ ++ BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); ++ return test_bit(NFT_SET_ELEM_DEAD_BIT, word); ++} ++ + /** + * struct nft_trans - nf_tables object update in transaction + * +@@ -1656,6 +1685,38 @@ struct nft_trans_flowtable { + #define nft_trans_flowtable_flags(trans) \ + (((struct nft_trans_flowtable *)trans->data)->flags) + ++#define NFT_TRANS_GC_BATCHCOUNT 256 ++ ++struct nft_trans_gc { ++ struct list_head list; ++ struct net *net; ++ struct nft_set *set; ++ u32 seq; ++ u8 count; ++ void *priv[NFT_TRANS_GC_BATCHCOUNT]; ++ struct rcu_head rcu; ++}; ++ ++struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, ++ unsigned int gc_seq, gfp_t gfp); ++void nft_trans_gc_destroy(struct nft_trans_gc *trans); ++ ++struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, ++ unsigned int gc_seq, gfp_t gfp); ++void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc); ++ ++struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp); ++void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); ++ ++void nft_trans_gc_elem_add(struct nft_trans_gc *gc, 
void *priv); ++ ++struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, ++ unsigned int gc_seq); ++ ++void nft_setelem_data_deactivate(const struct net *net, ++ const struct nft_set *set, ++ struct nft_set_elem *elem); ++ + int __init nft_chain_filter_init(void); + void nft_chain_filter_fini(void); + +@@ -1683,6 +1744,7 @@ struct nftables_pernet { + u64 table_handle; + unsigned int base_seq; + u8 validate_state; ++ unsigned int gc_seq; + }; + + extern unsigned int nf_tables_net_id; +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 9ac8b83b4a458..a389240a67f32 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -31,7 +31,9 @@ static LIST_HEAD(nf_tables_expressions); + static LIST_HEAD(nf_tables_objects); + static LIST_HEAD(nf_tables_flowtables); + static LIST_HEAD(nf_tables_destroy_list); ++static LIST_HEAD(nf_tables_gc_list); + static DEFINE_SPINLOCK(nf_tables_destroy_list_lock); ++static DEFINE_SPINLOCK(nf_tables_gc_list_lock); + + enum { + NFT_VALIDATE_SKIP = 0, +@@ -122,6 +124,9 @@ static void nft_validate_state_update(struct net *net, u8 new_validate_state) + static void nf_tables_trans_destroy_work(struct work_struct *w); + static DECLARE_WORK(trans_destroy_work, nf_tables_trans_destroy_work); + ++static void nft_trans_gc_work(struct work_struct *work); ++static DECLARE_WORK(trans_gc_work, nft_trans_gc_work); ++ + static void nft_ctx_init(struct nft_ctx *ctx, + struct net *net, + const struct sk_buff *skb, +@@ -583,10 +588,6 @@ static int nft_trans_set_add(const struct nft_ctx *ctx, int msg_type, + return __nft_trans_set_add(ctx, msg_type, set, NULL); + } + +-static void nft_setelem_data_deactivate(const struct net *net, +- const struct nft_set *set, +- struct nft_set_elem *elem); +- + static int nft_mapelem_deactivate(const struct nft_ctx *ctx, + struct nft_set *set, + const struct nft_set_iter *iter, +@@ -4757,6 +4758,7 @@ static int nf_tables_newset(struct sk_buff *skb, const 
struct nfnl_info *info, + + INIT_LIST_HEAD(&set->bindings); + INIT_LIST_HEAD(&set->catchall_list); ++ refcount_set(&set->refs, 1); + set->table = table; + write_pnet(&set->net, net); + set->ops = ops; +@@ -4824,6 +4826,14 @@ static void nft_set_catchall_destroy(const struct nft_ctx *ctx, + } + } + ++static void nft_set_put(struct nft_set *set) ++{ ++ if (refcount_dec_and_test(&set->refs)) { ++ kfree(set->name); ++ kvfree(set); ++ } ++} ++ + static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) + { + int i; +@@ -4836,8 +4846,7 @@ static void nft_set_destroy(const struct nft_ctx *ctx, struct nft_set *set) + + set->ops->destroy(ctx, set); + nft_set_catchall_destroy(ctx, set); +- kfree(set->name); +- kvfree(set); ++ nft_set_put(set); + } + + static int nf_tables_delset(struct sk_buff *skb, const struct nfnl_info *info, +@@ -5906,7 +5915,8 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, + list_for_each_entry_rcu(catchall, &set->catchall_list, list) { + ext = nft_set_elem_ext(set, catchall->elem); + if (nft_set_elem_active(ext, genmask) && +- !nft_set_elem_expired(ext)) ++ !nft_set_elem_expired(ext) && ++ !nft_set_elem_is_dead(ext)) + return ext; + } + +@@ -6551,9 +6561,9 @@ static void nft_setelem_data_activate(const struct net *net, + nft_use_inc_restore(&(*nft_set_ext_obj(ext))->use); + } + +-static void nft_setelem_data_deactivate(const struct net *net, +- const struct nft_set *set, +- struct nft_set_elem *elem) ++void nft_setelem_data_deactivate(const struct net *net, ++ const struct nft_set *set, ++ struct nft_set_elem *elem) + { + const struct nft_set_ext *ext = nft_set_elem_ext(set, elem->priv); + +@@ -8887,6 +8897,207 @@ void nft_chain_del(struct nft_chain *chain) + list_del_rcu(&chain->list); + } + ++static void nft_trans_gc_setelem_remove(struct nft_ctx *ctx, ++ struct nft_trans_gc *trans) ++{ ++ void **priv = trans->priv; ++ unsigned int i; ++ ++ for (i = 0; i < trans->count; i++) { ++ struct nft_set_elem elem = { 
++ .priv = priv[i], ++ }; ++ ++ nft_setelem_data_deactivate(ctx->net, trans->set, &elem); ++ nft_setelem_remove(ctx->net, trans->set, &elem); ++ } ++} ++ ++void nft_trans_gc_destroy(struct nft_trans_gc *trans) ++{ ++ nft_set_put(trans->set); ++ put_net(trans->net); ++ kfree(trans); ++} ++ ++static void nft_trans_gc_trans_free(struct rcu_head *rcu) ++{ ++ struct nft_set_elem elem = {}; ++ struct nft_trans_gc *trans; ++ struct nft_ctx ctx = {}; ++ unsigned int i; ++ ++ trans = container_of(rcu, struct nft_trans_gc, rcu); ++ ctx.net = read_pnet(&trans->set->net); ++ ++ for (i = 0; i < trans->count; i++) { ++ elem.priv = trans->priv[i]; ++ if (!nft_setelem_is_catchall(trans->set, &elem)) ++ atomic_dec(&trans->set->nelems); ++ ++ nf_tables_set_elem_destroy(&ctx, trans->set, elem.priv); ++ } ++ ++ nft_trans_gc_destroy(trans); ++} ++ ++static bool nft_trans_gc_work_done(struct nft_trans_gc *trans) ++{ ++ struct nftables_pernet *nft_net; ++ struct nft_ctx ctx = {}; ++ ++ nft_net = nft_pernet(trans->net); ++ ++ mutex_lock(&nft_net->commit_mutex); ++ ++ /* Check for race with transaction, otherwise this batch refers to ++ * stale objects that might not be there anymore. Skip transaction if ++ * set has been destroyed from control plane transaction in case gc ++ * worker loses race. 
++ */ ++ if (READ_ONCE(nft_net->gc_seq) != trans->seq || trans->set->dead) { ++ mutex_unlock(&nft_net->commit_mutex); ++ return false; ++ } ++ ++ ctx.net = trans->net; ++ ctx.table = trans->set->table; ++ ++ nft_trans_gc_setelem_remove(&ctx, trans); ++ mutex_unlock(&nft_net->commit_mutex); ++ ++ return true; ++} ++ ++static void nft_trans_gc_work(struct work_struct *work) ++{ ++ struct nft_trans_gc *trans, *next; ++ LIST_HEAD(trans_gc_list); ++ ++ spin_lock(&nf_tables_destroy_list_lock); ++ list_splice_init(&nf_tables_gc_list, &trans_gc_list); ++ spin_unlock(&nf_tables_destroy_list_lock); ++ ++ list_for_each_entry_safe(trans, next, &trans_gc_list, list) { ++ list_del(&trans->list); ++ if (!nft_trans_gc_work_done(trans)) { ++ nft_trans_gc_destroy(trans); ++ continue; ++ } ++ call_rcu(&trans->rcu, nft_trans_gc_trans_free); ++ } ++} ++ ++struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, ++ unsigned int gc_seq, gfp_t gfp) ++{ ++ struct net *net = read_pnet(&set->net); ++ struct nft_trans_gc *trans; ++ ++ trans = kzalloc(sizeof(*trans), gfp); ++ if (!trans) ++ return NULL; ++ ++ refcount_inc(&set->refs); ++ trans->set = set; ++ trans->net = get_net(net); ++ trans->seq = gc_seq; ++ ++ return trans; ++} ++ ++void nft_trans_gc_elem_add(struct nft_trans_gc *trans, void *priv) ++{ ++ trans->priv[trans->count++] = priv; ++} ++ ++static void nft_trans_gc_queue_work(struct nft_trans_gc *trans) ++{ ++ spin_lock(&nf_tables_gc_list_lock); ++ list_add_tail(&trans->list, &nf_tables_gc_list); ++ spin_unlock(&nf_tables_gc_list_lock); ++ ++ schedule_work(&trans_gc_work); ++} ++ ++static int nft_trans_gc_space(struct nft_trans_gc *trans) ++{ ++ return NFT_TRANS_GC_BATCHCOUNT - trans->count; ++} ++ ++struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc, ++ unsigned int gc_seq, gfp_t gfp) ++{ ++ if (nft_trans_gc_space(gc)) ++ return gc; ++ ++ nft_trans_gc_queue_work(gc); ++ ++ return nft_trans_gc_alloc(gc->set, gc_seq, gfp); ++} ++ ++void 
nft_trans_gc_queue_async_done(struct nft_trans_gc *trans) ++{ ++ if (trans->count == 0) { ++ nft_trans_gc_destroy(trans); ++ return; ++ } ++ ++ nft_trans_gc_queue_work(trans); ++} ++ ++struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp) ++{ ++ if (WARN_ON_ONCE(!lockdep_commit_lock_is_held(gc->net))) ++ return NULL; ++ ++ if (nft_trans_gc_space(gc)) ++ return gc; ++ ++ call_rcu(&gc->rcu, nft_trans_gc_trans_free); ++ ++ return nft_trans_gc_alloc(gc->set, 0, gfp); ++} ++ ++void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) ++{ ++ WARN_ON_ONCE(!lockdep_commit_lock_is_held(trans->net)); ++ ++ if (trans->count == 0) { ++ nft_trans_gc_destroy(trans); ++ return; ++ } ++ ++ call_rcu(&trans->rcu, nft_trans_gc_trans_free); ++} ++ ++struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, ++ unsigned int gc_seq) ++{ ++ struct nft_set_elem_catchall *catchall; ++ const struct nft_set *set = gc->set; ++ struct nft_set_ext *ext; ++ ++ list_for_each_entry_rcu(catchall, &set->catchall_list, list) { ++ ext = nft_set_elem_ext(set, catchall->elem); ++ ++ if (!nft_set_elem_expired(ext)) ++ continue; ++ if (nft_set_elem_is_dead(ext)) ++ goto dead_elem; ++ ++ nft_set_elem_dead(ext); ++dead_elem: ++ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); ++ if (!gc) ++ return NULL; ++ ++ nft_trans_gc_elem_add(gc, catchall->elem); ++ } ++ ++ return gc; ++} ++ + static void nf_tables_module_autoload_cleanup(struct net *net) + { + struct nftables_pernet *nft_net = nft_pernet(net); +@@ -9049,11 +9260,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) + { + struct nftables_pernet *nft_net = nft_pernet(net); + struct nft_trans *trans, *next; ++ unsigned int base_seq, gc_seq; + LIST_HEAD(set_update_list); + struct nft_trans_elem *te; + struct nft_chain *chain; + struct nft_table *table; +- unsigned int base_seq; + LIST_HEAD(adl); + int err; + +@@ -9130,6 +9341,10 @@ static int nf_tables_commit(struct net *net, struct sk_buff 
*skb) + + WRITE_ONCE(nft_net->base_seq, base_seq); + ++ /* Bump gc counter, it becomes odd, this is the busy mark. */ ++ gc_seq = READ_ONCE(nft_net->gc_seq); ++ WRITE_ONCE(nft_net->gc_seq, ++gc_seq); ++ + /* step 3. Start new generation, rules_gen_X now in use. */ + net->nft.gencursor = nft_gencursor_next(net); + +@@ -9218,6 +9433,7 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) + nft_trans_destroy(trans); + break; + case NFT_MSG_DELSET: ++ nft_trans_set(trans)->dead = 1; + list_del_rcu(&nft_trans_set(trans)->list); + nf_tables_set_notify(&trans->ctx, nft_trans_set(trans), + NFT_MSG_DELSET, GFP_KERNEL); +@@ -9317,6 +9533,8 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) + nft_commit_notify(net, NETLINK_CB(skb).portid); + nf_tables_gen_notify(net, skb, NFT_MSG_NEWGEN); + nf_tables_commit_audit_log(&adl, nft_net->base_seq); ++ ++ WRITE_ONCE(nft_net->gc_seq, ++gc_seq); + nf_tables_commit_release(net); + + return 0; +@@ -10349,6 +10567,7 @@ static int __net_init nf_tables_init_net(struct net *net) + mutex_init(&nft_net->commit_mutex); + nft_net->base_seq = 1; + nft_net->validate_state = NFT_VALIDATE_SKIP; ++ nft_net->gc_seq = 0; + + return 0; + } +@@ -10377,10 +10596,16 @@ static void __net_exit nf_tables_exit_net(struct net *net) + WARN_ON_ONCE(!list_empty(&nft_net->notify_list)); + } + ++static void nf_tables_exit_batch(struct list_head *net_exit_list) ++{ ++ flush_work(&trans_gc_work); ++} ++ + static struct pernet_operations nf_tables_net_ops = { + .init = nf_tables_init_net, + .pre_exit = nf_tables_pre_exit_net, + .exit = nf_tables_exit_net, ++ .exit_batch = nf_tables_exit_batch, + .id = &nf_tables_net_id, + .size = sizeof(struct nftables_pernet), + }; +@@ -10452,6 +10677,7 @@ static void __exit nf_tables_module_exit(void) + nft_chain_filter_fini(); + nft_chain_route_fini(); + unregister_pernet_subsys(&nf_tables_net_ops); ++ cancel_work_sync(&trans_gc_work); + cancel_work_sync(&trans_destroy_work); + rcu_barrier(); + 
rhltable_destroy(&nft_objname_ht); +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nf_tables-gc-transaction-race-with-abort-p.patch b/queue-5.15/netfilter-nf_tables-gc-transaction-race-with-abort-p.patch new file mode 100644 index 00000000000..063f0c98596 --- /dev/null +++ b/queue-5.15/netfilter-nf_tables-gc-transaction-race-with-abort-p.patch @@ -0,0 +1,41 @@ +From 6ff0a0528be01f1ae9ab3b19a2344fb039d4acad Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:43:05 +0200 +Subject: netfilter: nf_tables: GC transaction race with abort path + +From: Pablo Neira Ayuso + +commit 720344340fb9be2765bbaab7b292ece0a4570eae upstream. + +Abort path is missing a synchronization point with GC transactions. Add +GC sequence number hence any GC transaction losing race will be +discarded. + +Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_tables_api.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index a2543db74cf65..c9a3a692879a8 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -9763,7 +9763,12 @@ static int nf_tables_abort(struct net *net, struct sk_buff *skb, + enum nfnl_abort_action action) + { + struct nftables_pernet *nft_net = nft_pernet(net); +- int ret = __nf_tables_abort(net, action); ++ unsigned int gc_seq; ++ int ret; ++ ++ gc_seq = nft_gc_seq_begin(nft_net); ++ ret = __nf_tables_abort(net, action); ++ nft_gc_seq_end(nft_net, gc_seq); + + mutex_unlock(&nft_net->commit_mutex); + +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nf_tables-gc-transaction-race-with-netns-d.patch b/queue-5.15/netfilter-nf_tables-gc-transaction-race-with-netns-d.patch new file mode 100644 index 00000000000..0dc796a47c7 --- /dev/null +++ b/queue-5.15/netfilter-nf_tables-gc-transaction-race-with-netns-d.patch @@ -0,0 
+1,42 @@ +From 708b0a770a9066e3ae7ed5e39ece306929a52dd5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:43:04 +0200 +Subject: netfilter: nf_tables: GC transaction race with netns dismantle + +From: Pablo Neira Ayuso + +commit 02c6c24402bf1c1e986899c14ba22a10b510916b upstream. + +Use maybe_get_net() since GC workqueue might race with netns exit path. + +Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Florian Westphal +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_tables_api.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index aadcb2a5dc816..a2543db74cf65 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -8949,9 +8949,14 @@ struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set, + if (!trans) + return NULL; + ++ trans->net = maybe_get_net(net); ++ if (!trans->net) { ++ kfree(trans); ++ return NULL; ++ } ++ + refcount_inc(&set->refs); + trans->set = set; +- trans->net = get_net(net); + trans->seq = gc_seq; + + return trans; +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nf_tables-remove-busy-mark-and-gc-batch-ap.patch b/queue-5.15/netfilter-nf_tables-remove-busy-mark-and-gc-batch-ap.patch new file mode 100644 index 00000000000..4583804d24d --- /dev/null +++ b/queue-5.15/netfilter-nf_tables-remove-busy-mark-and-gc-batch-ap.patch @@ -0,0 +1,221 @@ +From 7c2b7b29ae7e1c0779b68fa310890b02f7d7f446 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:43:01 +0200 +Subject: netfilter: nf_tables: remove busy mark and gc batch API + +From: Pablo Neira Ayuso + +commit a2dd0233cbc4d8a0abb5f64487487ffc9265beb5 upstream. + +Ditch it, it has been replaced by the GC transaction API and it has no +clients anymore. 
+ +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_tables.h | 98 +------------------------------ + net/netfilter/nf_tables_api.c | 48 +-------------- + 2 files changed, 4 insertions(+), 142 deletions(-) + +diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h +index a930fc2defeee..5a0c854e9dc6e 100644 +--- a/include/net/netfilter/nf_tables.h ++++ b/include/net/netfilter/nf_tables.h +@@ -564,7 +564,6 @@ struct nft_set *nft_set_lookup_global(const struct net *net, + + struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, + const struct nft_set *set); +-void *nft_set_catchall_gc(const struct nft_set *set); + + static inline unsigned long nft_set_gc_interval(const struct nft_set *set) + { +@@ -779,62 +778,6 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, + void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, + const struct nft_set *set, void *elem); + +-/** +- * struct nft_set_gc_batch_head - nf_tables set garbage collection batch +- * +- * @rcu: rcu head +- * @set: set the elements belong to +- * @cnt: count of elements +- */ +-struct nft_set_gc_batch_head { +- struct rcu_head rcu; +- const struct nft_set *set; +- unsigned int cnt; +-}; +- +-#define NFT_SET_GC_BATCH_SIZE ((PAGE_SIZE - \ +- sizeof(struct nft_set_gc_batch_head)) / \ +- sizeof(void *)) +- +-/** +- * struct nft_set_gc_batch - nf_tables set garbage collection batch +- * +- * @head: GC batch head +- * @elems: garbage collection elements +- */ +-struct nft_set_gc_batch { +- struct nft_set_gc_batch_head head; +- void *elems[NFT_SET_GC_BATCH_SIZE]; +-}; +- +-struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, +- gfp_t gfp); +-void nft_set_gc_batch_release(struct rcu_head *rcu); +- +-static inline void nft_set_gc_batch_complete(struct nft_set_gc_batch *gcb) +-{ +- if (gcb != NULL) +- call_rcu(&gcb->head.rcu, nft_set_gc_batch_release); +-} +- +-static inline struct 
nft_set_gc_batch * +-nft_set_gc_batch_check(const struct nft_set *set, struct nft_set_gc_batch *gcb, +- gfp_t gfp) +-{ +- if (gcb != NULL) { +- if (gcb->head.cnt + 1 < ARRAY_SIZE(gcb->elems)) +- return gcb; +- nft_set_gc_batch_complete(gcb); +- } +- return nft_set_gc_batch_alloc(set, gfp); +-} +- +-static inline void nft_set_gc_batch_add(struct nft_set_gc_batch *gcb, +- void *elem) +-{ +- gcb->elems[gcb->head.cnt++] = elem; +-} +- + struct nft_expr_ops; + /** + * struct nft_expr_type - nf_tables expression type +@@ -1493,47 +1436,12 @@ static inline void nft_set_elem_change_active(const struct net *net, + + #endif /* IS_ENABLED(CONFIG_NF_TABLES) */ + +-/* +- * We use a free bit in the genmask field to indicate the element +- * is busy, meaning it is currently being processed either by +- * the netlink API or GC. +- * +- * Even though the genmask is only a single byte wide, this works +- * because the extension structure if fully constant once initialized, +- * so there are no non-atomic write accesses unless it is already +- * marked busy. 
+- */ +-#define NFT_SET_ELEM_BUSY_MASK (1 << 2) +- +-#if defined(__LITTLE_ENDIAN_BITFIELD) +-#define NFT_SET_ELEM_BUSY_BIT 2 +-#elif defined(__BIG_ENDIAN_BITFIELD) +-#define NFT_SET_ELEM_BUSY_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) +-#else +-#error +-#endif +- +-static inline int nft_set_elem_mark_busy(struct nft_set_ext *ext) +-{ +- unsigned long *word = (unsigned long *)ext; +- +- BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); +- return test_and_set_bit(NFT_SET_ELEM_BUSY_BIT, word); +-} +- +-static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) +-{ +- unsigned long *word = (unsigned long *)ext; +- +- clear_bit(NFT_SET_ELEM_BUSY_BIT, word); +-} +- +-#define NFT_SET_ELEM_DEAD_MASK (1 << 3) ++#define NFT_SET_ELEM_DEAD_MASK (1 << 2) + + #if defined(__LITTLE_ENDIAN_BITFIELD) +-#define NFT_SET_ELEM_DEAD_BIT 3 ++#define NFT_SET_ELEM_DEAD_BIT 2 + #elif defined(__BIG_ENDIAN_BITFIELD) +-#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 3) ++#define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) + #else + #error + #endif +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index 3749cbd7c1fdd..194b78900bd33 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -5924,29 +5924,6 @@ struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, + } + EXPORT_SYMBOL_GPL(nft_set_catchall_lookup); + +-void *nft_set_catchall_gc(const struct nft_set *set) +-{ +- struct nft_set_elem_catchall *catchall, *next; +- struct nft_set_ext *ext; +- void *elem = NULL; +- +- list_for_each_entry_safe(catchall, next, &set->catchall_list, list) { +- ext = nft_set_elem_ext(set, catchall->elem); +- +- if (!nft_set_elem_expired(ext) || +- nft_set_elem_mark_busy(ext)) +- continue; +- +- elem = catchall->elem; +- list_del_rcu(&catchall->list); +- kfree_rcu(catchall, rcu); +- break; +- } +- +- return elem; +-} +-EXPORT_SYMBOL_GPL(nft_set_catchall_gc); +- + static int 
nft_setelem_catchall_insert(const struct net *net, + struct nft_set *set, + const struct nft_set_elem *elem, +@@ -6411,7 +6388,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, + goto err_elem_expr; + } + +- ext->genmask = nft_genmask_cur(ctx->net) | NFT_SET_ELEM_BUSY_MASK; ++ ext->genmask = nft_genmask_cur(ctx->net); + + err = nft_setelem_insert(ctx->net, set, &elem, &ext2, flags); + if (err) { +@@ -6792,29 +6769,6 @@ static int nf_tables_delsetelem(struct sk_buff *skb, + return err; + } + +-void nft_set_gc_batch_release(struct rcu_head *rcu) +-{ +- struct nft_set_gc_batch *gcb; +- unsigned int i; +- +- gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu); +- for (i = 0; i < gcb->head.cnt; i++) +- nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true); +- kfree(gcb); +-} +- +-struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set, +- gfp_t gfp) +-{ +- struct nft_set_gc_batch *gcb; +- +- gcb = kzalloc(sizeof(*gcb), gfp); +- if (gcb == NULL) +- return gcb; +- gcb->head.set = set; +- return gcb; +-} +- + /* + * Stateful objects + */ +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nf_tables-use-correct-lock-to-protect-gc_l.patch b/queue-5.15/netfilter-nf_tables-use-correct-lock-to-protect-gc_l.patch new file mode 100644 index 00000000000..c84862a07be --- /dev/null +++ b/queue-5.15/netfilter-nf_tables-use-correct-lock-to-protect-gc_l.patch @@ -0,0 +1,38 @@ +From 38dcbe308e7f7bf0b78786f1189062e5ed9ec2e3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:43:06 +0200 +Subject: netfilter: nf_tables: use correct lock to protect gc_list + +From: Pablo Neira Ayuso + +commit 8357bc946a2abc2a10ca40e5a2105d2b4c57515e upstream. + +Use nf_tables_gc_list_lock spinlock, not nf_tables_destroy_list_lock to +protect the gc_list. 
+ +Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_tables_api.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index c9a3a692879a8..ac266dc2c31b7 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -8925,9 +8925,9 @@ static void nft_trans_gc_work(struct work_struct *work) + struct nft_trans_gc *trans, *next; + LIST_HEAD(trans_gc_list); + +- spin_lock(&nf_tables_destroy_list_lock); ++ spin_lock(&nf_tables_gc_list_lock); + list_splice_init(&nf_tables_gc_list, &trans_gc_list); +- spin_unlock(&nf_tables_destroy_list_lock); ++ spin_unlock(&nf_tables_gc_list_lock); + + list_for_each_entry_safe(trans, next, &trans_gc_list, list) { + list_del(&trans->list); +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nft_set_hash-mark-set-element-as-dead-when.patch b/queue-5.15/netfilter-nft_set_hash-mark-set-element-as-dead-when.patch new file mode 100644 index 00000000000..59684cb2b19 --- /dev/null +++ b/queue-5.15/netfilter-nft_set_hash-mark-set-element-as-dead-when.patch @@ -0,0 +1,49 @@ +From bd38b4378333ed586fc78a347e721e9e1a5c0dc7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:43:00 +0200 +Subject: netfilter: nft_set_hash: mark set element as dead when deleting from + packet path + +From: Pablo Neira Ayuso + +upstream c92db3030492b8ad1d0faace7a93bbcf53850d0c commit. + +Set on the NFT_SET_ELEM_DEAD_BIT flag on this element, instead of +performing element removal which might race with an ongoing transaction. +Enable gc when dynamic flag is set on since dynset deletion requires +garbage collection after this patch. 
+ +Fixes: d0a8d877da97 ("netfilter: nft_dynset: support for element deletion") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_set_hash.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c +index 2f067e4596b02..cef5df8460009 100644 +--- a/net/netfilter/nft_set_hash.c ++++ b/net/netfilter/nft_set_hash.c +@@ -249,7 +249,9 @@ static bool nft_rhash_delete(const struct nft_set *set, + if (he == NULL) + return false; + +- return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0; ++ nft_set_elem_dead(&he->ext); ++ ++ return true; + } + + static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, +@@ -412,7 +414,7 @@ static int nft_rhash_init(const struct nft_set *set, + return err; + + INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc); +- if (set->flags & NFT_SET_TIMEOUT) ++ if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL)) + nft_rhash_gc_init(set); + + return 0; +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nft_set_hash-try-later-when-gc-hits-eagain.patch b/queue-5.15/netfilter-nft_set_hash-try-later-when-gc-hits-eagain.patch new file mode 100644 index 00000000000..6ee396f0a31 --- /dev/null +++ b/queue-5.15/netfilter-nft_set_hash-try-later-when-gc-hits-eagain.patch @@ -0,0 +1,42 @@ +From 2fff4730fbfc65bfe5ca40653dd4691d5bda9d26 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:43:12 +0200 +Subject: netfilter: nft_set_hash: try later when GC hits EAGAIN on iteration + +From: Pablo Neira Ayuso + +commit b079155faae94e9b3ab9337e82100a914ebb4e8d upstream. + +Skip GC run if iterator rewinds to the beginning with EAGAIN, otherwise GC +might collect the same element more than once. 
+ +Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_set_hash.c | 9 +++------ + 1 file changed, 3 insertions(+), 6 deletions(-) + +diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c +index eca20dc601384..2013de934cef0 100644 +--- a/net/netfilter/nft_set_hash.c ++++ b/net/netfilter/nft_set_hash.c +@@ -338,12 +338,9 @@ static void nft_rhash_gc(struct work_struct *work) + + while ((he = rhashtable_walk_next(&hti))) { + if (IS_ERR(he)) { +- if (PTR_ERR(he) != -EAGAIN) { +- nft_trans_gc_destroy(gc); +- gc = NULL; +- goto try_later; +- } +- continue; ++ nft_trans_gc_destroy(gc); ++ gc = NULL; ++ goto try_later; + } + + /* Ruleset has been updated, try later. */ +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nft_set_pipapo-call-nft_trans_gc_queue_syn.patch b/queue-5.15/netfilter-nft_set_pipapo-call-nft_trans_gc_queue_syn.patch new file mode 100644 index 00000000000..0bf3dd2f9e0 --- /dev/null +++ b/queue-5.15/netfilter-nft_set_pipapo-call-nft_trans_gc_queue_syn.patch @@ -0,0 +1,132 @@ +From 22adbf1965df147f474c28497c3357da1c0ac061 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:43:10 +0200 +Subject: netfilter: nft_set_pipapo: call nft_trans_gc_queue_sync() in catchall + GC + +From: Pablo Neira Ayuso + +commit 4a9e12ea7e70223555ec010bec9f711089ce96f6 upstream. + +pipapo needs to enqueue GC transactions for catchall elements through +nft_trans_gc_queue_sync(). Add nft_trans_gc_catchall_sync() and +nft_trans_gc_catchall_async() to handle GC transaction queueing +accordingly. 
+ +Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") +Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_tables.h | 5 +++-- + net/netfilter/nf_tables_api.c | 22 +++++++++++++++++++--- + net/netfilter/nft_set_hash.c | 2 +- + net/netfilter/nft_set_pipapo.c | 2 +- + net/netfilter/nft_set_rbtree.c | 2 +- + 5 files changed, 25 insertions(+), 8 deletions(-) + +diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h +index af703d295f0cd..d543078c43f95 100644 +--- a/include/net/netfilter/nf_tables.h ++++ b/include/net/netfilter/nf_tables.h +@@ -1623,8 +1623,9 @@ void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans); + + void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv); + +-struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, +- unsigned int gc_seq); ++struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, ++ unsigned int gc_seq); ++struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc); + + void nft_setelem_data_deactivate(const struct net *net, + const struct nft_set *set, +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index ac266dc2c31b7..58fec8806ec53 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -9027,8 +9027,9 @@ void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans) + call_rcu(&trans->rcu, nft_trans_gc_trans_free); + } + +-struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, +- unsigned int gc_seq) ++static struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, ++ unsigned int gc_seq, ++ bool sync) + { + struct nft_set_elem_catchall *catchall; + const struct nft_set *set = gc->set; +@@ -9044,7 +9045,11 @@ struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, + + 
nft_set_elem_dead(ext); + dead_elem: +- gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); ++ if (sync) ++ gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); ++ else ++ gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC); ++ + if (!gc) + return NULL; + +@@ -9054,6 +9059,17 @@ struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc, + return gc; + } + ++struct nft_trans_gc *nft_trans_gc_catchall_async(struct nft_trans_gc *gc, ++ unsigned int gc_seq) ++{ ++ return nft_trans_gc_catchall(gc, gc_seq, false); ++} ++ ++struct nft_trans_gc *nft_trans_gc_catchall_sync(struct nft_trans_gc *gc) ++{ ++ return nft_trans_gc_catchall(gc, 0, true); ++} ++ + static void nf_tables_module_autoload_cleanup(struct net *net) + { + struct nftables_pernet *nft_net = nft_pernet(net); +diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c +index 524763659f251..eca20dc601384 100644 +--- a/net/netfilter/nft_set_hash.c ++++ b/net/netfilter/nft_set_hash.c +@@ -372,7 +372,7 @@ static void nft_rhash_gc(struct work_struct *work) + nft_trans_gc_elem_add(gc, he); + } + +- gc = nft_trans_gc_catchall(gc, gc_seq); ++ gc = nft_trans_gc_catchall_async(gc, gc_seq); + + try_later: + /* catchall list iteration requires rcu read side lock. 
*/ +diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c +index 58bd514260b90..7248a1737ee14 100644 +--- a/net/netfilter/nft_set_pipapo.c ++++ b/net/netfilter/nft_set_pipapo.c +@@ -1611,7 +1611,7 @@ static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) + } + } + +- gc = nft_trans_gc_catchall(gc, 0); ++ gc = nft_trans_gc_catchall_sync(gc); + if (gc) { + nft_trans_gc_queue_sync_done(gc); + priv->last_gc = jiffies; +diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c +index 70491ba98decb..487572dcd6144 100644 +--- a/net/netfilter/nft_set_rbtree.c ++++ b/net/netfilter/nft_set_rbtree.c +@@ -669,7 +669,7 @@ static void nft_rbtree_gc(struct work_struct *work) + nft_trans_gc_elem_add(gc, rbe); + } + +- gc = nft_trans_gc_catchall(gc, gc_seq); ++ gc = nft_trans_gc_catchall_async(gc, gc_seq); + + try_later: + read_unlock_bh(&priv->lock); +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nft_set_pipapo-stop-gc-iteration-if-gc-tra.patch b/queue-5.15/netfilter-nft_set_pipapo-stop-gc-iteration-if-gc-tra.patch new file mode 100644 index 00000000000..f3bd33a6e82 --- /dev/null +++ b/queue-5.15/netfilter-nft_set_pipapo-stop-gc-iteration-if-gc-tra.patch @@ -0,0 +1,37 @@ +From 30ace644350cdf0ef19e6a2364d13838df648f9c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:43:11 +0200 +Subject: netfilter: nft_set_pipapo: stop GC iteration if GC transaction + allocation fails + +From: Pablo Neira Ayuso + +commit 6d365eabce3c018a80f6e0379b17df2abb17405e upstream. + +nft_trans_gc_queue_sync() enqueues the GC transaction and it allocates a +new one. If this allocation fails, then stop this GC sync run and retry +later. 
+ +Fixes: 5f68718b34a5 ("netfilter: nf_tables: GC transaction API to avoid race with control plane") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_set_pipapo.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/netfilter/nft_set_pipapo.c b/net/netfilter/nft_set_pipapo.c +index 7248a1737ee14..83f5f276c3bff 100644 +--- a/net/netfilter/nft_set_pipapo.c ++++ b/net/netfilter/nft_set_pipapo.c +@@ -1597,7 +1597,7 @@ static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m) + + gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC); + if (!gc) +- break; ++ return; + + nft_pipapo_gc_deactivate(net, set, e); + pipapo_drop(m, rulemap); +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nft_set_rbtree-skip-sync-gc-for-new-elemen.patch b/queue-5.15/netfilter-nft_set_rbtree-skip-sync-gc-for-new-elemen.patch new file mode 100644 index 00000000000..dde9151ba88 --- /dev/null +++ b/queue-5.15/netfilter-nft_set_rbtree-skip-sync-gc-for-new-elemen.patch @@ -0,0 +1,53 @@ +From 35857a51b17b99668c12d5488a1794e0930025fb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:43:08 +0200 +Subject: netfilter: nft_set_rbtree: skip sync GC for new elements in this + transaction + +From: Pablo Neira Ayuso + +commit 2ee52ae94baabf7ee09cf2a8d854b990dac5d0e4 upstream. + +New elements in this transaction might expire before the transaction +ends. Skip sync GC for such elements, otherwise the commit path might walk +over an already released object. Once the transaction is finished, async GC +will collect such expired elements. 
+ +Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_set_rbtree.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c +index c6435e7092319..f250b5399344a 100644 +--- a/net/netfilter/nft_set_rbtree.c ++++ b/net/netfilter/nft_set_rbtree.c +@@ -312,6 +312,7 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, + struct nft_rbtree_elem *rbe, *rbe_le = NULL, *rbe_ge = NULL; + struct rb_node *node, *next, *parent, **p, *first = NULL; + struct nft_rbtree *priv = nft_set_priv(set); ++ u8 cur_genmask = nft_genmask_cur(net); + u8 genmask = nft_genmask_next(net); + int d, err; + +@@ -357,8 +358,11 @@ static int __nft_rbtree_insert(const struct net *net, const struct nft_set *set, + if (!nft_set_elem_active(&rbe->ext, genmask)) + continue; + +- /* perform garbage collection to avoid bogus overlap reports. */ +- if (nft_set_elem_expired(&rbe->ext)) { ++ /* perform garbage collection to avoid bogus overlap reports ++ * but skip new elements in this transaction. 
++ */ ++ if (nft_set_elem_expired(&rbe->ext) && ++ nft_set_elem_active(&rbe->ext, cur_genmask)) { + err = nft_rbtree_gc_elem(set, priv, rbe, genmask); + if (err < 0) + return err; +-- +2.40.1 + diff --git a/queue-5.15/netfilter-nft_set_rbtree-use-read-spinlock-to-avoid-.patch b/queue-5.15/netfilter-nft_set_rbtree-use-read-spinlock-to-avoid-.patch new file mode 100644 index 00000000000..5918697c625 --- /dev/null +++ b/queue-5.15/netfilter-nft_set_rbtree-use-read-spinlock-to-avoid-.patch @@ -0,0 +1,48 @@ +From bceef0ad91672e1e05aad6926978a3e9e548d262 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 22 Sep 2023 18:43:09 +0200 +Subject: netfilter: nft_set_rbtree: use read spinlock to avoid datapath + contention + +From: Pablo Neira Ayuso + +commit 96b33300fba880ec0eafcf3d82486f3463b4b6da upstream. + +rbtree GC does not modify the datastructure, instead it collects expired +elements and it enqueues a GC transaction. Use a read spinlock instead +to avoid data contention while GC worker is running. + +Fixes: f6c383b8c31a ("netfilter: nf_tables: adapt set backend to use GC transaction API") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_set_rbtree.c | 6 ++---- + 1 file changed, 2 insertions(+), 4 deletions(-) + +diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c +index f250b5399344a..70491ba98decb 100644 +--- a/net/netfilter/nft_set_rbtree.c ++++ b/net/netfilter/nft_set_rbtree.c +@@ -622,8 +622,7 @@ static void nft_rbtree_gc(struct work_struct *work) + if (!gc) + goto done; + +- write_lock_bh(&priv->lock); +- write_seqcount_begin(&priv->count); ++ read_lock_bh(&priv->lock); + for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { + + /* Ruleset has been updated, try later. 
*/ +@@ -673,8 +672,7 @@ static void nft_rbtree_gc(struct work_struct *work) + gc = nft_trans_gc_catchall(gc, gc_seq); + + try_later: +- write_seqcount_end(&priv->count); +- write_unlock_bh(&priv->lock); ++ read_unlock_bh(&priv->lock); + + if (gc) + nft_trans_gc_queue_async_done(gc); +-- +2.40.1 + diff --git a/queue-5.15/series b/queue-5.15/series index 86755ffe71b..b6356061f21 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -14,3 +14,20 @@ ext4-replace-the-traditional-ternary-conditional-ope.patch ext4-move-setting-of-trimmed-bit-into-ext4_try_to_tr.patch ext4-do-not-let-fstrim-block-system-suspend.patch tracing-have-event-inject-files-inc-the-trace-array-.patch +netfilter-nf_tables-don-t-skip-expired-elements-duri.patch +netfilter-nf_tables-gc-transaction-api-to-avoid-race.patch +netfilter-nf_tables-adapt-set-backend-to-use-gc-tran.patch +netfilter-nft_set_hash-mark-set-element-as-dead-when.patch +netfilter-nf_tables-remove-busy-mark-and-gc-batch-ap.patch +netfilter-nf_tables-don-t-fail-inserts-if-duplicate-.patch +netfilter-nf_tables-fix-gc-transaction-races-with-ne.patch +netfilter-nf_tables-gc-transaction-race-with-netns-d.patch +netfilter-nf_tables-gc-transaction-race-with-abort-p.patch +netfilter-nf_tables-use-correct-lock-to-protect-gc_l.patch +netfilter-nf_tables-defer-gc-run-if-previous-batch-i.patch +netfilter-nft_set_rbtree-skip-sync-gc-for-new-elemen.patch +netfilter-nft_set_rbtree-use-read-spinlock-to-avoid-.patch +netfilter-nft_set_pipapo-call-nft_trans_gc_queue_syn.patch +netfilter-nft_set_pipapo-stop-gc-iteration-if-gc-tra.patch +netfilter-nft_set_hash-try-later-when-gc-hits-eagain.patch +netfilter-nf_tables-fix-memleak-when-more-than-255-e.patch