From: Greg Kroah-Hartman Date: Fri, 18 Jan 2019 08:00:10 +0000 (+0100) Subject: 4.20-stable patches X-Git-Tag: v4.20.4~54 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=b8d91c41e683795f36dc96e2b085c86a2ebdf826;p=thirdparty%2Fkernel%2Fstable-queue.git 4.20-stable patches added patches: netfilter-nf_conncount-don-t-skip-eviction-when-age-is-negative.patch netfilter-nf_conncount-fix-argument-order-to-find_next_bit.patch netfilter-nf_conncount-merge-lookup-and-add-functions.patch netfilter-nf_conncount-move-all-list-iterations-under-spinlock.patch netfilter-nf_conncount-replace-conncount_lock_slots-with-conncount_slots.patch netfilter-nf_conncount-restart-search-when-nodes-have-been-erased.patch netfilter-nf_conncount-speculative-garbage-collection-on-empty-lists.patch netfilter-nf_conncount-split-gc-in-two-phases.patch --- diff --git a/queue-4.20/netfilter-nf_conncount-don-t-skip-eviction-when-age-is-negative.patch b/queue-4.20/netfilter-nf_conncount-don-t-skip-eviction-when-age-is-negative.patch new file mode 100644 index 00000000000..80403e5372d --- /dev/null +++ b/queue-4.20/netfilter-nf_conncount-don-t-skip-eviction-when-age-is-negative.patch @@ -0,0 +1,35 @@ +From 4cd273bb91b3001f623f516ec726c49754571b1a Mon Sep 17 00:00:00 2001 +From: Florian Westphal +Date: Fri, 28 Dec 2018 01:24:43 +0100 +Subject: netfilter: nf_conncount: don't skip eviction when age is negative + +From: Florian Westphal + +commit 4cd273bb91b3001f623f516ec726c49754571b1a upstream. + +age is signed integer, so result can be negative when the timestamps +have a large delta. In this case we want to discard the entry. + +Instead of using age >= 2 || age < 0, just make it unsigned. + +Fixes: b36e4523d4d56 ("netfilter: nf_conncount: fix garbage collection confirm race") +Reviewed-by: Shawn Bohrer +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Greg Kroah-Hartman + +--- + net/netfilter/nf_conncount.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/netfilter/nf_conncount.c ++++ b/net/netfilter/nf_conncount.c +@@ -155,7 +155,7 @@ find_or_evict(struct net *net, struct nf + const struct nf_conntrack_tuple_hash *found; + unsigned long a, b; + int cpu = raw_smp_processor_id(); +- __s32 age; ++ u32 age; + + found = nf_conntrack_find_get(net, &conn->zone, &conn->tuple); + if (found) diff --git a/queue-4.20/netfilter-nf_conncount-fix-argument-order-to-find_next_bit.patch b/queue-4.20/netfilter-nf_conncount-fix-argument-order-to-find_next_bit.patch new file mode 100644 index 00000000000..4b66c9ef258 --- /dev/null +++ b/queue-4.20/netfilter-nf_conncount-fix-argument-order-to-find_next_bit.patch @@ -0,0 +1,33 @@ +From a007232066f6839d6f256bab21e825d968f1a163 Mon Sep 17 00:00:00 2001 +From: Florian Westphal +Date: Fri, 28 Dec 2018 01:24:49 +0100 +Subject: netfilter: nf_conncount: fix argument order to find_next_bit + +From: Florian Westphal + +commit a007232066f6839d6f256bab21e825d968f1a163 upstream. + +Size and 'next bit' were swapped, this bug could cause worker to +reschedule itself even if system was idle. 
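+
+For reference, the bitmap helper's prototype is (sketch of the usual
+kernel declaration, shown here only to make the swap obvious):
+
+    unsigned long find_next_bit(const unsigned long *addr,
+                                unsigned long size,
+                                unsigned long offset);
+
+find_next_bit() returns 'size' when no set bit exists at or after
+'offset'.  With the arguments swapped, 'offset' (CONNCOUNT_SLOTS) is
+always past the bogus 'size' (next_tree), so the call always returns
+next_tree < CONNCOUNT_SLOTS and the worker reschedules itself even
+when no tree is pending.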
+ +Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") +Reviewed-by: Shawn Bohrer +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Greg Kroah-Hartman + +--- + net/netfilter/nf_conncount.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/net/netfilter/nf_conncount.c ++++ b/net/netfilter/nf_conncount.c +@@ -488,7 +488,7 @@ next: + clear_bit(tree, data->pending_trees); + + next_tree = (tree + 1) % CONNCOUNT_SLOTS; +- next_tree = find_next_bit(data->pending_trees, next_tree, CONNCOUNT_SLOTS); ++ next_tree = find_next_bit(data->pending_trees, CONNCOUNT_SLOTS, next_tree); + + if (next_tree < CONNCOUNT_SLOTS) { + data->gc_tree = next_tree; diff --git a/queue-4.20/netfilter-nf_conncount-merge-lookup-and-add-functions.patch b/queue-4.20/netfilter-nf_conncount-merge-lookup-and-add-functions.patch new file mode 100644 index 00000000000..792e855d6b0 --- /dev/null +++ b/queue-4.20/netfilter-nf_conncount-merge-lookup-and-add-functions.patch @@ -0,0 +1,359 @@ +From df4a902509766897f7371fdfa4c3bf8bc321b55d Mon Sep 17 00:00:00 2001 +From: Florian Westphal +Date: Fri, 28 Dec 2018 01:24:46 +0100 +Subject: netfilter: nf_conncount: merge lookup and add functions + +From: Florian Westphal + +commit df4a902509766897f7371fdfa4c3bf8bc321b55d upstream. + +'lookup' is always followed by 'add'. +Merge both and make the list-walk part of nf_conncount_add(). + +This also avoids one unneeded unlock/re-lock pair. + +Extra care needs to be taken in count_tree, as we only hold rcu +read lock, i.e. we can only insert to an existing tree node after +acquiring its lock and making sure it has a nonzero count. + +As a zero count should be rare, just fall back to insert_tree() +(which acquires tree lock). + +This issue and its solution were pointed out by Shawn Bohrer +during patch review. 
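+
+The resulting check in count_tree() looks roughly like this
+(simplified sketch, not the exact hunk):
+
+    spin_lock_bh(&rbconn->list.list_lock);
+    if (rbconn->list.count == 0) {
+        /* node is about to be reclaimed -> defer to insert_tree() */
+        spin_unlock_bh(&rbconn->list.list_lock);
+        break;
+    }
+    ret = __nf_conncount_add(net, &rbconn->list, tuple, zone);
+    spin_unlock_bh(&rbconn->list.list_lock);
+    return ret ? 0 /* hotdrop */ : rbconn->list.count;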
+ +Reviewed-by: Shawn Bohrer +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Greg Kroah-Hartman + +--- + include/net/netfilter/nf_conntrack_count.h | 18 --- + net/netfilter/nf_conncount.c | 146 +++++++++++++---------------- + net/netfilter/nft_connlimit.c | 14 -- + 3 files changed, 72 insertions(+), 106 deletions(-) + +--- a/include/net/netfilter/nf_conntrack_count.h ++++ b/include/net/netfilter/nf_conntrack_count.h +@@ -5,12 +5,6 @@ + + struct nf_conncount_data; + +-enum nf_conncount_list_add { +- NF_CONNCOUNT_ADDED, /* list add was ok */ +- NF_CONNCOUNT_ERR, /* -ENOMEM, must drop skb */ +- NF_CONNCOUNT_SKIP, /* list is already reclaimed by gc */ +-}; +- + struct nf_conncount_list { + spinlock_t list_lock; + struct list_head head; /* connections with the same filtering key */ +@@ -29,18 +23,12 @@ unsigned int nf_conncount_count(struct n + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone); + +-void nf_conncount_lookup(struct net *net, struct nf_conncount_list *list, +- const struct nf_conntrack_tuple *tuple, +- const struct nf_conntrack_zone *zone, +- bool *addit); ++int nf_conncount_add(struct net *net, struct nf_conncount_list *list, ++ const struct nf_conntrack_tuple *tuple, ++ const struct nf_conntrack_zone *zone); + + void nf_conncount_list_init(struct nf_conncount_list *list); + +-enum nf_conncount_list_add +-nf_conncount_add(struct nf_conncount_list *list, +- const struct nf_conntrack_tuple *tuple, +- const struct nf_conntrack_zone *zone); +- + bool nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list); + +--- a/net/netfilter/nf_conncount.c ++++ b/net/netfilter/nf_conncount.c +@@ -83,38 +83,6 @@ static int key_diff(const u32 *a, const + return memcmp(a, b, klen * sizeof(u32)); + } + +-enum nf_conncount_list_add +-nf_conncount_add(struct nf_conncount_list *list, +- const struct nf_conntrack_tuple *tuple, +- const struct nf_conntrack_zone *zone) +-{ +- struct nf_conncount_tuple *conn; +- +- if (WARN_ON_ONCE(list->count > INT_MAX)) +- return NF_CONNCOUNT_ERR; +- +- conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); +- if (conn == NULL) +- return NF_CONNCOUNT_ERR; +- +- conn->tuple = *tuple; +- conn->zone = *zone; +- conn->cpu = raw_smp_processor_id(); +- conn->jiffies32 = (u32)jiffies; +- conn->dead = false; +- spin_lock_bh(&list->list_lock); +- if (list->dead == true) { +- kmem_cache_free(conncount_conn_cachep, conn); +- spin_unlock_bh(&list->list_lock); +- return NF_CONNCOUNT_SKIP; +- } +- list_add_tail(&conn->node, &list->head); +- list->count++; +- spin_unlock_bh(&list->list_lock); +- return NF_CONNCOUNT_ADDED; +-} +-EXPORT_SYMBOL_GPL(nf_conncount_add); +- + static void __conn_free(struct rcu_head *h) + { + struct nf_conncount_tuple *conn; +@@ -177,11 +145,10 @@ find_or_evict(struct net *net, struct nf + return ERR_PTR(-EAGAIN); + } + +-void nf_conncount_lookup(struct net *net, +- struct nf_conncount_list *list, +- const struct nf_conntrack_tuple *tuple, +- const struct nf_conntrack_zone *zone, +- bool *addit) ++static int __nf_conncount_add(struct net *net, ++ struct nf_conncount_list *list, ++ const struct nf_conntrack_tuple *tuple, ++ const struct nf_conntrack_zone *zone) + { + const struct nf_conntrack_tuple_hash *found; + struct nf_conncount_tuple *conn, *conn_n; +@@ -189,9 +156,6 @@ void nf_conncount_lookup(struct net *net + unsigned int collect = 0; + bool free_entry = false; + +- /* best effort only */ +- *addit = tuple ? 
true : false; +- + /* check the saved connections */ + list_for_each_entry_safe(conn, conn_n, &list->head, node) { + if (collect > CONNCOUNT_GC_MAX_NODES) +@@ -201,21 +165,19 @@ void nf_conncount_lookup(struct net *net + if (IS_ERR(found)) { + /* Not found, but might be about to be confirmed */ + if (PTR_ERR(found) == -EAGAIN) { +- if (!tuple) +- continue; +- + if (nf_ct_tuple_equal(&conn->tuple, tuple) && + nf_ct_zone_id(&conn->zone, conn->zone.dir) == + nf_ct_zone_id(zone, zone->dir)) +- *addit = false; +- } else if (PTR_ERR(found) == -ENOENT) ++ return 0; /* already exists */ ++ } else { + collect++; ++ } + continue; + } + + found_ct = nf_ct_tuplehash_to_ctrack(found); + +- if (tuple && nf_ct_tuple_equal(&conn->tuple, tuple) && ++ if (nf_ct_tuple_equal(&conn->tuple, tuple) && + nf_ct_zone_equal(found_ct, zone, zone->dir)) { + /* + * We should not see tuples twice unless someone hooks +@@ -223,7 +185,8 @@ void nf_conncount_lookup(struct net *net + * + * Attempt to avoid a re-add in this case. + */ +- *addit = false; ++ nf_ct_put(found_ct); ++ return 0; + } else if (already_closed(found_ct)) { + /* + * we do not care about connections which are +@@ -237,8 +200,38 @@ void nf_conncount_lookup(struct net *net + + nf_ct_put(found_ct); + } ++ ++ if (WARN_ON_ONCE(list->count > INT_MAX)) ++ return -EOVERFLOW; ++ ++ conn = kmem_cache_alloc(conncount_conn_cachep, GFP_ATOMIC); ++ if (conn == NULL) ++ return -ENOMEM; ++ ++ conn->tuple = *tuple; ++ conn->zone = *zone; ++ conn->cpu = raw_smp_processor_id(); ++ conn->jiffies32 = (u32)jiffies; ++ list_add_tail(&conn->node, &list->head); ++ list->count++; ++ return 0; ++} ++ ++int nf_conncount_add(struct net *net, ++ struct nf_conncount_list *list, ++ const struct nf_conntrack_tuple *tuple, ++ const struct nf_conntrack_zone *zone) ++{ ++ int ret; ++ ++ /* check the saved connections */ ++ spin_lock_bh(&list->list_lock); ++ ret = __nf_conncount_add(net, list, tuple, zone); ++ spin_unlock_bh(&list->list_lock); ++ ++ return ret; + } +-EXPORT_SYMBOL_GPL(nf_conncount_lookup); ++EXPORT_SYMBOL_GPL(nf_conncount_add); + + void nf_conncount_list_init(struct nf_conncount_list *list) + { +@@ -339,13 +332,11 @@ insert_tree(struct net *net, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) + { +- enum nf_conncount_list_add ret; + struct nf_conncount_rb *gc_nodes[CONNCOUNT_GC_MAX_NODES]; + struct rb_node **rbnode, *parent; + struct nf_conncount_rb *rbconn; + struct nf_conncount_tuple *conn; + unsigned int count = 0, gc_count = 0; +- bool node_found = false; + bool do_gc = true; + + spin_lock_bh(&nf_conncount_locks[hash]); +@@ -363,20 +354,15 @@ restart: + } else if (diff > 0) { + rbnode = &((*rbnode)->rb_right); + } else { +- /* unlikely: other cpu added node already */ +- node_found = true; +- ret = nf_conncount_add(&rbconn->list, tuple, zone); +- if (ret == NF_CONNCOUNT_ERR) { ++ int ret; ++ ++ ret = nf_conncount_add(net, &rbconn->list, tuple, zone); ++ if (ret) + count = 0; /* hotdrop */ +- } else if (ret == NF_CONNCOUNT_ADDED) { ++ else + count = rbconn->list.count; +- } else { +- /* NF_CONNCOUNT_SKIP, rbconn is already +- * reclaimed by gc, insert a new tree node +- */ +- node_found = false; +- } +- break; ++ tree_nodes_free(root, gc_nodes, gc_count); ++ goto out_unlock; + } + + if (gc_count >= ARRAY_SIZE(gc_nodes)) +@@ -394,9 +380,6 @@ restart: + goto restart; + } + +- if (node_found) +- goto out_unlock; +- + /* expected case: match, insert new node */ + rbconn = kmem_cache_alloc(conncount_rb_cachep, GFP_ATOMIC); + if (rbconn == 
NULL) +@@ -431,7 +414,6 @@ count_tree(struct net *net, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) + { +- enum nf_conncount_list_add ret; + struct rb_root *root; + struct rb_node *parent; + struct nf_conncount_rb *rbconn; +@@ -444,7 +426,6 @@ count_tree(struct net *net, + parent = rcu_dereference_raw(root->rb_node); + while (parent) { + int diff; +- bool addit; + + rbconn = rb_entry(parent, struct nf_conncount_rb, node); + +@@ -454,24 +435,29 @@ count_tree(struct net *net, + } else if (diff > 0) { + parent = rcu_dereference_raw(parent->rb_right); + } else { +- /* same source network -> be counted! */ +- nf_conncount_lookup(net, &rbconn->list, tuple, zone, +- &addit); ++ int ret; + +- if (!addit) ++ if (!tuple) { ++ nf_conncount_gc_list(net, &rbconn->list); + return rbconn->list.count; ++ } + +- ret = nf_conncount_add(&rbconn->list, tuple, zone); +- if (ret == NF_CONNCOUNT_ERR) { +- return 0; /* hotdrop */ +- } else if (ret == NF_CONNCOUNT_ADDED) { +- return rbconn->list.count; +- } else { +- /* NF_CONNCOUNT_SKIP, rbconn is already +- * reclaimed by gc, insert a new tree node +- */ ++ spin_lock_bh(&rbconn->list.list_lock); ++ /* Node might be about to be free'd. ++ * We need to defer to insert_tree() in this case. ++ */ ++ if (rbconn->list.count == 0) { ++ spin_unlock_bh(&rbconn->list.list_lock); + break; + } ++ ++ /* same source network -> be counted! */ ++ ret = __nf_conncount_add(net, &rbconn->list, tuple, zone); ++ spin_unlock_bh(&rbconn->list.list_lock); ++ if (ret) ++ return 0; /* hotdrop */ ++ else ++ return rbconn->list.count; + } + } + +--- a/net/netfilter/nft_connlimit.c ++++ b/net/netfilter/nft_connlimit.c +@@ -30,7 +30,6 @@ static inline void nft_connlimit_do_eval + enum ip_conntrack_info ctinfo; + const struct nf_conn *ct; + unsigned int count; +- bool addit; + + tuple_ptr = &tuple; + +@@ -44,19 +43,12 @@ static inline void nft_connlimit_do_eval + return; + } + +- nf_conncount_lookup(nft_net(pkt), &priv->list, tuple_ptr, zone, +- &addit); +- count = priv->list.count; +- +- if (!addit) +- goto out; +- +- if (nf_conncount_add(&priv->list, tuple_ptr, zone) == NF_CONNCOUNT_ERR) { ++ if (nf_conncount_add(nft_net(pkt), &priv->list, tuple_ptr, zone)) { + regs->verdict.code = NF_DROP; + return; + } +- count++; +-out: ++ ++ count = priv->list.count; + + if ((count > priv->limit) ^ priv->invert) { + regs->verdict.code = NFT_BREAK; diff --git a/queue-4.20/netfilter-nf_conncount-move-all-list-iterations-under-spinlock.patch b/queue-4.20/netfilter-nf_conncount-move-all-list-iterations-under-spinlock.patch new file mode 100644 index 00000000000..72539b4b059 --- /dev/null +++ b/queue-4.20/netfilter-nf_conncount-move-all-list-iterations-under-spinlock.patch @@ -0,0 +1,155 @@ +From 2f971a8f425545da52ca0e6bee81f5b1ea0ccc5f Mon Sep 17 00:00:00 2001 +From: Pablo Neira Ayuso +Date: Fri, 28 Dec 2018 01:24:47 +0100 +Subject: netfilter: nf_conncount: move all list iterations under spinlock + +From: Pablo Neira Ayuso + +commit 2f971a8f425545da52ca0e6bee81f5b1ea0ccc5f upstream. + +Two CPUs may race to remove a connection from the list, the existing +conn->dead will result in a use-after-free. Use the per-list spinlock to +protect list iterations. + +As all accesses to the list now happen while holding the per-list lock, +we no longer need to delay free operations with rcu. + +Joint work with Florian. 
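+
+After this change the gc path is, roughly (simplified sketch; callers
+already run with BH disabled):
+
+    /* don't bother if another cpu is already doing GC */
+    if (!spin_trylock(&list->list_lock))
+        return false;
+
+    list_for_each_entry_safe(conn, conn_n, &list->head, node) {
+        /* evict/free entries; conn_free() now just asserts the
+         * lock via lockdep_assert_held(), does a plain list_del()
+         * and kmem_cache_free(), no more call_rcu()
+         */
+    }
+    spin_unlock(&list->list_lock);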
+ +Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") +Reviewed-by: Shawn Bohrer +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Greg Kroah-Hartman + +--- + net/netfilter/nf_conncount.c | 46 ++++++++++++++++++------------------------- + 1 file changed, 20 insertions(+), 26 deletions(-) + +--- a/net/netfilter/nf_conncount.c ++++ b/net/netfilter/nf_conncount.c +@@ -43,8 +43,6 @@ struct nf_conncount_tuple { + struct nf_conntrack_zone zone; + int cpu; + u32 jiffies32; +- bool dead; +- struct rcu_head rcu_head; + }; + + struct nf_conncount_rb { +@@ -83,36 +81,21 @@ static int key_diff(const u32 *a, const + return memcmp(a, b, klen * sizeof(u32)); + } + +-static void __conn_free(struct rcu_head *h) +-{ +- struct nf_conncount_tuple *conn; +- +- conn = container_of(h, struct nf_conncount_tuple, rcu_head); +- kmem_cache_free(conncount_conn_cachep, conn); +-} +- + static bool conn_free(struct nf_conncount_list *list, + struct nf_conncount_tuple *conn) + { + bool free_entry = false; + +- spin_lock_bh(&list->list_lock); +- +- if (conn->dead) { +- spin_unlock_bh(&list->list_lock); +- return free_entry; +- } ++ lockdep_assert_held(&list->list_lock); + + list->count--; +- conn->dead = true; +- list_del_rcu(&conn->node); ++ list_del(&conn->node); + if (list->count == 0) { + list->dead = true; + free_entry = true; + } + +- spin_unlock_bh(&list->list_lock); +- call_rcu(&conn->rcu_head, __conn_free); ++ kmem_cache_free(conncount_conn_cachep, conn); + return free_entry; + } + +@@ -242,7 +225,7 @@ void nf_conncount_list_init(struct nf_co + } + EXPORT_SYMBOL_GPL(nf_conncount_list_init); + +-/* Return true if the list is empty */ ++/* Return true if the list is empty. Must be called with BH disabled. 
*/ + bool nf_conncount_gc_list(struct net *net, + struct nf_conncount_list *list) + { +@@ -253,12 +236,18 @@ bool nf_conncount_gc_list(struct net *ne + bool free_entry = false; + bool ret = false; + ++ /* don't bother if other cpu is already doing GC */ ++ if (!spin_trylock(&list->list_lock)) ++ return false; ++ + list_for_each_entry_safe(conn, conn_n, &list->head, node) { + found = find_or_evict(net, list, conn, &free_entry); + if (IS_ERR(found)) { + if (PTR_ERR(found) == -ENOENT) { +- if (free_entry) ++ if (free_entry) { ++ spin_unlock(&list->list_lock); + return true; ++ } + collected++; + } + continue; +@@ -271,23 +260,24 @@ bool nf_conncount_gc_list(struct net *ne + * closed already -> ditch it + */ + nf_ct_put(found_ct); +- if (conn_free(list, conn)) ++ if (conn_free(list, conn)) { ++ spin_unlock(&list->list_lock); + return true; ++ } + collected++; + continue; + } + + nf_ct_put(found_ct); + if (collected > CONNCOUNT_GC_MAX_NODES) +- return false; ++ break; + } + +- spin_lock_bh(&list->list_lock); + if (!list->count) { + list->dead = true; + ret = true; + } +- spin_unlock_bh(&list->list_lock); ++ spin_unlock(&list->list_lock); + + return ret; + } +@@ -478,6 +468,7 @@ static void tree_gc_worker(struct work_s + tree = data->gc_tree % CONNCOUNT_SLOTS; + root = &data->root[tree]; + ++ local_bh_disable(); + rcu_read_lock(); + for (node = rb_first(root); node != NULL; node = rb_next(node)) { + rbconn = rb_entry(node, struct nf_conncount_rb, node); +@@ -485,6 +476,9 @@ static void tree_gc_worker(struct work_s + gc_count++; + } + rcu_read_unlock(); ++ local_bh_enable(); ++ ++ cond_resched(); + + spin_lock_bh(&nf_conncount_locks[tree]); + if (gc_count < ARRAY_SIZE(gc_nodes)) diff --git a/queue-4.20/netfilter-nf_conncount-replace-conncount_lock_slots-with-conncount_slots.patch b/queue-4.20/netfilter-nf_conncount-replace-conncount_lock_slots-with-conncount_slots.patch new file mode 100644 index 00000000000..7c3accd6173 --- /dev/null +++ b/queue-4.20/netfilter-nf_conncount-replace-conncount_lock_slots-with-conncount_slots.patch @@ -0,0 +1,91 @@ +From c78e7818f16f687389174c4569243abbec8dc68f Mon Sep 17 00:00:00 2001 +From: Shawn Bohrer +Date: Fri, 28 Dec 2018 01:24:42 +0100 +Subject: netfilter: nf_conncount: replace CONNCOUNT_LOCK_SLOTS with CONNCOUNT_SLOTS + +From: Shawn Bohrer + +commit c78e7818f16f687389174c4569243abbec8dc68f upstream. + +Most of the time these were the same value anyway, but when +CONFIG_LOCKDEP was enabled we would use a smaller number of locks to +reduce overhead. Unfortunately having two values is confusing and not +worth the complexity. + +This fixes a bug where tree_gc_worker() would only GC up to +CONNCOUNT_LOCK_SLOTS trees which meant when CONFIG_LOCKDEP was enabled +not all trees would be GCed by tree_gc_worker(). 
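+
+Concretely, with CONFIG_LOCKDEP the worker used to pick its tree as
+(pre-patch sketch):
+
+    tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS;    /* i.e. % 8 */
+    root = &data->root[tree];
+
+so only the first 8 of the CONNCOUNT_SLOTS (256) trees could ever be
+selected for garbage collection.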
+ +Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") +Signed-off-by: Florian Westphal +Signed-off-by: Shawn Bohrer +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Greg Kroah-Hartman + +--- + net/netfilter/nf_conncount.c | 19 +++++-------------- + 1 file changed, 5 insertions(+), 14 deletions(-) + +--- a/net/netfilter/nf_conncount.c ++++ b/net/netfilter/nf_conncount.c +@@ -33,12 +33,6 @@ + + #define CONNCOUNT_SLOTS 256U + +-#ifdef CONFIG_LOCKDEP +-#define CONNCOUNT_LOCK_SLOTS 8U +-#else +-#define CONNCOUNT_LOCK_SLOTS 256U +-#endif +- + #define CONNCOUNT_GC_MAX_NODES 8 + #define MAX_KEYLEN 5 + +@@ -60,7 +54,7 @@ struct nf_conncount_rb { + struct rcu_head rcu_head; + }; + +-static spinlock_t nf_conncount_locks[CONNCOUNT_LOCK_SLOTS] __cacheline_aligned_in_smp; ++static spinlock_t nf_conncount_locks[CONNCOUNT_SLOTS] __cacheline_aligned_in_smp; + + struct nf_conncount_data { + unsigned int keylen; +@@ -353,7 +347,7 @@ insert_tree(struct net *net, + unsigned int count = 0, gc_count = 0; + bool node_found = false; + +- spin_lock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); ++ spin_lock_bh(&nf_conncount_locks[hash]); + + parent = NULL; + rbnode = &(root->rb_node); +@@ -430,7 +424,7 @@ insert_tree(struct net *net, + rb_link_node_rcu(&rbconn->node, parent, rbnode); + rb_insert_color(&rbconn->node, root); + out_unlock: +- spin_unlock_bh(&nf_conncount_locks[hash % CONNCOUNT_LOCK_SLOTS]); ++ spin_unlock_bh(&nf_conncount_locks[hash]); + return count; + } + +@@ -499,7 +493,7 @@ static void tree_gc_worker(struct work_s + struct rb_node *node; + unsigned int tree, next_tree, gc_count = 0; + +- tree = data->gc_tree % CONNCOUNT_LOCK_SLOTS; ++ tree = data->gc_tree % CONNCOUNT_SLOTS; + root = &data->root[tree]; + + rcu_read_lock(); +@@ -621,10 +615,7 @@ static int __init nf_conncount_modinit(v + { + int i; + +- BUILD_BUG_ON(CONNCOUNT_LOCK_SLOTS > CONNCOUNT_SLOTS); +- BUILD_BUG_ON((CONNCOUNT_SLOTS % CONNCOUNT_LOCK_SLOTS) != 0); +- +- for (i = 0; i < CONNCOUNT_LOCK_SLOTS; ++i) ++ for (i = 0; i < CONNCOUNT_SLOTS; ++i) + spin_lock_init(&nf_conncount_locks[i]); + + conncount_conn_cachep = kmem_cache_create("nf_conncount_tuple", diff --git a/queue-4.20/netfilter-nf_conncount-restart-search-when-nodes-have-been-erased.patch b/queue-4.20/netfilter-nf_conncount-restart-search-when-nodes-have-been-erased.patch new file mode 100644 index 00000000000..4fb52a73f2e --- /dev/null +++ b/queue-4.20/netfilter-nf_conncount-restart-search-when-nodes-have-been-erased.patch @@ -0,0 +1,86 @@ +From e8cfb372b38a1b8979aa7f7631fb5e7b11c3793c Mon Sep 17 00:00:00 2001 +From: Florian Westphal +Date: Fri, 28 Dec 2018 01:24:45 +0100 +Subject: netfilter: nf_conncount: restart search when nodes have been erased + +From: Florian Westphal + +commit e8cfb372b38a1b8979aa7f7631fb5e7b11c3793c upstream. + +Shawn Bohrer reported a following crash: + |RIP: 0010:rb_erase+0xae/0x360 + [..] + Call Trace: + nf_conncount_destroy+0x59/0xc0 [nf_conncount] + cleanup_match+0x45/0x70 [ip_tables] + ... + +Shawn tracked this down to bogus 'parent' pointer: +Problem is that when we insert a new node, then there is a chance that +the 'parent' that we found was also passed to tree_nodes_free() (because +that node was empty) for erase+free. + +Instead of trying to be clever and detect when this happens, restart +the search if we have evicted one or more nodes. To prevent frequent +restarts, do not perform gc on the second round. + +Also, unconditionally schedule the gc worker. 
+The condition + + gc_count > ARRAY_SIZE(gc_nodes)) + +cannot be true unless tree grows very large, as the height of the tree +will be low even with hundreds of nodes present. + +Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") +Reported-by: Shawn Bohrer +Reviewed-by: Shawn Bohrer +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Greg Kroah-Hartman + +--- + net/netfilter/nf_conncount.c | 18 +++++++----------- + 1 file changed, 7 insertions(+), 11 deletions(-) + +--- a/net/netfilter/nf_conncount.c ++++ b/net/netfilter/nf_conncount.c +@@ -346,9 +346,10 @@ insert_tree(struct net *net, + struct nf_conncount_tuple *conn; + unsigned int count = 0, gc_count = 0; + bool node_found = false; ++ bool do_gc = true; + + spin_lock_bh(&nf_conncount_locks[hash]); +- ++restart: + parent = NULL; + rbnode = &(root->rb_node); + while (*rbnode) { +@@ -381,21 +382,16 @@ insert_tree(struct net *net, + if (gc_count >= ARRAY_SIZE(gc_nodes)) + continue; + +- if (nf_conncount_gc_list(net, &rbconn->list)) ++ if (do_gc && nf_conncount_gc_list(net, &rbconn->list)) + gc_nodes[gc_count++] = rbconn; + } + + if (gc_count) { + tree_nodes_free(root, gc_nodes, gc_count); +- /* tree_node_free before new allocation permits +- * allocator to re-use newly free'd object. +- * +- * This is a rare event; in most cases we will find +- * existing node to re-use. (or gc_count is 0). +- */ +- +- if (gc_count >= ARRAY_SIZE(gc_nodes)) +- schedule_gc_worker(data, hash); ++ schedule_gc_worker(data, hash); ++ gc_count = 0; ++ do_gc = false; ++ goto restart; + } + + if (node_found) diff --git a/queue-4.20/netfilter-nf_conncount-speculative-garbage-collection-on-empty-lists.patch b/queue-4.20/netfilter-nf_conncount-speculative-garbage-collection-on-empty-lists.patch new file mode 100644 index 00000000000..43f0b436c57 --- /dev/null +++ b/queue-4.20/netfilter-nf_conncount-speculative-garbage-collection-on-empty-lists.patch @@ -0,0 +1,198 @@ +From c80f10bc973af2ace6b1414724eeff61eaa71837 Mon Sep 17 00:00:00 2001 +From: Pablo Neira Ayuso +Date: Fri, 28 Dec 2018 01:24:48 +0100 +Subject: netfilter: nf_conncount: speculative garbage collection on empty lists + +From: Pablo Neira Ayuso + +commit c80f10bc973af2ace6b1414724eeff61eaa71837 upstream. + +Instead of removing a empty list node that might be reintroduced soon +thereafter, tentatively place the empty list node on the list passed to +tree_nodes_free(), then re-check if the list is empty again before erasing +it from the tree. 
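+
+The erase step in tree_nodes_free() then becomes, roughly (sketch of
+the re-check under the per-list lock):
+
+    while (gc_count) {
+        rbconn = gc_nodes[--gc_count];
+        spin_lock(&rbconn->list.list_lock);
+        if (!rbconn->list.count) {
+            /* still empty -> really erase it */
+            rb_erase(&rbconn->node, root);
+            call_rcu(&rbconn->rcu_head, __tree_nodes_free);
+        }
+        spin_unlock(&rbconn->list.list_lock);
+    }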
+ +[ Florian: rebase on top of pending nf_conncount fixes ] + +Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") +Reviewed-by: Shawn Bohrer +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Greg Kroah-Hartman + +--- + include/net/netfilter/nf_conntrack_count.h | 1 + net/netfilter/nf_conncount.c | 47 +++++++++-------------------- + 2 files changed, 15 insertions(+), 33 deletions(-) + +--- a/include/net/netfilter/nf_conntrack_count.h ++++ b/include/net/netfilter/nf_conntrack_count.h +@@ -9,7 +9,6 @@ struct nf_conncount_list { + spinlock_t list_lock; + struct list_head head; /* connections with the same filtering key */ + unsigned int count; /* length of list */ +- bool dead; + }; + + struct nf_conncount_data *nf_conncount_init(struct net *net, unsigned int family, +--- a/net/netfilter/nf_conncount.c ++++ b/net/netfilter/nf_conncount.c +@@ -81,27 +81,20 @@ static int key_diff(const u32 *a, const + return memcmp(a, b, klen * sizeof(u32)); + } + +-static bool conn_free(struct nf_conncount_list *list, ++static void conn_free(struct nf_conncount_list *list, + struct nf_conncount_tuple *conn) + { +- bool free_entry = false; +- + lockdep_assert_held(&list->list_lock); + + list->count--; + list_del(&conn->node); +- if (list->count == 0) { +- list->dead = true; +- free_entry = true; +- } + + kmem_cache_free(conncount_conn_cachep, conn); +- return free_entry; + } + + static const struct nf_conntrack_tuple_hash * + find_or_evict(struct net *net, struct nf_conncount_list *list, +- struct nf_conncount_tuple *conn, bool *free_entry) ++ struct nf_conncount_tuple *conn) + { + const struct nf_conntrack_tuple_hash *found; + unsigned long a, b; +@@ -121,7 +114,7 @@ find_or_evict(struct net *net, struct nf + */ + age = a - b; + if (conn->cpu == cpu || age >= 2) { +- *free_entry = conn_free(list, conn); ++ conn_free(list, conn); + return ERR_PTR(-ENOENT); + } + +@@ -137,14 +130,13 @@ static int __nf_conncount_add(struct net + struct nf_conncount_tuple *conn, *conn_n; + struct nf_conn *found_ct; + unsigned int collect = 0; +- bool free_entry = false; + + /* check the saved connections */ + list_for_each_entry_safe(conn, conn_n, &list->head, node) { + if (collect > CONNCOUNT_GC_MAX_NODES) + break; + +- found = find_or_evict(net, list, conn, &free_entry); ++ found = find_or_evict(net, list, conn); + if (IS_ERR(found)) { + /* Not found, but might be about to be confirmed */ + if (PTR_ERR(found) == -EAGAIN) { +@@ -221,7 +213,6 @@ void nf_conncount_list_init(struct nf_co + spin_lock_init(&list->list_lock); + INIT_LIST_HEAD(&list->head); + list->count = 0; +- list->dead = false; + } + EXPORT_SYMBOL_GPL(nf_conncount_list_init); + +@@ -233,7 +224,6 @@ bool nf_conncount_gc_list(struct net *ne + struct nf_conncount_tuple *conn, *conn_n; + struct nf_conn *found_ct; + unsigned int collected = 0; +- bool free_entry = false; + bool ret = false; + + /* don't bother if other cpu is already doing GC */ +@@ -241,15 +231,10 @@ bool nf_conncount_gc_list(struct net *ne + return false; + + list_for_each_entry_safe(conn, conn_n, &list->head, node) { +- found = find_or_evict(net, list, conn, &free_entry); ++ found = find_or_evict(net, list, conn); + if (IS_ERR(found)) { +- if (PTR_ERR(found) == -ENOENT) { +- if (free_entry) { +- spin_unlock(&list->list_lock); +- return true; +- } ++ if (PTR_ERR(found) == -ENOENT) + collected++; +- } + continue; + } + +@@ -260,10 +245,7 @@ bool nf_conncount_gc_list(struct net *ne + * closed already -> ditch it + */ 
+ nf_ct_put(found_ct); +- if (conn_free(list, conn)) { +- spin_unlock(&list->list_lock); +- return true; +- } ++ conn_free(list, conn); + collected++; + continue; + } +@@ -273,10 +255,8 @@ bool nf_conncount_gc_list(struct net *ne + break; + } + +- if (!list->count) { +- list->dead = true; ++ if (!list->count) + ret = true; +- } + spin_unlock(&list->list_lock); + + return ret; +@@ -291,6 +271,7 @@ static void __tree_nodes_free(struct rcu + kmem_cache_free(conncount_rb_cachep, rbconn); + } + ++/* caller must hold tree nf_conncount_locks[] lock */ + static void tree_nodes_free(struct rb_root *root, + struct nf_conncount_rb *gc_nodes[], + unsigned int gc_count) +@@ -300,8 +281,10 @@ static void tree_nodes_free(struct rb_ro + while (gc_count) { + rbconn = gc_nodes[--gc_count]; + spin_lock(&rbconn->list.list_lock); +- rb_erase(&rbconn->node, root); +- call_rcu(&rbconn->rcu_head, __tree_nodes_free); ++ if (!rbconn->list.count) { ++ rb_erase(&rbconn->node, root); ++ call_rcu(&rbconn->rcu_head, __tree_nodes_free); ++ } + spin_unlock(&rbconn->list.list_lock); + } + } +@@ -318,7 +301,6 @@ insert_tree(struct net *net, + struct rb_root *root, + unsigned int hash, + const u32 *key, +- u8 keylen, + const struct nf_conntrack_tuple *tuple, + const struct nf_conntrack_zone *zone) + { +@@ -327,6 +309,7 @@ insert_tree(struct net *net, + struct nf_conncount_rb *rbconn; + struct nf_conncount_tuple *conn; + unsigned int count = 0, gc_count = 0; ++ u8 keylen = data->keylen; + bool do_gc = true; + + spin_lock_bh(&nf_conncount_locks[hash]); +@@ -454,7 +437,7 @@ count_tree(struct net *net, + if (!tuple) + return 0; + +- return insert_tree(net, data, root, hash, key, keylen, tuple, zone); ++ return insert_tree(net, data, root, hash, key, tuple, zone); + } + + static void tree_gc_worker(struct work_struct *work) diff --git a/queue-4.20/netfilter-nf_conncount-split-gc-in-two-phases.patch b/queue-4.20/netfilter-nf_conncount-split-gc-in-two-phases.patch new file mode 100644 index 00000000000..8a92ff6a75d --- /dev/null +++ b/queue-4.20/netfilter-nf_conncount-split-gc-in-two-phases.patch @@ -0,0 +1,77 @@ +From f7fcc98dfc2d136722007fec0debbed761679b94 Mon Sep 17 00:00:00 2001 +From: Florian Westphal +Date: Fri, 28 Dec 2018 01:24:44 +0100 +Subject: netfilter: nf_conncount: split gc in two phases + +From: Florian Westphal + +commit f7fcc98dfc2d136722007fec0debbed761679b94 upstream. + +The lockless workqueue garbage collector can race with packet path +garbage collector to delete list nodes, as it calls tree_nodes_free() +with the addresses of nodes that might have been free'd already from +another cpu. + +To fix this, split gc into two phases. + +One phase to perform gc on the connections: From a locking perspective, +this is the same as count_tree(): we hold rcu lock, but we do not +change the tree, we only change the nodes' contents. + +The second phase acquires the tree lock and reaps empty nodes. +This avoids a race condition of the garbage collection vs. packet path: +If a node has been free'd already, the second phase won't find it anymore. + +This second phase is, from locking perspective, same as insert_tree(). + +The former only modifies nodes (list content, count), latter modifies +the tree itself (rb_erase or rb_insert). 
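+
+In tree_gc_worker() the two phases then look roughly like this
+(simplified sketch):
+
+    /* phase 1: rcu read lock only, gc the per-node lists */
+    rcu_read_lock();
+    for (node = rb_first(root); node != NULL; node = rb_next(node)) {
+        rbconn = rb_entry(node, struct nf_conncount_rb, node);
+        if (nf_conncount_gc_list(data->net, &rbconn->list))
+            gc_count++;
+    }
+    rcu_read_unlock();
+
+    /* phase 2: take the tree lock, re-walk and reap empty nodes */
+    spin_lock_bh(&nf_conncount_locks[tree]);
+    /* collect nodes with rbconn->list.count == 0 into gc_nodes[]
+     * and hand them to tree_nodes_free()
+     */
+    spin_unlock_bh(&nf_conncount_locks[tree]);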
+ +Fixes: 5c789e131cbb9 ("netfilter: nf_conncount: Add list lock and gc worker, and RCU for init tree search") +Reviewed-by: Shawn Bohrer +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Greg Kroah-Hartman + +--- + net/netfilter/nf_conncount.c | 22 +++++++++++++++++++--- + 1 file changed, 19 insertions(+), 3 deletions(-) + +--- a/net/netfilter/nf_conncount.c ++++ b/net/netfilter/nf_conncount.c +@@ -500,16 +500,32 @@ static void tree_gc_worker(struct work_s + for (node = rb_first(root); node != NULL; node = rb_next(node)) { + rbconn = rb_entry(node, struct nf_conncount_rb, node); + if (nf_conncount_gc_list(data->net, &rbconn->list)) +- gc_nodes[gc_count++] = rbconn; ++ gc_count++; + } + rcu_read_unlock(); + + spin_lock_bh(&nf_conncount_locks[tree]); ++ if (gc_count < ARRAY_SIZE(gc_nodes)) ++ goto next; /* do not bother */ + +- if (gc_count) { +- tree_nodes_free(root, gc_nodes, gc_count); ++ gc_count = 0; ++ node = rb_first(root); ++ while (node != NULL) { ++ rbconn = rb_entry(node, struct nf_conncount_rb, node); ++ node = rb_next(node); ++ ++ if (rbconn->list.count > 0) ++ continue; ++ ++ gc_nodes[gc_count++] = rbconn; ++ if (gc_count >= ARRAY_SIZE(gc_nodes)) { ++ tree_nodes_free(root, gc_nodes, gc_count); ++ gc_count = 0; ++ } + } + ++ tree_nodes_free(root, gc_nodes, gc_count); ++next: + clear_bit(tree, data->pending_trees); + + next_tree = (tree + 1) % CONNCOUNT_SLOTS; diff --git a/queue-4.20/series b/queue-4.20/series index 900517d6332..8241e1f4d55 100644 --- a/queue-4.20/series +++ b/queue-4.20/series @@ -1,5 +1,13 @@ tty-ldsem-wake-up-readers-after-timed-out-down_write.patch -can-gw-ensure-dlc-boundaries-after-can-frame-modification.patch tty-hold-tty_ldisc_lock-during-tty_reopen.patch tty-simplify-tty-count-math-in-tty_reopen.patch tty-don-t-hold-ldisc-lock-in-tty_reopen-if-ldisc-present.patch +can-gw-ensure-dlc-boundaries-after-can-frame-modification.patch +netfilter-nf_conncount-replace-conncount_lock_slots-with-conncount_slots.patch +netfilter-nf_conncount-don-t-skip-eviction-when-age-is-negative.patch +netfilter-nf_conncount-split-gc-in-two-phases.patch +netfilter-nf_conncount-restart-search-when-nodes-have-been-erased.patch +netfilter-nf_conncount-merge-lookup-and-add-functions.patch +netfilter-nf_conncount-move-all-list-iterations-under-spinlock.patch +netfilter-nf_conncount-speculative-garbage-collection-on-empty-lists.patch +netfilter-nf_conncount-fix-argument-order-to-find_next_bit.patch