From f0c07a01c42cc37f641c12ed86ac68fb60a46a2e Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 2 Mar 2020 20:21:45 +0100 Subject: [PATCH] 5.4-stable patches added patches: net-smc-transfer-fasync_list-in-case-of-fallback.patch netfilter-ipset-fix-forceadd-evaluation-path.patch netfilter-ipset-fix-info-rcu-detected-stall-in-hash_xxx-reports.patch netfilter-xt_hashlimit-reduce-hashlimit_mutex-scope-for-htable_put.patch vhost-check-docket-sk_family-instead-of-call-getname.patch --- ...sfer-fasync_list-in-case-of-fallback.patch | 37 + ...r-ipset-fix-forceadd-evaluation-path.patch | 34 + ...u-detected-stall-in-hash_xxx-reports.patch | 1311 +++++++++++++++++ ...hashlimit_mutex-scope-for-htable_put.patch | 78 + queue-5.4/series | 5 + ...et-sk_family-instead-of-call-getname.patch | 57 + 6 files changed, 1522 insertions(+) create mode 100644 queue-5.4/net-smc-transfer-fasync_list-in-case-of-fallback.patch create mode 100644 queue-5.4/netfilter-ipset-fix-forceadd-evaluation-path.patch create mode 100644 queue-5.4/netfilter-ipset-fix-info-rcu-detected-stall-in-hash_xxx-reports.patch create mode 100644 queue-5.4/netfilter-xt_hashlimit-reduce-hashlimit_mutex-scope-for-htable_put.patch create mode 100644 queue-5.4/vhost-check-docket-sk_family-instead-of-call-getname.patch diff --git a/queue-5.4/net-smc-transfer-fasync_list-in-case-of-fallback.patch b/queue-5.4/net-smc-transfer-fasync_list-in-case-of-fallback.patch new file mode 100644 index 00000000000..8382722779c --- /dev/null +++ b/queue-5.4/net-smc-transfer-fasync_list-in-case-of-fallback.patch @@ -0,0 +1,37 @@ +From 67f562e3e147750a02b2a91d21a163fc44a1d13e Mon Sep 17 00:00:00 2001 +From: Ursula Braun +Date: Fri, 14 Feb 2020 08:58:59 +0100 +Subject: net/smc: transfer fasync_list in case of fallback + +From: Ursula Braun + +commit 67f562e3e147750a02b2a91d21a163fc44a1d13e upstream. + +SMC does not work together with FASTOPEN. 
If sendmsg() is called with +flag MSG_FASTOPEN in SMC_INIT state, the SMC-socket switches to +fallback mode. To handle the previous ioctl FIOASYNC call correctly +in this case, it is necessary to transfer the socket wait queue +fasync_list to the internal TCP socket. + +Reported-by: syzbot+4b1fe8105f8044a26162@syzkaller.appspotmail.com +Fixes: ee9dfbef02d18 ("net/smc: handle sockopts forcing fallback") +Signed-off-by: Ursula Braun +Signed-off-by: Karsten Graul +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman + +--- + net/smc/af_smc.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/net/smc/af_smc.c ++++ b/net/smc/af_smc.c +@@ -467,6 +467,8 @@ static void smc_switch_to_fallback(struc + if (smc->sk.sk_socket && smc->sk.sk_socket->file) { + smc->clcsock->file = smc->sk.sk_socket->file; + smc->clcsock->file->private_data = smc->clcsock; ++ smc->clcsock->wq.fasync_list = ++ smc->sk.sk_socket->wq.fasync_list; + } + } + diff --git a/queue-5.4/netfilter-ipset-fix-forceadd-evaluation-path.patch b/queue-5.4/netfilter-ipset-fix-forceadd-evaluation-path.patch new file mode 100644 index 00000000000..a3faf020da4 --- /dev/null +++ b/queue-5.4/netfilter-ipset-fix-forceadd-evaluation-path.patch @@ -0,0 +1,34 @@ +From 8af1c6fbd9239877998c7f5a591cb2c88d41fb66 Mon Sep 17 00:00:00 2001 +From: Jozsef Kadlecsik +Date: Sat, 22 Feb 2020 12:01:43 +0100 +Subject: netfilter: ipset: Fix forceadd evaluation path + +From: Jozsef Kadlecsik + +commit 8af1c6fbd9239877998c7f5a591cb2c88d41fb66 upstream. + +When the forceadd option is enabled, the hash:* types should find and replace +the first entry in the bucket with the new one if there are no reuseable +(deleted or timed out) entries. However, the position index was just not set +to zero and remained the invalid -1 if there were no reuseable entries. 
+ +Reported-by: syzbot+6a86565c74ebe30aea18@syzkaller.appspotmail.com +Fixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and protocol version 7") +Signed-off-by: Jozsef Kadlecsik +Signed-off-by: Greg Kroah-Hartman + +--- + net/netfilter/ipset/ip_set_hash_gen.h | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/net/netfilter/ipset/ip_set_hash_gen.h ++++ b/net/netfilter/ipset/ip_set_hash_gen.h +@@ -931,6 +931,8 @@ mtype_add(struct ip_set *set, void *valu + } + } + if (reuse || forceadd) { ++ if (j == -1) ++ j = 0; + data = ahash_data(n, j, set->dsize); + if (!deleted) { + #ifdef IP_SET_HASH_WITH_NETS diff --git a/queue-5.4/netfilter-ipset-fix-info-rcu-detected-stall-in-hash_xxx-reports.patch b/queue-5.4/netfilter-ipset-fix-info-rcu-detected-stall-in-hash_xxx-reports.patch new file mode 100644 index 00000000000..eb24c176871 --- /dev/null +++ b/queue-5.4/netfilter-ipset-fix-info-rcu-detected-stall-in-hash_xxx-reports.patch @@ -0,0 +1,1311 @@ +From f66ee0410b1c3481ee75e5db9b34547b4d582465 Mon Sep 17 00:00:00 2001 +From: Jozsef Kadlecsik +Date: Tue, 11 Feb 2020 23:20:43 +0100 +Subject: netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports + +From: Jozsef Kadlecsik + +commit f66ee0410b1c3481ee75e5db9b34547b4d582465 upstream. + +In the case of huge hash:* types of sets, due to the single spinlock of +a set the processing of the whole set under spinlock protection could take +too long. + +There were four places where the whole hash table of the set was processed +from bucket to bucket under holding the spinlock: + +- During resizing a set, the original set was locked to exclude kernel side + add/del element operations (userspace add/del is excluded by the + nfnetlink mutex). The original set is actually just read during the + resize, so the spinlocking is replaced with rcu locking of regions. + However, thus there can be parallel kernel side add/del of entries. 
+ In order not to lose those operations a backlog is added and replayed + after the successful resize. +- Garbage collection of timed out entries was also protected by the spinlock. + In order not to lock too long, region locking is introduced and a single + region is processed in one gc go. Also, the simple timer based gc running + is replaced with a workqueue based solution. The internal book-keeping + (number of elements, size of extensions) is moved to region level due to + the region locking. +- Adding elements: when the max number of the elements is reached, the gc + was called to evict the timed out entries. The new approach is that the gc + is called just for the matching region, assuming that if the region + (proportionally) seems to be full, then the whole set does. We could scan + the other regions to check every entry under rcu locking, but for huge + sets it'd mean a slowdown at adding elements. +- Listing the set header data: when the set was defined with timeout + support, the garbage collector was called to clean up timed out entries + to get the correct element numbers and set size values. Now the set is + scanned to check non-timed out entries, without actually calling the gc + for the whole set. + +Thanks to Florian Westphal for helping me to solve the SOFTIRQ-safe -> +SOFTIRQ-unsafe lock order issues during working on the patch. 
+ +Reported-by: syzbot+4b0e9d4ff3cf117837e5@syzkaller.appspotmail.com +Reported-by: syzbot+c27b8d5010f45c666ed1@syzkaller.appspotmail.com +Reported-by: syzbot+68a806795ac89df3aa1c@syzkaller.appspotmail.com +Fixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and protocol version 7") +Signed-off-by: Jozsef Kadlecsik +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/netfilter/ipset/ip_set.h | 11 + net/netfilter/ipset/ip_set_core.c | 34 + + net/netfilter/ipset/ip_set_hash_gen.h | 633 ++++++++++++++++++++++----------- + 3 files changed, 472 insertions(+), 206 deletions(-) + +--- a/include/linux/netfilter/ipset/ip_set.h ++++ b/include/linux/netfilter/ipset/ip_set.h +@@ -121,6 +121,7 @@ struct ip_set_ext { + u32 timeout; + u8 packets_op; + u8 bytes_op; ++ bool target; + }; + + struct ip_set; +@@ -187,6 +188,14 @@ struct ip_set_type_variant { + /* Return true if "b" set is the same as "a" + * according to the create set parameters */ + bool (*same_set)(const struct ip_set *a, const struct ip_set *b); ++ /* Region-locking is used */ ++ bool region_lock; ++}; ++ ++struct ip_set_region { ++ spinlock_t lock; /* Region lock */ ++ size_t ext_size; /* Size of the dynamic extensions */ ++ u32 elements; /* Number of elements vs timeout */ + }; + + /* The core set type structure */ +@@ -681,7 +690,7 @@ ip_set_init_skbinfo(struct ip_set_skbinf + } + + #define IP_SET_INIT_KEXT(skb, opt, set) \ +- { .bytes = (skb)->len, .packets = 1, \ ++ { .bytes = (skb)->len, .packets = 1, .target = true,\ + .timeout = ip_set_adt_opt_timeout(opt, set) } + + #define IP_SET_INIT_UEXT(set) \ +--- a/net/netfilter/ipset/ip_set_core.c ++++ b/net/netfilter/ipset/ip_set_core.c +@@ -557,6 +557,20 @@ ip_set_rcu_get(struct net *net, ip_set_i + return set; + } + ++static inline void ++ip_set_lock(struct ip_set *set) ++{ ++ if (!set->variant->region_lock) ++ spin_lock_bh(&set->lock); ++} ++ ++static inline void ++ip_set_unlock(struct ip_set *set) ++{ ++ if (!set->variant->region_lock) ++ 
spin_unlock_bh(&set->lock); ++} ++ + int + ip_set_test(ip_set_id_t index, const struct sk_buff *skb, + const struct xt_action_param *par, struct ip_set_adt_opt *opt) +@@ -578,9 +592,9 @@ ip_set_test(ip_set_id_t index, const str + if (ret == -EAGAIN) { + /* Type requests element to be completed */ + pr_debug("element must be completed, ADD is triggered\n"); +- spin_lock_bh(&set->lock); ++ ip_set_lock(set); + set->variant->kadt(set, skb, par, IPSET_ADD, opt); +- spin_unlock_bh(&set->lock); ++ ip_set_unlock(set); + ret = 1; + } else { + /* --return-nomatch: invert matched element */ +@@ -609,9 +623,9 @@ ip_set_add(ip_set_id_t index, const stru + !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) + return -IPSET_ERR_TYPE_MISMATCH; + +- spin_lock_bh(&set->lock); ++ ip_set_lock(set); + ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt); +- spin_unlock_bh(&set->lock); ++ ip_set_unlock(set); + + return ret; + } +@@ -631,9 +645,9 @@ ip_set_del(ip_set_id_t index, const stru + !(opt->family == set->family || set->family == NFPROTO_UNSPEC)) + return -IPSET_ERR_TYPE_MISMATCH; + +- spin_lock_bh(&set->lock); ++ ip_set_lock(set); + ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt); +- spin_unlock_bh(&set->lock); ++ ip_set_unlock(set); + + return ret; + } +@@ -1098,9 +1112,9 @@ ip_set_flush_set(struct ip_set *set) + { + pr_debug("set: %s\n", set->name); + +- spin_lock_bh(&set->lock); ++ ip_set_lock(set); + set->variant->flush(set); +- spin_unlock_bh(&set->lock); ++ ip_set_unlock(set); + } + + static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb, +@@ -1523,9 +1537,9 @@ call_ad(struct sock *ctnl, struct sk_buf + bool eexist = flags & IPSET_FLAG_EXIST, retried = false; + + do { +- spin_lock_bh(&set->lock); ++ ip_set_lock(set); + ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried); +- spin_unlock_bh(&set->lock); ++ ip_set_unlock(set); + retried = true; + } while (ret == -EAGAIN && + set->variant->resize && +--- 
a/net/netfilter/ipset/ip_set_hash_gen.h ++++ b/net/netfilter/ipset/ip_set_hash_gen.h +@@ -7,13 +7,21 @@ + #include + #include + #include ++#include + #include + +-#define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c) +-#define ipset_dereference_protected(p, set) \ +- __ipset_dereference_protected(p, lockdep_is_held(&(set)->lock)) +- +-#define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1) ++#define __ipset_dereference(p) \ ++ rcu_dereference_protected(p, 1) ++#define ipset_dereference_nfnl(p) \ ++ rcu_dereference_protected(p, \ ++ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) ++#define ipset_dereference_set(p, set) \ ++ rcu_dereference_protected(p, \ ++ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \ ++ lockdep_is_held(&(set)->lock)) ++#define ipset_dereference_bh_nfnl(p) \ ++ rcu_dereference_bh_check(p, \ ++ lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET)) + + /* Hashing which uses arrays to resolve clashing. The hash table is resized + * (doubled) when searching becomes too long. +@@ -72,11 +80,35 @@ struct hbucket { + __aligned(__alignof__(u64)); + }; + ++/* Region size for locking == 2^HTABLE_REGION_BITS */ ++#define HTABLE_REGION_BITS 10 ++#define ahash_numof_locks(htable_bits) \ ++ ((htable_bits) < HTABLE_REGION_BITS ? 1 \ ++ : jhash_size((htable_bits) - HTABLE_REGION_BITS)) ++#define ahash_sizeof_regions(htable_bits) \ ++ (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region)) ++#define ahash_region(n, htable_bits) \ ++ ((n) % ahash_numof_locks(htable_bits)) ++#define ahash_bucket_start(h, htable_bits) \ ++ ((htable_bits) < HTABLE_REGION_BITS ? 0 \ ++ : (h) * jhash_size(HTABLE_REGION_BITS)) ++#define ahash_bucket_end(h, htable_bits) \ ++ ((htable_bits) < HTABLE_REGION_BITS ? 
jhash_size(htable_bits) \ ++ : ((h) + 1) * jhash_size(HTABLE_REGION_BITS)) ++ ++struct htable_gc { ++ struct delayed_work dwork; ++ struct ip_set *set; /* Set the gc belongs to */ ++ u32 region; /* Last gc run position */ ++}; ++ + /* The hash table: the table size stored here in order to make resizing easy */ + struct htable { + atomic_t ref; /* References for resizing */ +- atomic_t uref; /* References for dumping */ ++ atomic_t uref; /* References for dumping and gc */ + u8 htable_bits; /* size of hash table == 2^htable_bits */ ++ u32 maxelem; /* Maxelem per region */ ++ struct ip_set_region *hregion; /* Region locks and ext sizes */ + struct hbucket __rcu *bucket[0]; /* hashtable buckets */ + }; + +@@ -162,6 +194,10 @@ htable_bits(u32 hashsize) + #define NLEN 0 + #endif /* IP_SET_HASH_WITH_NETS */ + ++#define SET_ELEM_EXPIRED(set, d) \ ++ (SET_WITH_TIMEOUT(set) && \ ++ ip_set_timeout_expired(ext_timeout(d, set))) ++ + #endif /* _IP_SET_HASH_GEN_H */ + + #ifndef MTYPE +@@ -205,10 +241,12 @@ htable_bits(u32 hashsize) + #undef mtype_test_cidrs + #undef mtype_test + #undef mtype_uref +-#undef mtype_expire + #undef mtype_resize ++#undef mtype_ext_size ++#undef mtype_resize_ad + #undef mtype_head + #undef mtype_list ++#undef mtype_gc_do + #undef mtype_gc + #undef mtype_gc_init + #undef mtype_variant +@@ -247,10 +285,12 @@ htable_bits(u32 hashsize) + #define mtype_test_cidrs IPSET_TOKEN(MTYPE, _test_cidrs) + #define mtype_test IPSET_TOKEN(MTYPE, _test) + #define mtype_uref IPSET_TOKEN(MTYPE, _uref) +-#define mtype_expire IPSET_TOKEN(MTYPE, _expire) + #define mtype_resize IPSET_TOKEN(MTYPE, _resize) ++#define mtype_ext_size IPSET_TOKEN(MTYPE, _ext_size) ++#define mtype_resize_ad IPSET_TOKEN(MTYPE, _resize_ad) + #define mtype_head IPSET_TOKEN(MTYPE, _head) + #define mtype_list IPSET_TOKEN(MTYPE, _list) ++#define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do) + #define mtype_gc IPSET_TOKEN(MTYPE, _gc) + #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) + #define mtype_variant 
IPSET_TOKEN(MTYPE, _variant) +@@ -275,8 +315,7 @@ htable_bits(u32 hashsize) + /* The generic hash structure */ + struct htype { + struct htable __rcu *table; /* the hash table */ +- struct timer_list gc; /* garbage collection when timeout enabled */ +- struct ip_set *set; /* attached to this ip_set */ ++ struct htable_gc gc; /* gc workqueue */ + u32 maxelem; /* max elements in the hash */ + u32 initval; /* random jhash init value */ + #ifdef IP_SET_HASH_WITH_MARKMASK +@@ -288,21 +327,33 @@ struct htype { + #ifdef IP_SET_HASH_WITH_NETMASK + u8 netmask; /* netmask value for subnets to store */ + #endif ++ struct list_head ad; /* Resize add|del backlist */ + struct mtype_elem next; /* temporary storage for uadd */ + #ifdef IP_SET_HASH_WITH_NETS + struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */ + #endif + }; + ++/* ADD|DEL entries saved during resize */ ++struct mtype_resize_ad { ++ struct list_head list; ++ enum ipset_adt ad; /* ADD|DEL element */ ++ struct mtype_elem d; /* Element value */ ++ struct ip_set_ext ext; /* Extensions for ADD */ ++ struct ip_set_ext mext; /* Target extensions for ADD */ ++ u32 flags; /* Flags for ADD */ ++}; ++ + #ifdef IP_SET_HASH_WITH_NETS + /* Network cidr size book keeping when the hash stores different + * sized networks. cidr == real cidr + 1 to support /0. 
+ */ + static void +-mtype_add_cidr(struct htype *h, u8 cidr, u8 n) ++mtype_add_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n) + { + int i, j; + ++ spin_lock_bh(&set->lock); + /* Add in increasing prefix order, so larger cidr first */ + for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) { + if (j != -1) { +@@ -311,7 +362,7 @@ mtype_add_cidr(struct htype *h, u8 cidr, + j = i; + } else if (h->nets[i].cidr[n] == cidr) { + h->nets[CIDR_POS(cidr)].nets[n]++; +- return; ++ goto unlock; + } + } + if (j != -1) { +@@ -320,24 +371,29 @@ mtype_add_cidr(struct htype *h, u8 cidr, + } + h->nets[i].cidr[n] = cidr; + h->nets[CIDR_POS(cidr)].nets[n] = 1; ++unlock: ++ spin_unlock_bh(&set->lock); + } + + static void +-mtype_del_cidr(struct htype *h, u8 cidr, u8 n) ++mtype_del_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n) + { + u8 i, j, net_end = NLEN - 1; + ++ spin_lock_bh(&set->lock); + for (i = 0; i < NLEN; i++) { + if (h->nets[i].cidr[n] != cidr) + continue; + h->nets[CIDR_POS(cidr)].nets[n]--; + if (h->nets[CIDR_POS(cidr)].nets[n] > 0) +- return; ++ goto unlock; + for (j = i; j < net_end && h->nets[j].cidr[n]; j++) + h->nets[j].cidr[n] = h->nets[j + 1].cidr[n]; + h->nets[j].cidr[n] = 0; +- return; ++ goto unlock; + } ++unlock: ++ spin_unlock_bh(&set->lock); + } + #endif + +@@ -345,7 +401,7 @@ mtype_del_cidr(struct htype *h, u8 cidr, + static size_t + mtype_ahash_memsize(const struct htype *h, const struct htable *t) + { +- return sizeof(*h) + sizeof(*t); ++ return sizeof(*h) + sizeof(*t) + ahash_sizeof_regions(t->htable_bits); + } + + /* Get the ith element from the array block n */ +@@ -369,24 +425,29 @@ mtype_flush(struct ip_set *set) + struct htype *h = set->data; + struct htable *t; + struct hbucket *n; +- u32 i; ++ u32 r, i; + +- t = ipset_dereference_protected(h->table, set); +- for (i = 0; i < jhash_size(t->htable_bits); i++) { +- n = __ipset_dereference_protected(hbucket(t, i), 1); +- if (!n) +- continue; +- if (set->extensions & 
IPSET_EXT_DESTROY) +- mtype_ext_cleanup(set, n); +- /* FIXME: use slab cache */ +- rcu_assign_pointer(hbucket(t, i), NULL); +- kfree_rcu(n, rcu); ++ t = ipset_dereference_nfnl(h->table); ++ for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) { ++ spin_lock_bh(&t->hregion[r].lock); ++ for (i = ahash_bucket_start(r, t->htable_bits); ++ i < ahash_bucket_end(r, t->htable_bits); i++) { ++ n = __ipset_dereference(hbucket(t, i)); ++ if (!n) ++ continue; ++ if (set->extensions & IPSET_EXT_DESTROY) ++ mtype_ext_cleanup(set, n); ++ /* FIXME: use slab cache */ ++ rcu_assign_pointer(hbucket(t, i), NULL); ++ kfree_rcu(n, rcu); ++ } ++ t->hregion[r].ext_size = 0; ++ t->hregion[r].elements = 0; ++ spin_unlock_bh(&t->hregion[r].lock); + } + #ifdef IP_SET_HASH_WITH_NETS + memset(h->nets, 0, sizeof(h->nets)); + #endif +- set->elements = 0; +- set->ext_size = 0; + } + + /* Destroy the hashtable part of the set */ +@@ -397,7 +458,7 @@ mtype_ahash_destroy(struct ip_set *set, + u32 i; + + for (i = 0; i < jhash_size(t->htable_bits); i++) { +- n = __ipset_dereference_protected(hbucket(t, i), 1); ++ n = __ipset_dereference(hbucket(t, i)); + if (!n) + continue; + if (set->extensions & IPSET_EXT_DESTROY && ext_destroy) +@@ -406,6 +467,7 @@ mtype_ahash_destroy(struct ip_set *set, + kfree(n); + } + ++ ip_set_free(t->hregion); + ip_set_free(t); + } + +@@ -414,28 +476,21 @@ static void + mtype_destroy(struct ip_set *set) + { + struct htype *h = set->data; ++ struct list_head *l, *lt; + + if (SET_WITH_TIMEOUT(set)) +- del_timer_sync(&h->gc); ++ cancel_delayed_work_sync(&h->gc.dwork); + +- mtype_ahash_destroy(set, +- __ipset_dereference_protected(h->table, 1), true); ++ mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true); ++ list_for_each_safe(l, lt, &h->ad) { ++ list_del(l); ++ kfree(l); ++ } + kfree(h); + + set->data = NULL; + } + +-static void +-mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) +-{ +- struct htype *h = set->data; +- +- timer_setup(&h->gc, gc, 
0); +- mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ); +- pr_debug("gc initialized, run in every %u\n", +- IPSET_GC_PERIOD(set->timeout)); +-} +- + static bool + mtype_same_set(const struct ip_set *a, const struct ip_set *b) + { +@@ -454,11 +509,9 @@ mtype_same_set(const struct ip_set *a, c + a->extensions == b->extensions; + } + +-/* Delete expired elements from the hashtable */ + static void +-mtype_expire(struct ip_set *set, struct htype *h) ++mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r) + { +- struct htable *t; + struct hbucket *n, *tmp; + struct mtype_elem *data; + u32 i, j, d; +@@ -466,10 +519,12 @@ mtype_expire(struct ip_set *set, struct + #ifdef IP_SET_HASH_WITH_NETS + u8 k; + #endif ++ u8 htable_bits = t->htable_bits; + +- t = ipset_dereference_protected(h->table, set); +- for (i = 0; i < jhash_size(t->htable_bits); i++) { +- n = __ipset_dereference_protected(hbucket(t, i), 1); ++ spin_lock_bh(&t->hregion[r].lock); ++ for (i = ahash_bucket_start(r, htable_bits); ++ i < ahash_bucket_end(r, htable_bits); i++) { ++ n = __ipset_dereference(hbucket(t, i)); + if (!n) + continue; + for (j = 0, d = 0; j < n->pos; j++) { +@@ -485,58 +540,100 @@ mtype_expire(struct ip_set *set, struct + smp_mb__after_atomic(); + #ifdef IP_SET_HASH_WITH_NETS + for (k = 0; k < IPSET_NET_COUNT; k++) +- mtype_del_cidr(h, ++ mtype_del_cidr(set, h, + NCIDR_PUT(DCIDR_GET(data->cidr, k)), + k); + #endif ++ t->hregion[r].elements--; + ip_set_ext_destroy(set, data); +- set->elements--; + d++; + } + if (d >= AHASH_INIT_SIZE) { + if (d >= n->size) { ++ t->hregion[r].ext_size -= ++ ext_size(n->size, dsize); + rcu_assign_pointer(hbucket(t, i), NULL); + kfree_rcu(n, rcu); + continue; + } + tmp = kzalloc(sizeof(*tmp) + +- (n->size - AHASH_INIT_SIZE) * dsize, +- GFP_ATOMIC); ++ (n->size - AHASH_INIT_SIZE) * dsize, ++ GFP_ATOMIC); + if (!tmp) +- /* Still try to delete expired elements */ ++ /* Still try to delete expired elements. 
*/ + continue; + tmp->size = n->size - AHASH_INIT_SIZE; + for (j = 0, d = 0; j < n->pos; j++) { + if (!test_bit(j, n->used)) + continue; + data = ahash_data(n, j, dsize); +- memcpy(tmp->value + d * dsize, data, dsize); ++ memcpy(tmp->value + d * dsize, ++ data, dsize); + set_bit(d, tmp->used); + d++; + } + tmp->pos = d; +- set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize); ++ t->hregion[r].ext_size -= ++ ext_size(AHASH_INIT_SIZE, dsize); + rcu_assign_pointer(hbucket(t, i), tmp); + kfree_rcu(n, rcu); + } + } ++ spin_unlock_bh(&t->hregion[r].lock); + } + + static void +-mtype_gc(struct timer_list *t) ++mtype_gc(struct work_struct *work) + { +- struct htype *h = from_timer(h, t, gc); +- struct ip_set *set = h->set; ++ struct htable_gc *gc; ++ struct ip_set *set; ++ struct htype *h; ++ struct htable *t; ++ u32 r, numof_locks; ++ unsigned int next_run; ++ ++ gc = container_of(work, struct htable_gc, dwork.work); ++ set = gc->set; ++ h = set->data; + +- pr_debug("called\n"); + spin_lock_bh(&set->lock); +- mtype_expire(set, h); ++ t = ipset_dereference_set(h->table, set); ++ atomic_inc(&t->uref); ++ numof_locks = ahash_numof_locks(t->htable_bits); ++ r = gc->region++; ++ if (r >= numof_locks) { ++ r = gc->region = 0; ++ } ++ next_run = (IPSET_GC_PERIOD(set->timeout) * HZ) / numof_locks; ++ if (next_run < HZ/10) ++ next_run = HZ/10; + spin_unlock_bh(&set->lock); + +- h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ; +- add_timer(&h->gc); ++ mtype_gc_do(set, h, t, r); ++ ++ if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { ++ pr_debug("Table destroy after resize by expire: %p\n", t); ++ mtype_ahash_destroy(set, t, false); ++ } ++ ++ queue_delayed_work(system_power_efficient_wq, &gc->dwork, next_run); ++ + } + ++static void ++mtype_gc_init(struct htable_gc *gc) ++{ ++ INIT_DEFERRABLE_WORK(&gc->dwork, mtype_gc); ++ queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ); ++} ++ ++static int ++mtype_add(struct ip_set *set, void *value, const struct 
ip_set_ext *ext, ++ struct ip_set_ext *mext, u32 flags); ++static int ++mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext, ++ struct ip_set_ext *mext, u32 flags); ++ + /* Resize a hash: create a new hash table with doubling the hashsize + * and inserting the elements to it. Repeat until we succeed or + * fail due to memory pressures. +@@ -547,7 +644,7 @@ mtype_resize(struct ip_set *set, bool re + struct htype *h = set->data; + struct htable *t, *orig; + u8 htable_bits; +- size_t extsize, dsize = set->dsize; ++ size_t dsize = set->dsize; + #ifdef IP_SET_HASH_WITH_NETS + u8 flags; + struct mtype_elem *tmp; +@@ -555,7 +652,9 @@ mtype_resize(struct ip_set *set, bool re + struct mtype_elem *data; + struct mtype_elem *d; + struct hbucket *n, *m; +- u32 i, j, key; ++ struct list_head *l, *lt; ++ struct mtype_resize_ad *x; ++ u32 i, j, r, nr, key; + int ret; + + #ifdef IP_SET_HASH_WITH_NETS +@@ -563,10 +662,8 @@ mtype_resize(struct ip_set *set, bool re + if (!tmp) + return -ENOMEM; + #endif +- rcu_read_lock_bh(); +- orig = rcu_dereference_bh_nfnl(h->table); ++ orig = ipset_dereference_bh_nfnl(h->table); + htable_bits = orig->htable_bits; +- rcu_read_unlock_bh(); + + retry: + ret = 0; +@@ -583,88 +680,124 @@ retry: + ret = -ENOMEM; + goto out; + } ++ t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits)); ++ if (!t->hregion) { ++ kfree(t); ++ ret = -ENOMEM; ++ goto out; ++ } + t->htable_bits = htable_bits; ++ t->maxelem = h->maxelem / ahash_numof_locks(htable_bits); ++ for (i = 0; i < ahash_numof_locks(htable_bits); i++) ++ spin_lock_init(&t->hregion[i].lock); + +- spin_lock_bh(&set->lock); +- orig = __ipset_dereference_protected(h->table, 1); +- /* There can't be another parallel resizing, but dumping is possible */ ++ /* There can't be another parallel resizing, ++ * but dumping, gc, kernel side add/del are possible ++ */ ++ orig = ipset_dereference_bh_nfnl(h->table); + atomic_set(&orig->ref, 1); + atomic_inc(&orig->uref); +- extsize = 0; + 
pr_debug("attempt to resize set %s from %u to %u, t %p\n", + set->name, orig->htable_bits, htable_bits, orig); +- for (i = 0; i < jhash_size(orig->htable_bits); i++) { +- n = __ipset_dereference_protected(hbucket(orig, i), 1); +- if (!n) +- continue; +- for (j = 0; j < n->pos; j++) { +- if (!test_bit(j, n->used)) ++ for (r = 0; r < ahash_numof_locks(orig->htable_bits); r++) { ++ /* Expire may replace a hbucket with another one */ ++ rcu_read_lock_bh(); ++ for (i = ahash_bucket_start(r, orig->htable_bits); ++ i < ahash_bucket_end(r, orig->htable_bits); i++) { ++ n = __ipset_dereference(hbucket(orig, i)); ++ if (!n) + continue; +- data = ahash_data(n, j, dsize); ++ for (j = 0; j < n->pos; j++) { ++ if (!test_bit(j, n->used)) ++ continue; ++ data = ahash_data(n, j, dsize); ++ if (SET_ELEM_EXPIRED(set, data)) ++ continue; + #ifdef IP_SET_HASH_WITH_NETS +- /* We have readers running parallel with us, +- * so the live data cannot be modified. +- */ +- flags = 0; +- memcpy(tmp, data, dsize); +- data = tmp; +- mtype_data_reset_flags(data, &flags); +-#endif +- key = HKEY(data, h->initval, htable_bits); +- m = __ipset_dereference_protected(hbucket(t, key), 1); +- if (!m) { +- m = kzalloc(sizeof(*m) + ++ /* We have readers running parallel with us, ++ * so the live data cannot be modified. 
++ */ ++ flags = 0; ++ memcpy(tmp, data, dsize); ++ data = tmp; ++ mtype_data_reset_flags(data, &flags); ++#endif ++ key = HKEY(data, h->initval, htable_bits); ++ m = __ipset_dereference(hbucket(t, key)); ++ nr = ahash_region(key, htable_bits); ++ if (!m) { ++ m = kzalloc(sizeof(*m) + + AHASH_INIT_SIZE * dsize, + GFP_ATOMIC); +- if (!m) { +- ret = -ENOMEM; +- goto cleanup; +- } +- m->size = AHASH_INIT_SIZE; +- extsize += ext_size(AHASH_INIT_SIZE, dsize); +- RCU_INIT_POINTER(hbucket(t, key), m); +- } else if (m->pos >= m->size) { +- struct hbucket *ht; +- +- if (m->size >= AHASH_MAX(h)) { +- ret = -EAGAIN; +- } else { +- ht = kzalloc(sizeof(*ht) + ++ if (!m) { ++ ret = -ENOMEM; ++ goto cleanup; ++ } ++ m->size = AHASH_INIT_SIZE; ++ t->hregion[nr].ext_size += ++ ext_size(AHASH_INIT_SIZE, ++ dsize); ++ RCU_INIT_POINTER(hbucket(t, key), m); ++ } else if (m->pos >= m->size) { ++ struct hbucket *ht; ++ ++ if (m->size >= AHASH_MAX(h)) { ++ ret = -EAGAIN; ++ } else { ++ ht = kzalloc(sizeof(*ht) + + (m->size + AHASH_INIT_SIZE) + * dsize, + GFP_ATOMIC); +- if (!ht) +- ret = -ENOMEM; ++ if (!ht) ++ ret = -ENOMEM; ++ } ++ if (ret < 0) ++ goto cleanup; ++ memcpy(ht, m, sizeof(struct hbucket) + ++ m->size * dsize); ++ ht->size = m->size + AHASH_INIT_SIZE; ++ t->hregion[nr].ext_size += ++ ext_size(AHASH_INIT_SIZE, ++ dsize); ++ kfree(m); ++ m = ht; ++ RCU_INIT_POINTER(hbucket(t, key), ht); + } +- if (ret < 0) +- goto cleanup; +- memcpy(ht, m, sizeof(struct hbucket) + +- m->size * dsize); +- ht->size = m->size + AHASH_INIT_SIZE; +- extsize += ext_size(AHASH_INIT_SIZE, dsize); +- kfree(m); +- m = ht; +- RCU_INIT_POINTER(hbucket(t, key), ht); +- } +- d = ahash_data(m, m->pos, dsize); +- memcpy(d, data, dsize); +- set_bit(m->pos++, m->used); ++ d = ahash_data(m, m->pos, dsize); ++ memcpy(d, data, dsize); ++ set_bit(m->pos++, m->used); ++ t->hregion[nr].elements++; + #ifdef IP_SET_HASH_WITH_NETS +- mtype_data_reset_flags(d, &flags); ++ mtype_data_reset_flags(d, &flags); + #endif ++ } 
+ } ++ rcu_read_unlock_bh(); + } +- rcu_assign_pointer(h->table, t); +- set->ext_size = extsize; + +- spin_unlock_bh(&set->lock); ++ /* There can't be any other writer. */ ++ rcu_assign_pointer(h->table, t); + + /* Give time to other readers of the set */ + synchronize_rcu(); + + pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name, + orig->htable_bits, orig, t->htable_bits, t); +- /* If there's nobody else dumping the table, destroy it */ ++ /* Add/delete elements processed by the SET target during resize. ++ * Kernel-side add cannot trigger a resize and userspace actions ++ * are serialized by the mutex. ++ */ ++ list_for_each_safe(l, lt, &h->ad) { ++ x = list_entry(l, struct mtype_resize_ad, list); ++ if (x->ad == IPSET_ADD) { ++ mtype_add(set, &x->d, &x->ext, &x->mext, x->flags); ++ } else { ++ mtype_del(set, &x->d, NULL, NULL, 0); ++ } ++ list_del(l); ++ kfree(l); ++ } ++ /* If there's nobody else using the table, destroy it */ + if (atomic_dec_and_test(&orig->uref)) { + pr_debug("Table destroy by resize %p\n", orig); + mtype_ahash_destroy(set, orig, false); +@@ -677,15 +810,44 @@ out: + return ret; + + cleanup: ++ rcu_read_unlock_bh(); + atomic_set(&orig->ref, 0); + atomic_dec(&orig->uref); +- spin_unlock_bh(&set->lock); + mtype_ahash_destroy(set, t, false); + if (ret == -EAGAIN) + goto retry; + goto out; + } + ++/* Get the current number of elements and ext_size in the set */ ++static void ++mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size) ++{ ++ struct htype *h = set->data; ++ const struct htable *t; ++ u32 i, j, r; ++ struct hbucket *n; ++ struct mtype_elem *data; ++ ++ t = rcu_dereference_bh(h->table); ++ for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) { ++ for (i = ahash_bucket_start(r, t->htable_bits); ++ i < ahash_bucket_end(r, t->htable_bits); i++) { ++ n = rcu_dereference_bh(hbucket(t, i)); ++ if (!n) ++ continue; ++ for (j = 0; j < n->pos; j++) { ++ if (!test_bit(j, n->used)) ++ continue; ++ data = ahash_data(n, 
j, set->dsize); ++ if (!SET_ELEM_EXPIRED(set, data)) ++ (*elements)++; ++ } ++ } ++ *ext_size += t->hregion[r].ext_size; ++ } ++} ++ + /* Add an element to a hash and update the internal counters when succeeded, + * otherwise report the proper error code. + */ +@@ -698,32 +860,49 @@ mtype_add(struct ip_set *set, void *valu + const struct mtype_elem *d = value; + struct mtype_elem *data; + struct hbucket *n, *old = ERR_PTR(-ENOENT); +- int i, j = -1; ++ int i, j = -1, ret; + bool flag_exist = flags & IPSET_FLAG_EXIST; + bool deleted = false, forceadd = false, reuse = false; +- u32 key, multi = 0; ++ u32 r, key, multi = 0, elements, maxelem; + +- if (set->elements >= h->maxelem) { +- if (SET_WITH_TIMEOUT(set)) +- /* FIXME: when set is full, we slow down here */ +- mtype_expire(set, h); +- if (set->elements >= h->maxelem && SET_WITH_FORCEADD(set)) ++ rcu_read_lock_bh(); ++ t = rcu_dereference_bh(h->table); ++ key = HKEY(value, h->initval, t->htable_bits); ++ r = ahash_region(key, t->htable_bits); ++ atomic_inc(&t->uref); ++ elements = t->hregion[r].elements; ++ maxelem = t->maxelem; ++ if (elements >= maxelem) { ++ u32 e; ++ if (SET_WITH_TIMEOUT(set)) { ++ rcu_read_unlock_bh(); ++ mtype_gc_do(set, h, t, r); ++ rcu_read_lock_bh(); ++ } ++ maxelem = h->maxelem; ++ elements = 0; ++ for (e = 0; e < ahash_numof_locks(t->htable_bits); e++) ++ elements += t->hregion[e].elements; ++ if (elements >= maxelem && SET_WITH_FORCEADD(set)) + forceadd = true; + } ++ rcu_read_unlock_bh(); + +- t = ipset_dereference_protected(h->table, set); +- key = HKEY(value, h->initval, t->htable_bits); +- n = __ipset_dereference_protected(hbucket(t, key), 1); ++ spin_lock_bh(&t->hregion[r].lock); ++ n = rcu_dereference_bh(hbucket(t, key)); + if (!n) { +- if (forceadd || set->elements >= h->maxelem) ++ if (forceadd || elements >= maxelem) + goto set_full; + old = NULL; + n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize, + GFP_ATOMIC); +- if (!n) +- return -ENOMEM; ++ if (!n) { ++ ret = 
-ENOMEM; ++ goto unlock; ++ } + n->size = AHASH_INIT_SIZE; +- set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize); ++ t->hregion[r].ext_size += ++ ext_size(AHASH_INIT_SIZE, set->dsize); + goto copy_elem; + } + for (i = 0; i < n->pos; i++) { +@@ -737,19 +916,16 @@ mtype_add(struct ip_set *set, void *valu + } + data = ahash_data(n, i, set->dsize); + if (mtype_data_equal(data, d, &multi)) { +- if (flag_exist || +- (SET_WITH_TIMEOUT(set) && +- ip_set_timeout_expired(ext_timeout(data, set)))) { ++ if (flag_exist || SET_ELEM_EXPIRED(set, data)) { + /* Just the extensions could be overwritten */ + j = i; + goto overwrite_extensions; + } +- return -IPSET_ERR_EXIST; ++ ret = -IPSET_ERR_EXIST; ++ goto unlock; + } + /* Reuse first timed out entry */ +- if (SET_WITH_TIMEOUT(set) && +- ip_set_timeout_expired(ext_timeout(data, set)) && +- j == -1) { ++ if (SET_ELEM_EXPIRED(set, data) && j == -1) { + j = i; + reuse = true; + } +@@ -759,16 +935,16 @@ mtype_add(struct ip_set *set, void *valu + if (!deleted) { + #ifdef IP_SET_HASH_WITH_NETS + for (i = 0; i < IPSET_NET_COUNT; i++) +- mtype_del_cidr(h, ++ mtype_del_cidr(set, h, + NCIDR_PUT(DCIDR_GET(data->cidr, i)), + i); + #endif + ip_set_ext_destroy(set, data); +- set->elements--; ++ t->hregion[r].elements--; + } + goto copy_data; + } +- if (set->elements >= h->maxelem) ++ if (elements >= maxelem) + goto set_full; + /* Create a new slot */ + if (n->pos >= n->size) { +@@ -776,28 +952,32 @@ mtype_add(struct ip_set *set, void *valu + if (n->size >= AHASH_MAX(h)) { + /* Trigger rehashing */ + mtype_data_next(&h->next, d); +- return -EAGAIN; ++ ret = -EAGAIN; ++ goto resize; + } + old = n; + n = kzalloc(sizeof(*n) + + (old->size + AHASH_INIT_SIZE) * set->dsize, + GFP_ATOMIC); +- if (!n) +- return -ENOMEM; ++ if (!n) { ++ ret = -ENOMEM; ++ goto unlock; ++ } + memcpy(n, old, sizeof(struct hbucket) + + old->size * set->dsize); + n->size = old->size + AHASH_INIT_SIZE; +- set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize); ++ 
t->hregion[r].ext_size += ++ ext_size(AHASH_INIT_SIZE, set->dsize); + } + + copy_elem: + j = n->pos++; + data = ahash_data(n, j, set->dsize); + copy_data: +- set->elements++; ++ t->hregion[r].elements++; + #ifdef IP_SET_HASH_WITH_NETS + for (i = 0; i < IPSET_NET_COUNT; i++) +- mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i); ++ mtype_add_cidr(set, h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i); + #endif + memcpy(data, d, sizeof(struct mtype_elem)); + overwrite_extensions: +@@ -820,13 +1000,41 @@ overwrite_extensions: + if (old) + kfree_rcu(old, rcu); + } ++ ret = 0; ++resize: ++ spin_unlock_bh(&t->hregion[r].lock); ++ if (atomic_read(&t->ref) && ext->target) { ++ /* Resize is in process and kernel side add, save values */ ++ struct mtype_resize_ad *x; ++ ++ x = kzalloc(sizeof(struct mtype_resize_ad), GFP_ATOMIC); ++ if (!x) ++ /* Don't bother */ ++ goto out; ++ x->ad = IPSET_ADD; ++ memcpy(&x->d, value, sizeof(struct mtype_elem)); ++ memcpy(&x->ext, ext, sizeof(struct ip_set_ext)); ++ memcpy(&x->mext, mext, sizeof(struct ip_set_ext)); ++ x->flags = flags; ++ spin_lock_bh(&set->lock); ++ list_add_tail(&x->list, &h->ad); ++ spin_unlock_bh(&set->lock); ++ } ++ goto out; + +- return 0; + set_full: + if (net_ratelimit()) + pr_warn("Set %s is full, maxelem %u reached\n", +- set->name, h->maxelem); +- return -IPSET_ERR_HASH_FULL; ++ set->name, maxelem); ++ ret = -IPSET_ERR_HASH_FULL; ++unlock: ++ spin_unlock_bh(&t->hregion[r].lock); ++out: ++ if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { ++ pr_debug("Table destroy after resize by add: %p\n", t); ++ mtype_ahash_destroy(set, t, false); ++ } ++ return ret; + } + + /* Delete an element from the hash and free up space if possible. 
+@@ -840,13 +1048,23 @@ mtype_del(struct ip_set *set, void *valu + const struct mtype_elem *d = value; + struct mtype_elem *data; + struct hbucket *n; +- int i, j, k, ret = -IPSET_ERR_EXIST; ++ struct mtype_resize_ad *x = NULL; ++ int i, j, k, r, ret = -IPSET_ERR_EXIST; + u32 key, multi = 0; + size_t dsize = set->dsize; + +- t = ipset_dereference_protected(h->table, set); ++ /* Userspace add and resize is excluded by the mutex. ++ * Kernespace add does not trigger resize. ++ */ ++ rcu_read_lock_bh(); ++ t = rcu_dereference_bh(h->table); + key = HKEY(value, h->initval, t->htable_bits); +- n = __ipset_dereference_protected(hbucket(t, key), 1); ++ r = ahash_region(key, t->htable_bits); ++ atomic_inc(&t->uref); ++ rcu_read_unlock_bh(); ++ ++ spin_lock_bh(&t->hregion[r].lock); ++ n = rcu_dereference_bh(hbucket(t, key)); + if (!n) + goto out; + for (i = 0, k = 0; i < n->pos; i++) { +@@ -857,8 +1075,7 @@ mtype_del(struct ip_set *set, void *valu + data = ahash_data(n, i, dsize); + if (!mtype_data_equal(data, d, &multi)) + continue; +- if (SET_WITH_TIMEOUT(set) && +- ip_set_timeout_expired(ext_timeout(data, set))) ++ if (SET_ELEM_EXPIRED(set, data)) + goto out; + + ret = 0; +@@ -866,20 +1083,33 @@ mtype_del(struct ip_set *set, void *valu + smp_mb__after_atomic(); + if (i + 1 == n->pos) + n->pos--; +- set->elements--; ++ t->hregion[r].elements--; + #ifdef IP_SET_HASH_WITH_NETS + for (j = 0; j < IPSET_NET_COUNT; j++) +- mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)), +- j); ++ mtype_del_cidr(set, h, ++ NCIDR_PUT(DCIDR_GET(d->cidr, j)), j); + #endif + ip_set_ext_destroy(set, data); + ++ if (atomic_read(&t->ref) && ext->target) { ++ /* Resize is in process and kernel side del, ++ * save values ++ */ ++ x = kzalloc(sizeof(struct mtype_resize_ad), ++ GFP_ATOMIC); ++ if (x) { ++ x->ad = IPSET_DEL; ++ memcpy(&x->d, value, ++ sizeof(struct mtype_elem)); ++ x->flags = flags; ++ } ++ } + for (; i < n->pos; i++) { + if (!test_bit(i, n->used)) + k++; + } + if (n->pos == 0 && k == 0) 
{ +- set->ext_size -= ext_size(n->size, dsize); ++ t->hregion[r].ext_size -= ext_size(n->size, dsize); + rcu_assign_pointer(hbucket(t, key), NULL); + kfree_rcu(n, rcu); + } else if (k >= AHASH_INIT_SIZE) { +@@ -898,7 +1128,8 @@ mtype_del(struct ip_set *set, void *valu + k++; + } + tmp->pos = k; +- set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize); ++ t->hregion[r].ext_size -= ++ ext_size(AHASH_INIT_SIZE, dsize); + rcu_assign_pointer(hbucket(t, key), tmp); + kfree_rcu(n, rcu); + } +@@ -906,6 +1137,16 @@ mtype_del(struct ip_set *set, void *valu + } + + out: ++ spin_unlock_bh(&t->hregion[r].lock); ++ if (x) { ++ spin_lock_bh(&set->lock); ++ list_add(&x->list, &h->ad); ++ spin_unlock_bh(&set->lock); ++ } ++ if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { ++ pr_debug("Table destroy after resize by del: %p\n", t); ++ mtype_ahash_destroy(set, t, false); ++ } + return ret; + } + +@@ -991,6 +1232,7 @@ mtype_test(struct ip_set *set, void *val + int i, ret = 0; + u32 key, multi = 0; + ++ rcu_read_lock_bh(); + t = rcu_dereference_bh(h->table); + #ifdef IP_SET_HASH_WITH_NETS + /* If we test an IP address and not a network address, +@@ -1022,6 +1264,7 @@ mtype_test(struct ip_set *set, void *val + goto out; + } + out: ++ rcu_read_unlock_bh(); + return ret; + } + +@@ -1033,23 +1276,14 @@ mtype_head(struct ip_set *set, struct sk + const struct htable *t; + struct nlattr *nested; + size_t memsize; ++ u32 elements = 0; ++ size_t ext_size = 0; + u8 htable_bits; + +- /* If any members have expired, set->elements will be wrong +- * mytype_expire function will update it with the right count. +- * we do not hold set->lock here, so grab it first. +- * set->elements can still be incorrect in the case of a huge set, +- * because elements might time out during the listing. 
+- */ +- if (SET_WITH_TIMEOUT(set)) { +- spin_lock_bh(&set->lock); +- mtype_expire(set, h); +- spin_unlock_bh(&set->lock); +- } +- + rcu_read_lock_bh(); +- t = rcu_dereference_bh_nfnl(h->table); +- memsize = mtype_ahash_memsize(h, t) + set->ext_size; ++ t = rcu_dereference_bh(h->table); ++ mtype_ext_size(set, &elements, &ext_size); ++ memsize = mtype_ahash_memsize(h, t) + ext_size + set->ext_size; + htable_bits = t->htable_bits; + rcu_read_unlock_bh(); + +@@ -1071,7 +1305,7 @@ mtype_head(struct ip_set *set, struct sk + #endif + if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) || + nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) || +- nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements))) ++ nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements))) + goto nla_put_failure; + if (unlikely(ip_set_put_flags(skb, set))) + goto nla_put_failure; +@@ -1091,15 +1325,15 @@ mtype_uref(struct ip_set *set, struct ne + + if (start) { + rcu_read_lock_bh(); +- t = rcu_dereference_bh_nfnl(h->table); ++ t = ipset_dereference_bh_nfnl(h->table); + atomic_inc(&t->uref); + cb->args[IPSET_CB_PRIVATE] = (unsigned long)t; + rcu_read_unlock_bh(); + } else if (cb->args[IPSET_CB_PRIVATE]) { + t = (struct htable *)cb->args[IPSET_CB_PRIVATE]; + if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) { +- /* Resizing didn't destroy the hash table */ +- pr_debug("Table destroy by dump: %p\n", t); ++ pr_debug("Table destroy after resize " ++ " by dump: %p\n", t); + mtype_ahash_destroy(set, t, false); + } + cb->args[IPSET_CB_PRIVATE] = 0; +@@ -1141,8 +1375,7 @@ mtype_list(const struct ip_set *set, + if (!test_bit(i, n->used)) + continue; + e = ahash_data(n, i, set->dsize); +- if (SET_WITH_TIMEOUT(set) && +- ip_set_timeout_expired(ext_timeout(e, set))) ++ if (SET_ELEM_EXPIRED(set, e)) + continue; + pr_debug("list hash %lu hbucket %p i %u, data %p\n", + cb->args[IPSET_CB_ARG0], n, i, e); +@@ -1208,6 +1441,7 @@ static const struct ip_set_type_variant + .uref = 
mtype_uref, + .resize = mtype_resize, + .same_set = mtype_same_set, ++ .region_lock = true, + }; + + #ifdef IP_SET_EMIT_CREATE +@@ -1226,6 +1460,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net * + size_t hsize; + struct htype *h; + struct htable *t; ++ u32 i; + + pr_debug("Create set %s with family %s\n", + set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6"); +@@ -1294,6 +1529,15 @@ IPSET_TOKEN(HTYPE, _create)(struct net * + kfree(h); + return -ENOMEM; + } ++ t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits)); ++ if (!t->hregion) { ++ kfree(t); ++ kfree(h); ++ return -ENOMEM; ++ } ++ h->gc.set = set; ++ for (i = 0; i < ahash_numof_locks(hbits); i++) ++ spin_lock_init(&t->hregion[i].lock); + h->maxelem = maxelem; + #ifdef IP_SET_HASH_WITH_NETMASK + h->netmask = netmask; +@@ -1304,9 +1548,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net * + get_random_bytes(&h->initval, sizeof(h->initval)); + + t->htable_bits = hbits; ++ t->maxelem = h->maxelem / ahash_numof_locks(hbits); + RCU_INIT_POINTER(h->table, t); + +- h->set = set; ++ INIT_LIST_HEAD(&h->ad); + set->data = h; + #ifndef IP_SET_PROTO_UNDEF + if (set->family == NFPROTO_IPV4) { +@@ -1329,12 +1574,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net * + #ifndef IP_SET_PROTO_UNDEF + if (set->family == NFPROTO_IPV4) + #endif +- IPSET_TOKEN(HTYPE, 4_gc_init)(set, +- IPSET_TOKEN(HTYPE, 4_gc)); ++ IPSET_TOKEN(HTYPE, 4_gc_init)(&h->gc); + #ifndef IP_SET_PROTO_UNDEF + else +- IPSET_TOKEN(HTYPE, 6_gc_init)(set, +- IPSET_TOKEN(HTYPE, 6_gc)); ++ IPSET_TOKEN(HTYPE, 6_gc_init)(&h->gc); + #endif + } + pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n", diff --git a/queue-5.4/netfilter-xt_hashlimit-reduce-hashlimit_mutex-scope-for-htable_put.patch b/queue-5.4/netfilter-xt_hashlimit-reduce-hashlimit_mutex-scope-for-htable_put.patch new file mode 100644 index 00000000000..4babd895218 --- /dev/null +++ b/queue-5.4/netfilter-xt_hashlimit-reduce-hashlimit_mutex-scope-for-htable_put.patch @@ -0,0 +1,78 @@ +From 
c4a3922d2d20c710f827d3a115ee338e8d0467df Mon Sep 17 00:00:00 2001 +From: Cong Wang +Date: Sun, 2 Feb 2020 20:30:52 -0800 +Subject: netfilter: xt_hashlimit: reduce hashlimit_mutex scope for htable_put() + +From: Cong Wang + +commit c4a3922d2d20c710f827d3a115ee338e8d0467df upstream. + +It is unnecessary to hold hashlimit_mutex for htable_destroy() +as it is already removed from the global hashtable and its +refcount is already zero. + +Also, switch hinfo->use to refcount_t so that we don't have +to hold the mutex until it reaches zero in htable_put(). + +Reported-and-tested-by: syzbot+adf6c6c2be1c3a718121@syzkaller.appspotmail.com +Acked-by: Florian Westphal +Signed-off-by: Cong Wang +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Greg Kroah-Hartman + +--- + net/netfilter/xt_hashlimit.c | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +--- a/net/netfilter/xt_hashlimit.c ++++ b/net/netfilter/xt_hashlimit.c +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + #include + + #define XT_HASHLIMIT_ALL (XT_HASHLIMIT_HASH_DIP | XT_HASHLIMIT_HASH_DPT | \ +@@ -114,7 +115,7 @@ struct dsthash_ent { + + struct xt_hashlimit_htable { + struct hlist_node node; /* global list of all htables */ +- int use; ++ refcount_t use; + u_int8_t family; + bool rnd_initialized; + +@@ -315,7 +316,7 @@ static int htable_create(struct net *net + for (i = 0; i < hinfo->cfg.size; i++) + INIT_HLIST_HEAD(&hinfo->hash[i]); + +- hinfo->use = 1; ++ refcount_set(&hinfo->use, 1); + hinfo->count = 0; + hinfo->family = family; + hinfo->rnd_initialized = false; +@@ -434,7 +435,7 @@ static struct xt_hashlimit_htable *htabl + hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) { + if (!strcmp(name, hinfo->name) && + hinfo->family == family) { +- hinfo->use++; ++ refcount_inc(&hinfo->use); + return hinfo; + } + } +@@ -443,12 +444,11 @@ static struct xt_hashlimit_htable *htabl + + static void htable_put(struct xt_hashlimit_htable *hinfo) + { +- mutex_lock(&hashlimit_mutex); 
+- if (--hinfo->use == 0) { ++ if (refcount_dec_and_mutex_lock(&hinfo->use, &hashlimit_mutex)) { + hlist_del(&hinfo->node); ++ mutex_unlock(&hashlimit_mutex); + htable_destroy(hinfo); + } +- mutex_unlock(&hashlimit_mutex); + } + + /* The algorithm used is the Simple Token Bucket Filter (TBF) diff --git a/queue-5.4/series b/queue-5.4/series index a84269fff3c..ae8e64bfc45 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -84,3 +84,8 @@ revert-pm-devfreq-modify-the-device-name-as-devfreq-x-for-sysfs.patch amdgpu-gmc_v9-save-restore-sdpif-regs-during-s3.patch cpufreq-fix-policy-initialization-for-internal-governor-drivers.patch io_uring-fix-32-bit-compatability-with-sendmsg-recvmsg.patch +netfilter-ipset-fix-info-rcu-detected-stall-in-hash_xxx-reports.patch +net-smc-transfer-fasync_list-in-case-of-fallback.patch +vhost-check-docket-sk_family-instead-of-call-getname.patch +netfilter-ipset-fix-forceadd-evaluation-path.patch +netfilter-xt_hashlimit-reduce-hashlimit_mutex-scope-for-htable_put.patch diff --git a/queue-5.4/vhost-check-docket-sk_family-instead-of-call-getname.patch b/queue-5.4/vhost-check-docket-sk_family-instead-of-call-getname.patch new file mode 100644 index 00000000000..69776124fd1 --- /dev/null +++ b/queue-5.4/vhost-check-docket-sk_family-instead-of-call-getname.patch @@ -0,0 +1,57 @@ +From 42d84c8490f9f0931786f1623191fcab397c3d64 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= +Date: Fri, 21 Feb 2020 12:06:56 +0100 +Subject: vhost: Check docket sk_family instead of call getname +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Eugenio Pérez + +commit 42d84c8490f9f0931786f1623191fcab397c3d64 upstream. + +Doing so, we save one call to get data we already have in the struct. + +Also, since there is no guarantee that getname use sockaddr_ll +parameter beyond its size, we add a little bit of security here. 
+It should not go beyond MAX_ADDR_LEN, but syzbot found that +ax25_getname writes more (72 bytes, the size of full_sockaddr_ax25, +versus 20 + 32 bytes of sockaddr_ll + MAX_ADDR_LEN in syzbot repro). + +Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server") +Reported-by: syzbot+f2a62d07a5198c819c7b@syzkaller.appspotmail.com +Signed-off-by: Eugenio Pérez +Acked-by: Michael S. Tsirkin +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/vhost/net.c | 10 +--------- + 1 file changed, 1 insertion(+), 9 deletions(-) + +--- a/drivers/vhost/net.c ++++ b/drivers/vhost/net.c +@@ -1414,10 +1414,6 @@ static int vhost_net_release(struct inod + + static struct socket *get_raw_socket(int fd) + { +- struct { +- struct sockaddr_ll sa; +- char buf[MAX_ADDR_LEN]; +- } uaddr; + int r; + struct socket *sock = sockfd_lookup(fd, &r); + +@@ -1430,11 +1426,7 @@ static struct socket *get_raw_socket(int + goto err; + } + +- r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa, 0); +- if (r < 0) +- goto err; +- +- if (uaddr.sa.sll_family != AF_PACKET) { ++ if (sock->sk->sk_family != AF_PACKET) { + r = -EPFNOSUPPORT; + goto err; + } -- 2.47.3