git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.4-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 2 Mar 2020 19:21:45 +0000 (20:21 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 2 Mar 2020 19:21:45 +0000 (20:21 +0100)
added patches:
net-smc-transfer-fasync_list-in-case-of-fallback.patch
netfilter-ipset-fix-forceadd-evaluation-path.patch
netfilter-ipset-fix-info-rcu-detected-stall-in-hash_xxx-reports.patch
netfilter-xt_hashlimit-reduce-hashlimit_mutex-scope-for-htable_put.patch
vhost-check-docket-sk_family-instead-of-call-getname.patch

queue-5.4/net-smc-transfer-fasync_list-in-case-of-fallback.patch [new file with mode: 0644]
queue-5.4/netfilter-ipset-fix-forceadd-evaluation-path.patch [new file with mode: 0644]
queue-5.4/netfilter-ipset-fix-info-rcu-detected-stall-in-hash_xxx-reports.patch [new file with mode: 0644]
queue-5.4/netfilter-xt_hashlimit-reduce-hashlimit_mutex-scope-for-htable_put.patch [new file with mode: 0644]
queue-5.4/series
queue-5.4/vhost-check-docket-sk_family-instead-of-call-getname.patch [new file with mode: 0644]

diff --git a/queue-5.4/net-smc-transfer-fasync_list-in-case-of-fallback.patch b/queue-5.4/net-smc-transfer-fasync_list-in-case-of-fallback.patch
new file mode 100644 (file)
index 0000000..8382722
--- /dev/null
@@ -0,0 +1,37 @@
+From 67f562e3e147750a02b2a91d21a163fc44a1d13e Mon Sep 17 00:00:00 2001
+From: Ursula Braun <ubraun@linux.ibm.com>
+Date: Fri, 14 Feb 2020 08:58:59 +0100
+Subject: net/smc: transfer fasync_list in case of fallback
+
+From: Ursula Braun <ubraun@linux.ibm.com>
+
+commit 67f562e3e147750a02b2a91d21a163fc44a1d13e upstream.
+
+SMC does not work together with FASTOPEN. If sendmsg() is called with
+flag MSG_FASTOPEN in SMC_INIT state, the SMC-socket switches to
+fallback mode. To handle a previous FIOASYNC ioctl call correctly
+in this case, the fasync_list of the socket wait queue must be
+transferred to the internal TCP socket.
+
+Reported-by: syzbot+4b1fe8105f8044a26162@syzkaller.appspotmail.com
+Fixes: ee9dfbef02d18 ("net/smc: handle sockopts forcing fallback")
+Signed-off-by: Ursula Braun <ubraun@linux.ibm.com>
+Signed-off-by: Karsten Graul <kgraul@linux.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/smc/af_smc.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/net/smc/af_smc.c
++++ b/net/smc/af_smc.c
+@@ -467,6 +467,8 @@ static void smc_switch_to_fallback(struc
+       if (smc->sk.sk_socket && smc->sk.sk_socket->file) {
+               smc->clcsock->file = smc->sk.sk_socket->file;
+               smc->clcsock->file->private_data = smc->clcsock;
++              smc->clcsock->wq.fasync_list =
++                      smc->sk.sk_socket->wq.fasync_list;
+       }
+ }
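For context, the fix above is exercised from userspace roughly as in the
following hedged sketch (illustrative only, not part of the patch): a process
arms FIOASYNC on an AF_SMC socket and then forces the TCP fallback via
MSG_FASTOPEN; without the fasync_list transfer, the earlier FIOASYNC request
is not honored after the switch. AF_SMC support (CONFIG_SMC) is assumed.

/* Hedged reproducer outline for the fallback path patched above. */
#include <fcntl.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <unistd.h>

#ifndef AF_SMC
#define AF_SMC 43                       /* from <linux/socket.h> */
#endif
#ifndef MSG_FASTOPEN
#define MSG_FASTOPEN 0x20000000
#endif

int main(void)
{
        int on = 1;
        struct sockaddr_in dst = {
                .sin_family = AF_INET,
                .sin_port = htons(80),
                .sin_addr.s_addr = htonl(INADDR_LOOPBACK),
        };
        int fd = socket(AF_SMC, SOCK_STREAM, 0);        /* SMCPROTO_SMC == 0 */

        if (fd < 0)
                return 1;
        fcntl(fd, F_SETOWN, getpid());
        ioctl(fd, FIOASYNC, &on);       /* populates the wait queue fasync_list */
        /* MSG_FASTOPEN in SMC_INIT state triggers smc_switch_to_fallback();
         * the fix transfers fasync_list to the internal clcsock at that point.
         */
        sendto(fd, "x", 1, MSG_FASTOPEN, (struct sockaddr *)&dst, sizeof(dst));
        close(fd);
        return 0;
}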
diff --git a/queue-5.4/netfilter-ipset-fix-forceadd-evaluation-path.patch b/queue-5.4/netfilter-ipset-fix-forceadd-evaluation-path.patch
new file mode 100644 (file)
index 0000000..a3faf02
--- /dev/null
@@ -0,0 +1,34 @@
+From 8af1c6fbd9239877998c7f5a591cb2c88d41fb66 Mon Sep 17 00:00:00 2001
+From: Jozsef Kadlecsik <kadlec@netfilter.org>
+Date: Sat, 22 Feb 2020 12:01:43 +0100
+Subject: netfilter: ipset: Fix forceadd evaluation path
+
+From: Jozsef Kadlecsik <kadlec@netfilter.org>
+
+commit 8af1c6fbd9239877998c7f5a591cb2c88d41fb66 upstream.
+
+When the forceadd option is enabled, the hash:* types should find and replace
+the first entry in the bucket with the new one if there are no reusable
+(deleted or timed out) entries. However, the position index was not reset to
+zero, so it remained at the invalid -1 when there were no reusable entries.
+
+Reported-by: syzbot+6a86565c74ebe30aea18@syzkaller.appspotmail.com
+Fixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and protocol version 7")
+Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/netfilter/ipset/ip_set_hash_gen.h |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/net/netfilter/ipset/ip_set_hash_gen.h
++++ b/net/netfilter/ipset/ip_set_hash_gen.h
+@@ -931,6 +931,8 @@ mtype_add(struct ip_set *set, void *valu
+               }
+       }
+       if (reuse || forceadd) {
++              if (j == -1)
++                      j = 0;
+               data = ahash_data(n, j, set->dsize);
+               if (!deleted) {
+ #ifdef IP_SET_HASH_WITH_NETS
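The one-line nature of the fix is easier to see in isolation. Below is a
hedged, simplified model of the bucket scan in mtype_add() (the types and the
helper are illustrative stand-ins, not the real ipset code): j records the
first reusable slot and starts at the invalid -1, so when forceadd applies
and nothing in the bucket was reusable, j must be reset to 0 before indexing.

#include <stdbool.h>
#include <stddef.h>

struct elem { bool used; bool expired; };
struct bucket { struct elem slot[8]; int pos; };

/* Return the slot to (re)use, mirroring the forceadd logic above. */
static struct elem *pick_slot(struct bucket *n, bool forceadd)
{
        int i, j = -1;          /* first reusable slot; -1 == none found */
        bool reuse = false;

        for (i = 0; i < n->pos; i++) {
                if (n->slot[i].used && n->slot[i].expired && j == -1) {
                        j = i;  /* reuse the first timed out entry */
                        reuse = true;
                }
        }
        if (reuse || forceadd) {
                if (j == -1)    /* the fix: forceadd found nothing reusable */
                        j = 0;  /* replace the first entry in the bucket */
                return &n->slot[j];
        }
        return NULL;            /* caller reports the set as full */
}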
diff --git a/queue-5.4/netfilter-ipset-fix-info-rcu-detected-stall-in-hash_xxx-reports.patch b/queue-5.4/netfilter-ipset-fix-info-rcu-detected-stall-in-hash_xxx-reports.patch
new file mode 100644 (file)
index 0000000..eb24c17
--- /dev/null
@@ -0,0 +1,1311 @@
+From f66ee0410b1c3481ee75e5db9b34547b4d582465 Mon Sep 17 00:00:00 2001
+From: Jozsef Kadlecsik <kadlec@netfilter.org>
+Date: Tue, 11 Feb 2020 23:20:43 +0100
+Subject: netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports
+
+From: Jozsef Kadlecsik <kadlec@netfilter.org>
+
+commit f66ee0410b1c3481ee75e5db9b34547b4d582465 upstream.
+
+In the case of huge hash:* type sets, a set has only a single spinlock,
+so processing the whole set under spinlock protection could take too
+long.
+
+There were four places where the whole hash table of the set was processed
+from bucket to bucket while holding the spinlock:
+
+- During the resize of a set, the original set was locked to exclude kernel
+  side add/del element operations (userspace add/del is excluded by the
+  nfnetlink mutex). The original set is actually just read during the
+  resize, so the spinlocking is replaced with rcu locking of regions.
+  As a consequence, kernel side add/del of entries can run in parallel
+  with the resize. In order not to lose those operations, a backlog is
+  added and replayed after the successful resize.
+- Garbage collection of timed out entries was also protected by the spinlock.
+  In order not to lock too long, region locking is introduced and a single
+  region is processed in one gc go. Also, the simple timer based gc running
+  is replaced with a workqueue based solution. The internal book-keeping
+  (number of elements, size of extensions) is moved to region level due to
+  the region locking.
+- Adding elements: when the max number of the elements is reached, the gc
+  was called to evict the timed out entries. The new approach is that the gc
+  is called just for the matching region, assuming that if the region
+  (proportionally) seems to be full, then the whole set is, too. We could
+  scan the other regions to check every entry under rcu locking, but for
+  huge sets it'd mean a slowdown when adding elements.
+- Listing the set header data: when the set was defined with timeout
+  support, the garbage collector was called to clean up timed out entries
+  to get the correct element numbers and set size values. Now the set is
+  scanned to check non-timed out entries, without actually calling the gc
+  for the whole set.
+
+Thanks to Florian Westphal for helping me to solve the SOFTIRQ-safe ->
+SOFTIRQ-unsafe lock order issues while working on the patch.
+
+Reported-by: syzbot+4b0e9d4ff3cf117837e5@syzkaller.appspotmail.com
+Reported-by: syzbot+c27b8d5010f45c666ed1@syzkaller.appspotmail.com
+Reported-by: syzbot+68a806795ac89df3aa1c@syzkaller.appspotmail.com
+Fixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and protocol version 7")
+Signed-off-by: Jozsef Kadlecsik <kadlec@netfilter.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ include/linux/netfilter/ipset/ip_set.h |   11 
+ net/netfilter/ipset/ip_set_core.c      |   34 +
+ net/netfilter/ipset/ip_set_hash_gen.h  |  633 ++++++++++++++++++++++-----------
+ 3 files changed, 472 insertions(+), 206 deletions(-)
+
+--- a/include/linux/netfilter/ipset/ip_set.h
++++ b/include/linux/netfilter/ipset/ip_set.h
+@@ -121,6 +121,7 @@ struct ip_set_ext {
+       u32 timeout;
+       u8 packets_op;
+       u8 bytes_op;
++      bool target;
+ };
+ struct ip_set;
+@@ -187,6 +188,14 @@ struct ip_set_type_variant {
+       /* Return true if "b" set is the same as "a"
+        * according to the create set parameters */
+       bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
++      /* Region-locking is used */
++      bool region_lock;
++};
++
++struct ip_set_region {
++      spinlock_t lock;        /* Region lock */
++      size_t ext_size;        /* Size of the dynamic extensions */
++      u32 elements;           /* Number of elements vs timeout */
+ };
+ /* The core set type structure */
+@@ -681,7 +690,7 @@ ip_set_init_skbinfo(struct ip_set_skbinf
+ }
+ #define IP_SET_INIT_KEXT(skb, opt, set)                       \
+-      { .bytes = (skb)->len, .packets = 1,            \
++      { .bytes = (skb)->len, .packets = 1, .target = true,\
+         .timeout = ip_set_adt_opt_timeout(opt, set) }
+ #define IP_SET_INIT_UEXT(set)                         \
+--- a/net/netfilter/ipset/ip_set_core.c
++++ b/net/netfilter/ipset/ip_set_core.c
+@@ -557,6 +557,20 @@ ip_set_rcu_get(struct net *net, ip_set_i
+       return set;
+ }
++static inline void
++ip_set_lock(struct ip_set *set)
++{
++      if (!set->variant->region_lock)
++              spin_lock_bh(&set->lock);
++}
++
++static inline void
++ip_set_unlock(struct ip_set *set)
++{
++      if (!set->variant->region_lock)
++              spin_unlock_bh(&set->lock);
++}
++
+ int
+ ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
+           const struct xt_action_param *par, struct ip_set_adt_opt *opt)
+@@ -578,9 +592,9 @@ ip_set_test(ip_set_id_t index, const str
+       if (ret == -EAGAIN) {
+               /* Type requests element to be completed */
+               pr_debug("element must be completed, ADD is triggered\n");
+-              spin_lock_bh(&set->lock);
++              ip_set_lock(set);
+               set->variant->kadt(set, skb, par, IPSET_ADD, opt);
+-              spin_unlock_bh(&set->lock);
++              ip_set_unlock(set);
+               ret = 1;
+       } else {
+               /* --return-nomatch: invert matched element */
+@@ -609,9 +623,9 @@ ip_set_add(ip_set_id_t index, const stru
+           !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
+               return -IPSET_ERR_TYPE_MISMATCH;
+-      spin_lock_bh(&set->lock);
++      ip_set_lock(set);
+       ret = set->variant->kadt(set, skb, par, IPSET_ADD, opt);
+-      spin_unlock_bh(&set->lock);
++      ip_set_unlock(set);
+       return ret;
+ }
+@@ -631,9 +645,9 @@ ip_set_del(ip_set_id_t index, const stru
+           !(opt->family == set->family || set->family == NFPROTO_UNSPEC))
+               return -IPSET_ERR_TYPE_MISMATCH;
+-      spin_lock_bh(&set->lock);
++      ip_set_lock(set);
+       ret = set->variant->kadt(set, skb, par, IPSET_DEL, opt);
+-      spin_unlock_bh(&set->lock);
++      ip_set_unlock(set);
+       return ret;
+ }
+@@ -1098,9 +1112,9 @@ ip_set_flush_set(struct ip_set *set)
+ {
+       pr_debug("set: %s\n",  set->name);
+-      spin_lock_bh(&set->lock);
++      ip_set_lock(set);
+       set->variant->flush(set);
+-      spin_unlock_bh(&set->lock);
++      ip_set_unlock(set);
+ }
+ static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb,
+@@ -1523,9 +1537,9 @@ call_ad(struct sock *ctnl, struct sk_buf
+       bool eexist = flags & IPSET_FLAG_EXIST, retried = false;
+       do {
+-              spin_lock_bh(&set->lock);
++              ip_set_lock(set);
+               ret = set->variant->uadt(set, tb, adt, &lineno, flags, retried);
+-              spin_unlock_bh(&set->lock);
++              ip_set_unlock(set);
+               retried = true;
+       } while (ret == -EAGAIN &&
+                set->variant->resize &&
+--- a/net/netfilter/ipset/ip_set_hash_gen.h
++++ b/net/netfilter/ipset/ip_set_hash_gen.h
+@@ -7,13 +7,21 @@
+ #include <linux/rcupdate.h>
+ #include <linux/jhash.h>
+ #include <linux/types.h>
++#include <linux/netfilter/nfnetlink.h>
+ #include <linux/netfilter/ipset/ip_set.h>
+-#define __ipset_dereference_protected(p, c)   rcu_dereference_protected(p, c)
+-#define ipset_dereference_protected(p, set) \
+-      __ipset_dereference_protected(p, lockdep_is_held(&(set)->lock))
+-
+-#define rcu_dereference_bh_nfnl(p)    rcu_dereference_bh_check(p, 1)
++#define __ipset_dereference(p)                \
++      rcu_dereference_protected(p, 1)
++#define ipset_dereference_nfnl(p)     \
++      rcu_dereference_protected(p,    \
++              lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
++#define ipset_dereference_set(p, set)         \
++      rcu_dereference_protected(p,    \
++              lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET) || \
++              lockdep_is_held(&(set)->lock))
++#define ipset_dereference_bh_nfnl(p)  \
++      rcu_dereference_bh_check(p,     \
++              lockdep_nfnl_is_held(NFNL_SUBSYS_IPSET))
+ /* Hashing which uses arrays to resolve clashing. The hash table is resized
+  * (doubled) when searching becomes too long.
+@@ -72,11 +80,35 @@ struct hbucket {
+               __aligned(__alignof__(u64));
+ };
++/* Region size for locking == 2^HTABLE_REGION_BITS */
++#define HTABLE_REGION_BITS    10
++#define ahash_numof_locks(htable_bits)                \
++      ((htable_bits) < HTABLE_REGION_BITS ? 1 \
++              : jhash_size((htable_bits) - HTABLE_REGION_BITS))
++#define ahash_sizeof_regions(htable_bits)             \
++      (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region))
++#define ahash_region(n, htable_bits)          \
++      ((n) % ahash_numof_locks(htable_bits))
++#define ahash_bucket_start(h,  htable_bits)   \
++      ((htable_bits) < HTABLE_REGION_BITS ? 0 \
++              : (h) * jhash_size(HTABLE_REGION_BITS))
++#define ahash_bucket_end(h,  htable_bits)     \
++      ((htable_bits) < HTABLE_REGION_BITS ? jhash_size(htable_bits)   \
++              : ((h) + 1) * jhash_size(HTABLE_REGION_BITS))
++
++struct htable_gc {
++      struct delayed_work dwork;
++      struct ip_set *set;     /* Set the gc belongs to */
++      u32 region;             /* Last gc run position */
++};
++
+ /* The hash table: the table size stored here in order to make resizing easy */
+ struct htable {
+       atomic_t ref;           /* References for resizing */
+-      atomic_t uref;          /* References for dumping */
++      atomic_t uref;          /* References for dumping and gc */
+       u8 htable_bits;         /* size of hash table == 2^htable_bits */
++      u32 maxelem;            /* Maxelem per region */
++      struct ip_set_region *hregion;  /* Region locks and ext sizes */
+       struct hbucket __rcu *bucket[0]; /* hashtable buckets */
+ };
+@@ -162,6 +194,10 @@ htable_bits(u32 hashsize)
+ #define NLEN                  0
+ #endif /* IP_SET_HASH_WITH_NETS */
++#define SET_ELEM_EXPIRED(set, d)      \
++      (SET_WITH_TIMEOUT(set) &&       \
++       ip_set_timeout_expired(ext_timeout(d, set)))
++
+ #endif /* _IP_SET_HASH_GEN_H */
+ #ifndef MTYPE
+@@ -205,10 +241,12 @@ htable_bits(u32 hashsize)
+ #undef mtype_test_cidrs
+ #undef mtype_test
+ #undef mtype_uref
+-#undef mtype_expire
+ #undef mtype_resize
++#undef mtype_ext_size
++#undef mtype_resize_ad
+ #undef mtype_head
+ #undef mtype_list
++#undef mtype_gc_do
+ #undef mtype_gc
+ #undef mtype_gc_init
+ #undef mtype_variant
+@@ -247,10 +285,12 @@ htable_bits(u32 hashsize)
+ #define mtype_test_cidrs      IPSET_TOKEN(MTYPE, _test_cidrs)
+ #define mtype_test            IPSET_TOKEN(MTYPE, _test)
+ #define mtype_uref            IPSET_TOKEN(MTYPE, _uref)
+-#define mtype_expire          IPSET_TOKEN(MTYPE, _expire)
+ #define mtype_resize          IPSET_TOKEN(MTYPE, _resize)
++#define mtype_ext_size                IPSET_TOKEN(MTYPE, _ext_size)
++#define mtype_resize_ad               IPSET_TOKEN(MTYPE, _resize_ad)
+ #define mtype_head            IPSET_TOKEN(MTYPE, _head)
+ #define mtype_list            IPSET_TOKEN(MTYPE, _list)
++#define mtype_gc_do           IPSET_TOKEN(MTYPE, _gc_do)
+ #define mtype_gc              IPSET_TOKEN(MTYPE, _gc)
+ #define mtype_gc_init         IPSET_TOKEN(MTYPE, _gc_init)
+ #define mtype_variant         IPSET_TOKEN(MTYPE, _variant)
+@@ -275,8 +315,7 @@ htable_bits(u32 hashsize)
+ /* The generic hash structure */
+ struct htype {
+       struct htable __rcu *table; /* the hash table */
+-      struct timer_list gc;   /* garbage collection when timeout enabled */
+-      struct ip_set *set;     /* attached to this ip_set */
++      struct htable_gc gc;    /* gc workqueue */
+       u32 maxelem;            /* max elements in the hash */
+       u32 initval;            /* random jhash init value */
+ #ifdef IP_SET_HASH_WITH_MARKMASK
+@@ -288,21 +327,33 @@ struct htype {
+ #ifdef IP_SET_HASH_WITH_NETMASK
+       u8 netmask;             /* netmask value for subnets to store */
+ #endif
++      struct list_head ad;    /* Resize add|del backlist */
+       struct mtype_elem next; /* temporary storage for uadd */
+ #ifdef IP_SET_HASH_WITH_NETS
+       struct net_prefixes nets[NLEN]; /* book-keeping of prefixes */
+ #endif
+ };
++/* ADD|DEL entries saved during resize */
++struct mtype_resize_ad {
++      struct list_head list;
++      enum ipset_adt ad;      /* ADD|DEL element */
++      struct mtype_elem d;    /* Element value */
++      struct ip_set_ext ext;  /* Extensions for ADD */
++      struct ip_set_ext mext; /* Target extensions for ADD */
++      u32 flags;              /* Flags for ADD */
++};
++
+ #ifdef IP_SET_HASH_WITH_NETS
+ /* Network cidr size book keeping when the hash stores different
+  * sized networks. cidr == real cidr + 1 to support /0.
+  */
+ static void
+-mtype_add_cidr(struct htype *h, u8 cidr, u8 n)
++mtype_add_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n)
+ {
+       int i, j;
++      spin_lock_bh(&set->lock);
+       /* Add in increasing prefix order, so larger cidr first */
+       for (i = 0, j = -1; i < NLEN && h->nets[i].cidr[n]; i++) {
+               if (j != -1) {
+@@ -311,7 +362,7 @@ mtype_add_cidr(struct htype *h, u8 cidr,
+                       j = i;
+               } else if (h->nets[i].cidr[n] == cidr) {
+                       h->nets[CIDR_POS(cidr)].nets[n]++;
+-                      return;
++                      goto unlock;
+               }
+       }
+       if (j != -1) {
+@@ -320,24 +371,29 @@ mtype_add_cidr(struct htype *h, u8 cidr,
+       }
+       h->nets[i].cidr[n] = cidr;
+       h->nets[CIDR_POS(cidr)].nets[n] = 1;
++unlock:
++      spin_unlock_bh(&set->lock);
+ }
+ static void
+-mtype_del_cidr(struct htype *h, u8 cidr, u8 n)
++mtype_del_cidr(struct ip_set *set, struct htype *h, u8 cidr, u8 n)
+ {
+       u8 i, j, net_end = NLEN - 1;
++      spin_lock_bh(&set->lock);
+       for (i = 0; i < NLEN; i++) {
+               if (h->nets[i].cidr[n] != cidr)
+                       continue;
+               h->nets[CIDR_POS(cidr)].nets[n]--;
+               if (h->nets[CIDR_POS(cidr)].nets[n] > 0)
+-                      return;
++                      goto unlock;
+               for (j = i; j < net_end && h->nets[j].cidr[n]; j++)
+                       h->nets[j].cidr[n] = h->nets[j + 1].cidr[n];
+               h->nets[j].cidr[n] = 0;
+-              return;
++              goto unlock;
+       }
++unlock:
++      spin_unlock_bh(&set->lock);
+ }
+ #endif
+@@ -345,7 +401,7 @@ mtype_del_cidr(struct htype *h, u8 cidr,
+ static size_t
+ mtype_ahash_memsize(const struct htype *h, const struct htable *t)
+ {
+-      return sizeof(*h) + sizeof(*t);
++      return sizeof(*h) + sizeof(*t) + ahash_sizeof_regions(t->htable_bits);
+ }
+ /* Get the ith element from the array block n */
+@@ -369,24 +425,29 @@ mtype_flush(struct ip_set *set)
+       struct htype *h = set->data;
+       struct htable *t;
+       struct hbucket *n;
+-      u32 i;
++      u32 r, i;
+-      t = ipset_dereference_protected(h->table, set);
+-      for (i = 0; i < jhash_size(t->htable_bits); i++) {
+-              n = __ipset_dereference_protected(hbucket(t, i), 1);
+-              if (!n)
+-                      continue;
+-              if (set->extensions & IPSET_EXT_DESTROY)
+-                      mtype_ext_cleanup(set, n);
+-              /* FIXME: use slab cache */
+-              rcu_assign_pointer(hbucket(t, i), NULL);
+-              kfree_rcu(n, rcu);
++      t = ipset_dereference_nfnl(h->table);
++      for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
++              spin_lock_bh(&t->hregion[r].lock);
++              for (i = ahash_bucket_start(r, t->htable_bits);
++                   i < ahash_bucket_end(r, t->htable_bits); i++) {
++                      n = __ipset_dereference(hbucket(t, i));
++                      if (!n)
++                              continue;
++                      if (set->extensions & IPSET_EXT_DESTROY)
++                              mtype_ext_cleanup(set, n);
++                      /* FIXME: use slab cache */
++                      rcu_assign_pointer(hbucket(t, i), NULL);
++                      kfree_rcu(n, rcu);
++              }
++              t->hregion[r].ext_size = 0;
++              t->hregion[r].elements = 0;
++              spin_unlock_bh(&t->hregion[r].lock);
+       }
+ #ifdef IP_SET_HASH_WITH_NETS
+       memset(h->nets, 0, sizeof(h->nets));
+ #endif
+-      set->elements = 0;
+-      set->ext_size = 0;
+ }
+ /* Destroy the hashtable part of the set */
+@@ -397,7 +458,7 @@ mtype_ahash_destroy(struct ip_set *set,
+       u32 i;
+       for (i = 0; i < jhash_size(t->htable_bits); i++) {
+-              n = __ipset_dereference_protected(hbucket(t, i), 1);
++              n = __ipset_dereference(hbucket(t, i));
+               if (!n)
+                       continue;
+               if (set->extensions & IPSET_EXT_DESTROY && ext_destroy)
+@@ -406,6 +467,7 @@ mtype_ahash_destroy(struct ip_set *set,
+               kfree(n);
+       }
++      ip_set_free(t->hregion);
+       ip_set_free(t);
+ }
+@@ -414,28 +476,21 @@ static void
+ mtype_destroy(struct ip_set *set)
+ {
+       struct htype *h = set->data;
++      struct list_head *l, *lt;
+       if (SET_WITH_TIMEOUT(set))
+-              del_timer_sync(&h->gc);
++              cancel_delayed_work_sync(&h->gc.dwork);
+-      mtype_ahash_destroy(set,
+-                          __ipset_dereference_protected(h->table, 1), true);
++      mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
++      list_for_each_safe(l, lt, &h->ad) {
++              list_del(l);
++              kfree(l);
++      }
+       kfree(h);
+       set->data = NULL;
+ }
+-static void
+-mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t))
+-{
+-      struct htype *h = set->data;
+-
+-      timer_setup(&h->gc, gc, 0);
+-      mod_timer(&h->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
+-      pr_debug("gc initialized, run in every %u\n",
+-               IPSET_GC_PERIOD(set->timeout));
+-}
+-
+ static bool
+ mtype_same_set(const struct ip_set *a, const struct ip_set *b)
+ {
+@@ -454,11 +509,9 @@ mtype_same_set(const struct ip_set *a, c
+              a->extensions == b->extensions;
+ }
+-/* Delete expired elements from the hashtable */
+ static void
+-mtype_expire(struct ip_set *set, struct htype *h)
++mtype_gc_do(struct ip_set *set, struct htype *h, struct htable *t, u32 r)
+ {
+-      struct htable *t;
+       struct hbucket *n, *tmp;
+       struct mtype_elem *data;
+       u32 i, j, d;
+@@ -466,10 +519,12 @@ mtype_expire(struct ip_set *set, struct
+ #ifdef IP_SET_HASH_WITH_NETS
+       u8 k;
+ #endif
++      u8 htable_bits = t->htable_bits;
+-      t = ipset_dereference_protected(h->table, set);
+-      for (i = 0; i < jhash_size(t->htable_bits); i++) {
+-              n = __ipset_dereference_protected(hbucket(t, i), 1);
++      spin_lock_bh(&t->hregion[r].lock);
++      for (i = ahash_bucket_start(r, htable_bits);
++           i < ahash_bucket_end(r, htable_bits); i++) {
++              n = __ipset_dereference(hbucket(t, i));
+               if (!n)
+                       continue;
+               for (j = 0, d = 0; j < n->pos; j++) {
+@@ -485,58 +540,100 @@ mtype_expire(struct ip_set *set, struct
+                       smp_mb__after_atomic();
+ #ifdef IP_SET_HASH_WITH_NETS
+                       for (k = 0; k < IPSET_NET_COUNT; k++)
+-                              mtype_del_cidr(h,
++                              mtype_del_cidr(set, h,
+                                       NCIDR_PUT(DCIDR_GET(data->cidr, k)),
+                                       k);
+ #endif
++                      t->hregion[r].elements--;
+                       ip_set_ext_destroy(set, data);
+-                      set->elements--;
+                       d++;
+               }
+               if (d >= AHASH_INIT_SIZE) {
+                       if (d >= n->size) {
++                              t->hregion[r].ext_size -=
++                                      ext_size(n->size, dsize);
+                               rcu_assign_pointer(hbucket(t, i), NULL);
+                               kfree_rcu(n, rcu);
+                               continue;
+                       }
+                       tmp = kzalloc(sizeof(*tmp) +
+-                                    (n->size - AHASH_INIT_SIZE) * dsize,
+-                                    GFP_ATOMIC);
++                              (n->size - AHASH_INIT_SIZE) * dsize,
++                              GFP_ATOMIC);
+                       if (!tmp)
+-                              /* Still try to delete expired elements */
++                              /* Still try to delete expired elements. */
+                               continue;
+                       tmp->size = n->size - AHASH_INIT_SIZE;
+                       for (j = 0, d = 0; j < n->pos; j++) {
+                               if (!test_bit(j, n->used))
+                                       continue;
+                               data = ahash_data(n, j, dsize);
+-                              memcpy(tmp->value + d * dsize, data, dsize);
++                              memcpy(tmp->value + d * dsize,
++                                     data, dsize);
+                               set_bit(d, tmp->used);
+                               d++;
+                       }
+                       tmp->pos = d;
+-                      set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
++                      t->hregion[r].ext_size -=
++                              ext_size(AHASH_INIT_SIZE, dsize);
+                       rcu_assign_pointer(hbucket(t, i), tmp);
+                       kfree_rcu(n, rcu);
+               }
+       }
++      spin_unlock_bh(&t->hregion[r].lock);
+ }
+ static void
+-mtype_gc(struct timer_list *t)
++mtype_gc(struct work_struct *work)
+ {
+-      struct htype *h = from_timer(h, t, gc);
+-      struct ip_set *set = h->set;
++      struct htable_gc *gc;
++      struct ip_set *set;
++      struct htype *h;
++      struct htable *t;
++      u32 r, numof_locks;
++      unsigned int next_run;
++
++      gc = container_of(work, struct htable_gc, dwork.work);
++      set = gc->set;
++      h = set->data;
+-      pr_debug("called\n");
+       spin_lock_bh(&set->lock);
+-      mtype_expire(set, h);
++      t = ipset_dereference_set(h->table, set);
++      atomic_inc(&t->uref);
++      numof_locks = ahash_numof_locks(t->htable_bits);
++      r = gc->region++;
++      if (r >= numof_locks) {
++              r = gc->region = 0;
++      }
++      next_run = (IPSET_GC_PERIOD(set->timeout) * HZ) / numof_locks;
++      if (next_run < HZ/10)
++              next_run = HZ/10;
+       spin_unlock_bh(&set->lock);
+-      h->gc.expires = jiffies + IPSET_GC_PERIOD(set->timeout) * HZ;
+-      add_timer(&h->gc);
++      mtype_gc_do(set, h, t, r);
++
++      if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
++              pr_debug("Table destroy after resize by expire: %p\n", t);
++              mtype_ahash_destroy(set, t, false);
++      }
++
++      queue_delayed_work(system_power_efficient_wq, &gc->dwork, next_run);
++
+ }
++static void
++mtype_gc_init(struct htable_gc *gc)
++{
++      INIT_DEFERRABLE_WORK(&gc->dwork, mtype_gc);
++      queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
++}
++
++static int
++mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
++        struct ip_set_ext *mext, u32 flags);
++static int
++mtype_del(struct ip_set *set, void *value, const struct ip_set_ext *ext,
++        struct ip_set_ext *mext, u32 flags);
++
+ /* Resize a hash: create a new hash table with doubling the hashsize
+  * and inserting the elements to it. Repeat until we succeed or
+  * fail due to memory pressures.
+@@ -547,7 +644,7 @@ mtype_resize(struct ip_set *set, bool re
+       struct htype *h = set->data;
+       struct htable *t, *orig;
+       u8 htable_bits;
+-      size_t extsize, dsize = set->dsize;
++      size_t dsize = set->dsize;
+ #ifdef IP_SET_HASH_WITH_NETS
+       u8 flags;
+       struct mtype_elem *tmp;
+@@ -555,7 +652,9 @@ mtype_resize(struct ip_set *set, bool re
+       struct mtype_elem *data;
+       struct mtype_elem *d;
+       struct hbucket *n, *m;
+-      u32 i, j, key;
++      struct list_head *l, *lt;
++      struct mtype_resize_ad *x;
++      u32 i, j, r, nr, key;
+       int ret;
+ #ifdef IP_SET_HASH_WITH_NETS
+@@ -563,10 +662,8 @@ mtype_resize(struct ip_set *set, bool re
+       if (!tmp)
+               return -ENOMEM;
+ #endif
+-      rcu_read_lock_bh();
+-      orig = rcu_dereference_bh_nfnl(h->table);
++      orig = ipset_dereference_bh_nfnl(h->table);
+       htable_bits = orig->htable_bits;
+-      rcu_read_unlock_bh();
+ retry:
+       ret = 0;
+@@ -583,88 +680,124 @@ retry:
+               ret = -ENOMEM;
+               goto out;
+       }
++      t->hregion = ip_set_alloc(ahash_sizeof_regions(htable_bits));
++      if (!t->hregion) {
++              kfree(t);
++              ret = -ENOMEM;
++              goto out;
++      }
+       t->htable_bits = htable_bits;
++      t->maxelem = h->maxelem / ahash_numof_locks(htable_bits);
++      for (i = 0; i < ahash_numof_locks(htable_bits); i++)
++              spin_lock_init(&t->hregion[i].lock);
+-      spin_lock_bh(&set->lock);
+-      orig = __ipset_dereference_protected(h->table, 1);
+-      /* There can't be another parallel resizing, but dumping is possible */
++      /* There can't be another parallel resizing,
++       * but dumping, gc, kernel side add/del are possible
++       */
++      orig = ipset_dereference_bh_nfnl(h->table);
+       atomic_set(&orig->ref, 1);
+       atomic_inc(&orig->uref);
+-      extsize = 0;
+       pr_debug("attempt to resize set %s from %u to %u, t %p\n",
+                set->name, orig->htable_bits, htable_bits, orig);
+-      for (i = 0; i < jhash_size(orig->htable_bits); i++) {
+-              n = __ipset_dereference_protected(hbucket(orig, i), 1);
+-              if (!n)
+-                      continue;
+-              for (j = 0; j < n->pos; j++) {
+-                      if (!test_bit(j, n->used))
++      for (r = 0; r < ahash_numof_locks(orig->htable_bits); r++) {
++              /* Expire may replace a hbucket with another one */
++              rcu_read_lock_bh();
++              for (i = ahash_bucket_start(r, orig->htable_bits);
++                   i < ahash_bucket_end(r, orig->htable_bits); i++) {
++                      n = __ipset_dereference(hbucket(orig, i));
++                      if (!n)
+                               continue;
+-                      data = ahash_data(n, j, dsize);
++                      for (j = 0; j < n->pos; j++) {
++                              if (!test_bit(j, n->used))
++                                      continue;
++                              data = ahash_data(n, j, dsize);
++                              if (SET_ELEM_EXPIRED(set, data))
++                                      continue;
+ #ifdef IP_SET_HASH_WITH_NETS
+-                      /* We have readers running parallel with us,
+-                       * so the live data cannot be modified.
+-                       */
+-                      flags = 0;
+-                      memcpy(tmp, data, dsize);
+-                      data = tmp;
+-                      mtype_data_reset_flags(data, &flags);
+-#endif
+-                      key = HKEY(data, h->initval, htable_bits);
+-                      m = __ipset_dereference_protected(hbucket(t, key), 1);
+-                      if (!m) {
+-                              m = kzalloc(sizeof(*m) +
++                              /* We have readers running parallel with us,
++                               * so the live data cannot be modified.
++                               */
++                              flags = 0;
++                              memcpy(tmp, data, dsize);
++                              data = tmp;
++                              mtype_data_reset_flags(data, &flags);
++#endif
++                              key = HKEY(data, h->initval, htable_bits);
++                              m = __ipset_dereference(hbucket(t, key));
++                              nr = ahash_region(key, htable_bits);
++                              if (!m) {
++                                      m = kzalloc(sizeof(*m) +
+                                           AHASH_INIT_SIZE * dsize,
+                                           GFP_ATOMIC);
+-                              if (!m) {
+-                                      ret = -ENOMEM;
+-                                      goto cleanup;
+-                              }
+-                              m->size = AHASH_INIT_SIZE;
+-                              extsize += ext_size(AHASH_INIT_SIZE, dsize);
+-                              RCU_INIT_POINTER(hbucket(t, key), m);
+-                      } else if (m->pos >= m->size) {
+-                              struct hbucket *ht;
+-
+-                              if (m->size >= AHASH_MAX(h)) {
+-                                      ret = -EAGAIN;
+-                              } else {
+-                                      ht = kzalloc(sizeof(*ht) +
++                                      if (!m) {
++                                              ret = -ENOMEM;
++                                              goto cleanup;
++                                      }
++                                      m->size = AHASH_INIT_SIZE;
++                                      t->hregion[nr].ext_size +=
++                                              ext_size(AHASH_INIT_SIZE,
++                                                       dsize);
++                                      RCU_INIT_POINTER(hbucket(t, key), m);
++                              } else if (m->pos >= m->size) {
++                                      struct hbucket *ht;
++
++                                      if (m->size >= AHASH_MAX(h)) {
++                                              ret = -EAGAIN;
++                                      } else {
++                                              ht = kzalloc(sizeof(*ht) +
+                                               (m->size + AHASH_INIT_SIZE)
+                                               * dsize,
+                                               GFP_ATOMIC);
+-                                      if (!ht)
+-                                              ret = -ENOMEM;
++                                              if (!ht)
++                                                      ret = -ENOMEM;
++                                      }
++                                      if (ret < 0)
++                                              goto cleanup;
++                                      memcpy(ht, m, sizeof(struct hbucket) +
++                                             m->size * dsize);
++                                      ht->size = m->size + AHASH_INIT_SIZE;
++                                      t->hregion[nr].ext_size +=
++                                              ext_size(AHASH_INIT_SIZE,
++                                                       dsize);
++                                      kfree(m);
++                                      m = ht;
++                                      RCU_INIT_POINTER(hbucket(t, key), ht);
+                               }
+-                              if (ret < 0)
+-                                      goto cleanup;
+-                              memcpy(ht, m, sizeof(struct hbucket) +
+-                                            m->size * dsize);
+-                              ht->size = m->size + AHASH_INIT_SIZE;
+-                              extsize += ext_size(AHASH_INIT_SIZE, dsize);
+-                              kfree(m);
+-                              m = ht;
+-                              RCU_INIT_POINTER(hbucket(t, key), ht);
+-                      }
+-                      d = ahash_data(m, m->pos, dsize);
+-                      memcpy(d, data, dsize);
+-                      set_bit(m->pos++, m->used);
++                              d = ahash_data(m, m->pos, dsize);
++                              memcpy(d, data, dsize);
++                              set_bit(m->pos++, m->used);
++                              t->hregion[nr].elements++;
+ #ifdef IP_SET_HASH_WITH_NETS
+-                      mtype_data_reset_flags(d, &flags);
++                              mtype_data_reset_flags(d, &flags);
+ #endif
++                      }
+               }
++              rcu_read_unlock_bh();
+       }
+-      rcu_assign_pointer(h->table, t);
+-      set->ext_size = extsize;
+-      spin_unlock_bh(&set->lock);
++      /* There can't be any other writer. */
++      rcu_assign_pointer(h->table, t);
+       /* Give time to other readers of the set */
+       synchronize_rcu();
+       pr_debug("set %s resized from %u (%p) to %u (%p)\n", set->name,
+                orig->htable_bits, orig, t->htable_bits, t);
+-      /* If there's nobody else dumping the table, destroy it */
++      /* Add/delete elements processed by the SET target during resize.
++       * Kernel-side add cannot trigger a resize and userspace actions
++       * are serialized by the mutex.
++       */
++      list_for_each_safe(l, lt, &h->ad) {
++              x = list_entry(l, struct mtype_resize_ad, list);
++              if (x->ad == IPSET_ADD) {
++                      mtype_add(set, &x->d, &x->ext, &x->mext, x->flags);
++              } else {
++                      mtype_del(set, &x->d, NULL, NULL, 0);
++              }
++              list_del(l);
++              kfree(l);
++      }
++      /* If there's nobody else using the table, destroy it */
+       if (atomic_dec_and_test(&orig->uref)) {
+               pr_debug("Table destroy by resize %p\n", orig);
+               mtype_ahash_destroy(set, orig, false);
+@@ -677,15 +810,44 @@ out:
+       return ret;
+ cleanup:
++      rcu_read_unlock_bh();
+       atomic_set(&orig->ref, 0);
+       atomic_dec(&orig->uref);
+-      spin_unlock_bh(&set->lock);
+       mtype_ahash_destroy(set, t, false);
+       if (ret == -EAGAIN)
+               goto retry;
+       goto out;
+ }
++/* Get the current number of elements and ext_size in the set  */
++static void
++mtype_ext_size(struct ip_set *set, u32 *elements, size_t *ext_size)
++{
++      struct htype *h = set->data;
++      const struct htable *t;
++      u32 i, j, r;
++      struct hbucket *n;
++      struct mtype_elem *data;
++
++      t = rcu_dereference_bh(h->table);
++      for (r = 0; r < ahash_numof_locks(t->htable_bits); r++) {
++              for (i = ahash_bucket_start(r, t->htable_bits);
++                   i < ahash_bucket_end(r, t->htable_bits); i++) {
++                      n = rcu_dereference_bh(hbucket(t, i));
++                      if (!n)
++                              continue;
++                      for (j = 0; j < n->pos; j++) {
++                              if (!test_bit(j, n->used))
++                                      continue;
++                              data = ahash_data(n, j, set->dsize);
++                              if (!SET_ELEM_EXPIRED(set, data))
++                                      (*elements)++;
++                      }
++              }
++              *ext_size += t->hregion[r].ext_size;
++      }
++}
++
+ /* Add an element to a hash and update the internal counters when succeeded,
+  * otherwise report the proper error code.
+  */
+@@ -698,32 +860,49 @@ mtype_add(struct ip_set *set, void *valu
+       const struct mtype_elem *d = value;
+       struct mtype_elem *data;
+       struct hbucket *n, *old = ERR_PTR(-ENOENT);
+-      int i, j = -1;
++      int i, j = -1, ret;
+       bool flag_exist = flags & IPSET_FLAG_EXIST;
+       bool deleted = false, forceadd = false, reuse = false;
+-      u32 key, multi = 0;
++      u32 r, key, multi = 0, elements, maxelem;
+-      if (set->elements >= h->maxelem) {
+-              if (SET_WITH_TIMEOUT(set))
+-                      /* FIXME: when set is full, we slow down here */
+-                      mtype_expire(set, h);
+-              if (set->elements >= h->maxelem && SET_WITH_FORCEADD(set))
++      rcu_read_lock_bh();
++      t = rcu_dereference_bh(h->table);
++      key = HKEY(value, h->initval, t->htable_bits);
++      r = ahash_region(key, t->htable_bits);
++      atomic_inc(&t->uref);
++      elements = t->hregion[r].elements;
++      maxelem = t->maxelem;
++      if (elements >= maxelem) {
++              u32 e;
++              if (SET_WITH_TIMEOUT(set)) {
++                      rcu_read_unlock_bh();
++                      mtype_gc_do(set, h, t, r);
++                      rcu_read_lock_bh();
++              }
++              maxelem = h->maxelem;
++              elements = 0;
++              for (e = 0; e < ahash_numof_locks(t->htable_bits); e++)
++                      elements += t->hregion[e].elements;
++              if (elements >= maxelem && SET_WITH_FORCEADD(set))
+                       forceadd = true;
+       }
++      rcu_read_unlock_bh();
+-      t = ipset_dereference_protected(h->table, set);
+-      key = HKEY(value, h->initval, t->htable_bits);
+-      n = __ipset_dereference_protected(hbucket(t, key), 1);
++      spin_lock_bh(&t->hregion[r].lock);
++      n = rcu_dereference_bh(hbucket(t, key));
+       if (!n) {
+-              if (forceadd || set->elements >= h->maxelem)
++              if (forceadd || elements >= maxelem)
+                       goto set_full;
+               old = NULL;
+               n = kzalloc(sizeof(*n) + AHASH_INIT_SIZE * set->dsize,
+                           GFP_ATOMIC);
+-              if (!n)
+-                      return -ENOMEM;
++              if (!n) {
++                      ret = -ENOMEM;
++                      goto unlock;
++              }
+               n->size = AHASH_INIT_SIZE;
+-              set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
++              t->hregion[r].ext_size +=
++                      ext_size(AHASH_INIT_SIZE, set->dsize);
+               goto copy_elem;
+       }
+       for (i = 0; i < n->pos; i++) {
+@@ -737,19 +916,16 @@ mtype_add(struct ip_set *set, void *valu
+               }
+               data = ahash_data(n, i, set->dsize);
+               if (mtype_data_equal(data, d, &multi)) {
+-                      if (flag_exist ||
+-                          (SET_WITH_TIMEOUT(set) &&
+-                           ip_set_timeout_expired(ext_timeout(data, set)))) {
++                      if (flag_exist || SET_ELEM_EXPIRED(set, data)) {
+                               /* Just the extensions could be overwritten */
+                               j = i;
+                               goto overwrite_extensions;
+                       }
+-                      return -IPSET_ERR_EXIST;
++                      ret = -IPSET_ERR_EXIST;
++                      goto unlock;
+               }
+               /* Reuse first timed out entry */
+-              if (SET_WITH_TIMEOUT(set) &&
+-                  ip_set_timeout_expired(ext_timeout(data, set)) &&
+-                  j == -1) {
++              if (SET_ELEM_EXPIRED(set, data) && j == -1) {
+                       j = i;
+                       reuse = true;
+               }
+@@ -759,16 +935,16 @@ mtype_add(struct ip_set *set, void *valu
+               if (!deleted) {
+ #ifdef IP_SET_HASH_WITH_NETS
+                       for (i = 0; i < IPSET_NET_COUNT; i++)
+-                              mtype_del_cidr(h,
++                              mtype_del_cidr(set, h,
+                                       NCIDR_PUT(DCIDR_GET(data->cidr, i)),
+                                       i);
+ #endif
+                       ip_set_ext_destroy(set, data);
+-                      set->elements--;
++                      t->hregion[r].elements--;
+               }
+               goto copy_data;
+       }
+-      if (set->elements >= h->maxelem)
++      if (elements >= maxelem)
+               goto set_full;
+       /* Create a new slot */
+       if (n->pos >= n->size) {
+@@ -776,28 +952,32 @@ mtype_add(struct ip_set *set, void *valu
+               if (n->size >= AHASH_MAX(h)) {
+                       /* Trigger rehashing */
+                       mtype_data_next(&h->next, d);
+-                      return -EAGAIN;
++                      ret = -EAGAIN;
++                      goto resize;
+               }
+               old = n;
+               n = kzalloc(sizeof(*n) +
+                           (old->size + AHASH_INIT_SIZE) * set->dsize,
+                           GFP_ATOMIC);
+-              if (!n)
+-                      return -ENOMEM;
++              if (!n) {
++                      ret = -ENOMEM;
++                      goto unlock;
++              }
+               memcpy(n, old, sizeof(struct hbucket) +
+                      old->size * set->dsize);
+               n->size = old->size + AHASH_INIT_SIZE;
+-              set->ext_size += ext_size(AHASH_INIT_SIZE, set->dsize);
++              t->hregion[r].ext_size +=
++                      ext_size(AHASH_INIT_SIZE, set->dsize);
+       }
+ copy_elem:
+       j = n->pos++;
+       data = ahash_data(n, j, set->dsize);
+ copy_data:
+-      set->elements++;
++      t->hregion[r].elements++;
+ #ifdef IP_SET_HASH_WITH_NETS
+       for (i = 0; i < IPSET_NET_COUNT; i++)
+-              mtype_add_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
++              mtype_add_cidr(set, h, NCIDR_PUT(DCIDR_GET(d->cidr, i)), i);
+ #endif
+       memcpy(data, d, sizeof(struct mtype_elem));
+ overwrite_extensions:
+@@ -820,13 +1000,41 @@ overwrite_extensions:
+               if (old)
+                       kfree_rcu(old, rcu);
+       }
++      ret = 0;
++resize:
++      spin_unlock_bh(&t->hregion[r].lock);
++      if (atomic_read(&t->ref) && ext->target) {
++              /* Resize is in process and kernel side add, save values */
++              struct mtype_resize_ad *x;
++
++              x = kzalloc(sizeof(struct mtype_resize_ad), GFP_ATOMIC);
++              if (!x)
++                      /* Don't bother */
++                      goto out;
++              x->ad = IPSET_ADD;
++              memcpy(&x->d, value, sizeof(struct mtype_elem));
++              memcpy(&x->ext, ext, sizeof(struct ip_set_ext));
++              memcpy(&x->mext, mext, sizeof(struct ip_set_ext));
++              x->flags = flags;
++              spin_lock_bh(&set->lock);
++              list_add_tail(&x->list, &h->ad);
++              spin_unlock_bh(&set->lock);
++      }
++      goto out;
+-      return 0;
+ set_full:
+       if (net_ratelimit())
+               pr_warn("Set %s is full, maxelem %u reached\n",
+-                      set->name, h->maxelem);
+-      return -IPSET_ERR_HASH_FULL;
++                      set->name, maxelem);
++      ret = -IPSET_ERR_HASH_FULL;
++unlock:
++      spin_unlock_bh(&t->hregion[r].lock);
++out:
++      if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
++              pr_debug("Table destroy after resize by add: %p\n", t);
++              mtype_ahash_destroy(set, t, false);
++      }
++      return ret;
+ }
+ /* Delete an element from the hash and free up space if possible.
+@@ -840,13 +1048,23 @@ mtype_del(struct ip_set *set, void *valu
+       const struct mtype_elem *d = value;
+       struct mtype_elem *data;
+       struct hbucket *n;
+-      int i, j, k, ret = -IPSET_ERR_EXIST;
++      struct mtype_resize_ad *x = NULL;
++      int i, j, k, r, ret = -IPSET_ERR_EXIST;
+       u32 key, multi = 0;
+       size_t dsize = set->dsize;
+-      t = ipset_dereference_protected(h->table, set);
++      /* Userspace add and resize is excluded by the mutex.
++       * Kernelspace add does not trigger resize.
++       */
++      rcu_read_lock_bh();
++      t = rcu_dereference_bh(h->table);
+       key = HKEY(value, h->initval, t->htable_bits);
+-      n = __ipset_dereference_protected(hbucket(t, key), 1);
++      r = ahash_region(key, t->htable_bits);
++      atomic_inc(&t->uref);
++      rcu_read_unlock_bh();
++
++      spin_lock_bh(&t->hregion[r].lock);
++      n = rcu_dereference_bh(hbucket(t, key));
+       if (!n)
+               goto out;
+       for (i = 0, k = 0; i < n->pos; i++) {
+@@ -857,8 +1075,7 @@ mtype_del(struct ip_set *set, void *valu
+               data = ahash_data(n, i, dsize);
+               if (!mtype_data_equal(data, d, &multi))
+                       continue;
+-              if (SET_WITH_TIMEOUT(set) &&
+-                  ip_set_timeout_expired(ext_timeout(data, set)))
++              if (SET_ELEM_EXPIRED(set, data))
+                       goto out;
+               ret = 0;
+@@ -866,20 +1083,33 @@ mtype_del(struct ip_set *set, void *valu
+               smp_mb__after_atomic();
+               if (i + 1 == n->pos)
+                       n->pos--;
+-              set->elements--;
++              t->hregion[r].elements--;
+ #ifdef IP_SET_HASH_WITH_NETS
+               for (j = 0; j < IPSET_NET_COUNT; j++)
+-                      mtype_del_cidr(h, NCIDR_PUT(DCIDR_GET(d->cidr, j)),
+-                                     j);
++                      mtype_del_cidr(set, h,
++                                     NCIDR_PUT(DCIDR_GET(d->cidr, j)), j);
+ #endif
+               ip_set_ext_destroy(set, data);
++              if (atomic_read(&t->ref) && ext->target) {
++                      /* Resize is in process and kernel side del,
++                       * save values
++                       */
++                      x = kzalloc(sizeof(struct mtype_resize_ad),
++                                  GFP_ATOMIC);
++                      if (x) {
++                              x->ad = IPSET_DEL;
++                              memcpy(&x->d, value,
++                                     sizeof(struct mtype_elem));
++                              x->flags = flags;
++                      }
++              }
+               for (; i < n->pos; i++) {
+                       if (!test_bit(i, n->used))
+                               k++;
+               }
+               if (n->pos == 0 && k == 0) {
+-                      set->ext_size -= ext_size(n->size, dsize);
++                      t->hregion[r].ext_size -= ext_size(n->size, dsize);
+                       rcu_assign_pointer(hbucket(t, key), NULL);
+                       kfree_rcu(n, rcu);
+               } else if (k >= AHASH_INIT_SIZE) {
+@@ -898,7 +1128,8 @@ mtype_del(struct ip_set *set, void *valu
+                               k++;
+                       }
+                       tmp->pos = k;
+-                      set->ext_size -= ext_size(AHASH_INIT_SIZE, dsize);
++                      t->hregion[r].ext_size -=
++                              ext_size(AHASH_INIT_SIZE, dsize);
+                       rcu_assign_pointer(hbucket(t, key), tmp);
+                       kfree_rcu(n, rcu);
+               }
+@@ -906,6 +1137,16 @@ mtype_del(struct ip_set *set, void *valu
+       }
+ out:
++      spin_unlock_bh(&t->hregion[r].lock);
++      if (x) {
++              spin_lock_bh(&set->lock);
++              list_add(&x->list, &h->ad);
++              spin_unlock_bh(&set->lock);
++      }
++      if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
++              pr_debug("Table destroy after resize by del: %p\n", t);
++              mtype_ahash_destroy(set, t, false);
++      }
+       return ret;
+ }
+@@ -991,6 +1232,7 @@ mtype_test(struct ip_set *set, void *val
+       int i, ret = 0;
+       u32 key, multi = 0;
++      rcu_read_lock_bh();
+       t = rcu_dereference_bh(h->table);
+ #ifdef IP_SET_HASH_WITH_NETS
+       /* If we test an IP address and not a network address,
+@@ -1022,6 +1264,7 @@ mtype_test(struct ip_set *set, void *val
+                       goto out;
+       }
+ out:
++      rcu_read_unlock_bh();
+       return ret;
+ }
+@@ -1033,23 +1276,14 @@ mtype_head(struct ip_set *set, struct sk
+       const struct htable *t;
+       struct nlattr *nested;
+       size_t memsize;
++      u32 elements = 0;
++      size_t ext_size = 0;
+       u8 htable_bits;
+-      /* If any members have expired, set->elements will be wrong
+-       * mytype_expire function will update it with the right count.
+-       * we do not hold set->lock here, so grab it first.
+-       * set->elements can still be incorrect in the case of a huge set,
+-       * because elements might time out during the listing.
+-       */
+-      if (SET_WITH_TIMEOUT(set)) {
+-              spin_lock_bh(&set->lock);
+-              mtype_expire(set, h);
+-              spin_unlock_bh(&set->lock);
+-      }
+-
+       rcu_read_lock_bh();
+-      t = rcu_dereference_bh_nfnl(h->table);
+-      memsize = mtype_ahash_memsize(h, t) + set->ext_size;
++      t = rcu_dereference_bh(h->table);
++      mtype_ext_size(set, &elements, &ext_size);
++      memsize = mtype_ahash_memsize(h, t) + ext_size + set->ext_size;
+       htable_bits = t->htable_bits;
+       rcu_read_unlock_bh();
+@@ -1071,7 +1305,7 @@ mtype_head(struct ip_set *set, struct sk
+ #endif
+       if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
+           nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)) ||
+-          nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(set->elements)))
++          nla_put_net32(skb, IPSET_ATTR_ELEMENTS, htonl(elements)))
+               goto nla_put_failure;
+       if (unlikely(ip_set_put_flags(skb, set)))
+               goto nla_put_failure;
+@@ -1091,15 +1325,15 @@ mtype_uref(struct ip_set *set, struct ne
+       if (start) {
+               rcu_read_lock_bh();
+-              t = rcu_dereference_bh_nfnl(h->table);
++              t = ipset_dereference_bh_nfnl(h->table);
+               atomic_inc(&t->uref);
+               cb->args[IPSET_CB_PRIVATE] = (unsigned long)t;
+               rcu_read_unlock_bh();
+       } else if (cb->args[IPSET_CB_PRIVATE]) {
+               t = (struct htable *)cb->args[IPSET_CB_PRIVATE];
+               if (atomic_dec_and_test(&t->uref) && atomic_read(&t->ref)) {
+-                      /* Resizing didn't destroy the hash table */
+-                      pr_debug("Table destroy by dump: %p\n", t);
++                      pr_debug("Table destroy after resize "
++                               " by dump: %p\n", t);
+                       mtype_ahash_destroy(set, t, false);
+               }
+               cb->args[IPSET_CB_PRIVATE] = 0;
+@@ -1141,8 +1375,7 @@ mtype_list(const struct ip_set *set,
+                       if (!test_bit(i, n->used))
+                               continue;
+                       e = ahash_data(n, i, set->dsize);
+-                      if (SET_WITH_TIMEOUT(set) &&
+-                          ip_set_timeout_expired(ext_timeout(e, set)))
++                      if (SET_ELEM_EXPIRED(set, e))
+                               continue;
+                       pr_debug("list hash %lu hbucket %p i %u, data %p\n",
+                                cb->args[IPSET_CB_ARG0], n, i, e);
+@@ -1208,6 +1441,7 @@ static const struct ip_set_type_variant
+       .uref   = mtype_uref,
+       .resize = mtype_resize,
+       .same_set = mtype_same_set,
++      .region_lock = true,
+ };
+ #ifdef IP_SET_EMIT_CREATE
+@@ -1226,6 +1460,7 @@ IPSET_TOKEN(HTYPE, _create)(struct net *
+       size_t hsize;
+       struct htype *h;
+       struct htable *t;
++      u32 i;
+       pr_debug("Create set %s with family %s\n",
+                set->name, set->family == NFPROTO_IPV4 ? "inet" : "inet6");
+@@ -1294,6 +1529,15 @@ IPSET_TOKEN(HTYPE, _create)(struct net *
+               kfree(h);
+               return -ENOMEM;
+       }
++      t->hregion = ip_set_alloc(ahash_sizeof_regions(hbits));
++      if (!t->hregion) {
++              kfree(t);
++              kfree(h);
++              return -ENOMEM;
++      }
++      h->gc.set = set;
++      for (i = 0; i < ahash_numof_locks(hbits); i++)
++              spin_lock_init(&t->hregion[i].lock);
+       h->maxelem = maxelem;
+ #ifdef IP_SET_HASH_WITH_NETMASK
+       h->netmask = netmask;
+@@ -1304,9 +1548,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *
+       get_random_bytes(&h->initval, sizeof(h->initval));
+       t->htable_bits = hbits;
++      t->maxelem = h->maxelem / ahash_numof_locks(hbits);
+       RCU_INIT_POINTER(h->table, t);
+-      h->set = set;
++      INIT_LIST_HEAD(&h->ad);
+       set->data = h;
+ #ifndef IP_SET_PROTO_UNDEF
+       if (set->family == NFPROTO_IPV4) {
+@@ -1329,12 +1574,10 @@ IPSET_TOKEN(HTYPE, _create)(struct net *
+ #ifndef IP_SET_PROTO_UNDEF
+               if (set->family == NFPROTO_IPV4)
+ #endif
+-                      IPSET_TOKEN(HTYPE, 4_gc_init)(set,
+-                              IPSET_TOKEN(HTYPE, 4_gc));
++                      IPSET_TOKEN(HTYPE, 4_gc_init)(&h->gc);
+ #ifndef IP_SET_PROTO_UNDEF
+               else
+-                      IPSET_TOKEN(HTYPE, 6_gc_init)(set,
+-                              IPSET_TOKEN(HTYPE, 6_gc));
++                      IPSET_TOKEN(HTYPE, 6_gc_init)(&h->gc);
+ #endif
+       }
+       pr_debug("create %s hashsize %u (%u) maxelem %u: %p(%p)\n",
diff --git a/queue-5.4/netfilter-xt_hashlimit-reduce-hashlimit_mutex-scope-for-htable_put.patch b/queue-5.4/netfilter-xt_hashlimit-reduce-hashlimit_mutex-scope-for-htable_put.patch
new file mode 100644 (file)
index 0000000..4babd89
--- /dev/null
@@ -0,0 +1,78 @@
+From c4a3922d2d20c710f827d3a115ee338e8d0467df Mon Sep 17 00:00:00 2001
+From: Cong Wang <xiyou.wangcong@gmail.com>
+Date: Sun, 2 Feb 2020 20:30:52 -0800
+Subject: netfilter: xt_hashlimit: reduce hashlimit_mutex scope for htable_put()
+
+From: Cong Wang <xiyou.wangcong@gmail.com>
+
+commit c4a3922d2d20c710f827d3a115ee338e8d0467df upstream.
+
+It is unnecessary to hold hashlimit_mutex for htable_destroy()
+as it is already removed from the global hashtable and its
+refcount is already zero.
+
+Also, switch hinfo->use to refcount_t so that we don't have
+to hold the mutex until it reaches zero in htable_put().
+
+Reported-and-tested-by: syzbot+adf6c6c2be1c3a718121@syzkaller.appspotmail.com
+Acked-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ net/netfilter/xt_hashlimit.c |   12 ++++++------
+ 1 file changed, 6 insertions(+), 6 deletions(-)
+
+--- a/net/netfilter/xt_hashlimit.c
++++ b/net/netfilter/xt_hashlimit.c
+@@ -36,6 +36,7 @@
+ #include <linux/netfilter_ipv6/ip6_tables.h>
+ #include <linux/mutex.h>
+ #include <linux/kernel.h>
++#include <linux/refcount.h>
+ #include <uapi/linux/netfilter/xt_hashlimit.h>
+ #define XT_HASHLIMIT_ALL (XT_HASHLIMIT_HASH_DIP | XT_HASHLIMIT_HASH_DPT | \
+@@ -114,7 +115,7 @@ struct dsthash_ent {
+ struct xt_hashlimit_htable {
+       struct hlist_node node;         /* global list of all htables */
+-      int use;
++      refcount_t use;
+       u_int8_t family;
+       bool rnd_initialized;
+@@ -315,7 +316,7 @@ static int htable_create(struct net *net
+       for (i = 0; i < hinfo->cfg.size; i++)
+               INIT_HLIST_HEAD(&hinfo->hash[i]);
+-      hinfo->use = 1;
++      refcount_set(&hinfo->use, 1);
+       hinfo->count = 0;
+       hinfo->family = family;
+       hinfo->rnd_initialized = false;
+@@ -434,7 +435,7 @@ static struct xt_hashlimit_htable *htabl
+       hlist_for_each_entry(hinfo, &hashlimit_net->htables, node) {
+               if (!strcmp(name, hinfo->name) &&
+                   hinfo->family == family) {
+-                      hinfo->use++;
++                      refcount_inc(&hinfo->use);
+                       return hinfo;
+               }
+       }
+@@ -443,12 +444,11 @@ static struct xt_hashlimit_htable *htabl
+ static void htable_put(struct xt_hashlimit_htable *hinfo)
+ {
+-      mutex_lock(&hashlimit_mutex);
+-      if (--hinfo->use == 0) {
++      if (refcount_dec_and_mutex_lock(&hinfo->use, &hashlimit_mutex)) {
+               hlist_del(&hinfo->node);
++              mutex_unlock(&hashlimit_mutex);
+               htable_destroy(hinfo);
+       }
+-      mutex_unlock(&hashlimit_mutex);
+ }
+ /* The algorithm used is the Simple Token Bucket Filter (TBF)
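
The heart of this fix is the refcount_dec_and_mutex_lock() pattern: the
common put path drops a reference without touching hashlimit_mutex, and
the mutex is taken only when the count actually reaches zero, after
which htable_destroy() runs with no lock held. A hedged user-space
sketch of the same pattern follows; C11 atomics and a pthread mutex
stand in for refcount_t and the kernel mutex, and struct obj with its
helpers is invented for illustration.

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

static pthread_mutex_t table_mutex = PTHREAD_MUTEX_INITIALIZER;

struct obj {
        atomic_uint use;        /* stands in for the kernel refcount_t */
};

static void obj_destroy(struct obj *o)
{
        (void)o;                /* free resources, outside table_mutex */
}

/* Analog of refcount_dec_and_mutex_lock(): return false on the
 * lock-free fast path; otherwise drop the last reference under the
 * mutex and return true with the mutex still held. */
static bool dec_and_mutex_lock(atomic_uint *r, pthread_mutex_t *lock)
{
        unsigned int old = atomic_load(r);

        while (old > 1)         /* fast path: not the last reference */
                if (atomic_compare_exchange_weak(r, &old, old - 1))
                        return false;
        pthread_mutex_lock(lock);
        /* Decrement to zero under the mutex, so a concurrent lookup
         * holding the mutex cannot revive a zero-count object. */
        if (atomic_fetch_sub(r, 1) != 1) {
                pthread_mutex_unlock(lock);
                return false;
        }
        return true;
}

static void obj_put(struct obj *o)
{
        if (dec_and_mutex_lock(&o->use, &table_mutex)) {
                /* unlink o from the global table here */
                pthread_mutex_unlock(&table_mutex);
                obj_destroy(o); /* destroy without holding the mutex */
        }
}
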
diff --git a/queue-5.4/series b/queue-5.4/series
index a84269fff3c43be7a62d327aeb69dd7dcea2b96a..ae8e64bfc45fa92f4219880d908cc5717999a9a6 100644 (file)
@@ -84,3 +84,8 @@ revert-pm-devfreq-modify-the-device-name-as-devfreq-x-for-sysfs.patch
 amdgpu-gmc_v9-save-restore-sdpif-regs-during-s3.patch
 cpufreq-fix-policy-initialization-for-internal-governor-drivers.patch
 io_uring-fix-32-bit-compatability-with-sendmsg-recvmsg.patch
+netfilter-ipset-fix-info-rcu-detected-stall-in-hash_xxx-reports.patch
+net-smc-transfer-fasync_list-in-case-of-fallback.patch
+vhost-check-docket-sk_family-instead-of-call-getname.patch
+netfilter-ipset-fix-forceadd-evaluation-path.patch
+netfilter-xt_hashlimit-reduce-hashlimit_mutex-scope-for-htable_put.patch
diff --git a/queue-5.4/vhost-check-docket-sk_family-instead-of-call-getname.patch b/queue-5.4/vhost-check-docket-sk_family-instead-of-call-getname.patch
new file mode 100644 (file)
index 0000000..6977612
--- /dev/null
@@ -0,0 +1,57 @@
+From 42d84c8490f9f0931786f1623191fcab397c3d64 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Eugenio=20P=C3=A9rez?= <eperezma@redhat.com>
+Date: Fri, 21 Feb 2020 12:06:56 +0100
+Subject: vhost: Check docket sk_family instead of call getname
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Eugenio Pérez <eperezma@redhat.com>
+
+commit 42d84c8490f9f0931786f1623191fcab397c3d64 upstream.
+
+Doing so, we save one call to get data we already have in the struct.
+
+Also, since there is no guarantee that getname limits its writes to
+the size of the sockaddr_ll parameter, we add a little bit of
+security here. It should not write beyond MAX_ADDR_LEN, but syzbot
+found that ax25_getname writes more (72 bytes, the size of
+full_sockaddr_ax25, versus the 20 + 32 bytes of sockaddr_ll +
+MAX_ADDR_LEN in the syzbot repro).
+
+Fixes: 3a4d5c94e9593 ("vhost_net: a kernel-level virtio server")
+Reported-by: syzbot+f2a62d07a5198c819c7b@syzkaller.appspotmail.com
+Signed-off-by: Eugenio Pérez <eperezma@redhat.com>
+Acked-by: Michael S. Tsirkin <mst@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/vhost/net.c |   10 +---------
+ 1 file changed, 1 insertion(+), 9 deletions(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -1414,10 +1414,6 @@ static int vhost_net_release(struct inod
+ static struct socket *get_raw_socket(int fd)
+ {
+-      struct {
+-              struct sockaddr_ll sa;
+-              char  buf[MAX_ADDR_LEN];
+-      } uaddr;
+       int r;
+       struct socket *sock = sockfd_lookup(fd, &r);
+@@ -1430,11 +1426,7 @@ static struct socket *get_raw_socket(int
+               goto err;
+       }
+-      r = sock->ops->getname(sock, (struct sockaddr *)&uaddr.sa, 0);
+-      if (r < 0)
+-              goto err;
+-
+-      if (uaddr.sa.sll_family != AF_PACKET) {
++      if (sock->sk->sk_family != AF_PACKET) {
+               r = -EPFNOSUPPORT;
+               goto err;
+       }
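
The underlying point generalizes: a socket's address family is a fixed
property of the socket itself, so it can be read directly instead of
being reconstructed from a getname()-style call that writes a
protocol-sized sockaddr into a caller-supplied buffer. A small sketch
of the same check from user space, using the Linux-specific SO_DOMAIN
socket option; is_packet_socket is an illustrative name, not part of
the patch.

#include <stdio.h>
#include <sys/socket.h>

/* Ask the kernel for the socket's family directly rather than having
 * getsockname() fill a fixed-size sockaddr buffer. */
static int is_packet_socket(int fd)
{
        int domain = 0;
        socklen_t len = sizeof(domain);

        if (getsockopt(fd, SOL_SOCKET, SO_DOMAIN, &domain, &len) < 0)
                return -1;
        return domain == AF_PACKET;
}

int main(void)
{
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0)
                return 1;
        /* Prints 0 here: creating a real AF_PACKET socket would need
         * CAP_NET_RAW, so the demo probes an AF_INET socket instead. */
        printf("AF_PACKET socket? %d\n", is_packet_socket(fd));
        return 0;
}
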