4.13-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 15 Sep 2017 06:21:56 +0000 (23:21 -0700)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 15 Sep 2017 06:21:56 +0000 (23:21 -0700)
added patches:
gianfar-fix-tx-flow-control-deactivation.patch
ip6_gre-update-mtu-properly-in-ip6gre_err.patch
ip_tunnel-fix-setting-ttl-and-tos-value-in-collect_md-mode.patch
ipv6-fix-memory-leak-with-multiple-tables-during-netns-destruction.patch
ipv6-fix-typo-in-fib6_net_exit.patch
revert-net-fix-percpu-memory-leaks.patch
revert-net-use-lib-percpu_counter-api-for-fragmentation-mem-accounting.patch
sctp-fix-missing-wake-ups-in-some-situations.patch
tcp-fix-a-request-socket-leak.patch
udp-drop-head-states-only-when-all-skb-references-are-gone.patch
vhost_net-correctly-check-tx-avail-during-rx-busy-polling.patch

queue-4.13/gianfar-fix-tx-flow-control-deactivation.patch [new file with mode: 0644]
queue-4.13/ip6_gre-update-mtu-properly-in-ip6gre_err.patch [new file with mode: 0644]
queue-4.13/ip_tunnel-fix-setting-ttl-and-tos-value-in-collect_md-mode.patch [new file with mode: 0644]
queue-4.13/ipv6-fix-memory-leak-with-multiple-tables-during-netns-destruction.patch [new file with mode: 0644]
queue-4.13/ipv6-fix-typo-in-fib6_net_exit.patch [new file with mode: 0644]
queue-4.13/revert-net-fix-percpu-memory-leaks.patch [new file with mode: 0644]
queue-4.13/revert-net-use-lib-percpu_counter-api-for-fragmentation-mem-accounting.patch [new file with mode: 0644]
queue-4.13/sctp-fix-missing-wake-ups-in-some-situations.patch [new file with mode: 0644]
queue-4.13/tcp-fix-a-request-socket-leak.patch [new file with mode: 0644]
queue-4.13/udp-drop-head-states-only-when-all-skb-references-are-gone.patch [new file with mode: 0644]
queue-4.13/vhost_net-correctly-check-tx-avail-during-rx-busy-polling.patch [new file with mode: 0644]

diff --git a/queue-4.13/gianfar-fix-tx-flow-control-deactivation.patch b/queue-4.13/gianfar-fix-tx-flow-control-deactivation.patch
new file mode 100644 (file)
index 0000000..37f6bd9
--- /dev/null
@@ -0,0 +1,39 @@
+From foo@baz Thu Sep 14 23:20:44 PDT 2017
+From: Claudiu Manoil <claudiu.manoil@nxp.com>
+Date: Mon, 4 Sep 2017 10:45:28 +0300
+Subject: gianfar: Fix Tx flow control deactivation
+
+From: Claudiu Manoil <claudiu.manoil@nxp.com>
+
+
+[ Upstream commit 5d621672bc1a1e5090c1ac5432a18c79e0e13e03 ]
+
+The wrong register is checked for the Tx flow control bit; it should
+have been maccfg1, not maccfg2.
+This went unnoticed for so long probably because the impact is
+hardly visible, not to mention the tangled code from adjust_link().
+First, link flow control (i.e. handling of Rx/Tx link level pause frames)
+is disabled by default (needs to be enabled via 'ethtool -A').
+Second, maccfg2 always returns 0 for tx_flow_oldval (except for a few
+old boards), which results in Tx flow control remaining always on
+once activated.
+
+Fixes: 45b679c9a3ccd9e34f28e6ec677b812a860eb8eb ("gianfar: Implement PAUSE frame generation support")
+Signed-off-by: Claudiu Manoil <claudiu.manoil@nxp.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/freescale/gianfar.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/freescale/gianfar.c
++++ b/drivers/net/ethernet/freescale/gianfar.c
+@@ -3687,7 +3687,7 @@ static noinline void gfar_update_link_st
+               u32 tempval1 = gfar_read(&regs->maccfg1);
+               u32 tempval = gfar_read(&regs->maccfg2);
+               u32 ecntrl = gfar_read(&regs->ecntrl);
+-              u32 tx_flow_oldval = (tempval & MACCFG1_TX_FLOW);
++              u32 tx_flow_oldval = (tempval1 & MACCFG1_TX_FLOW);
+               if (phydev->duplex != priv->oldduplex) {
+                       if (!(phydev->duplex))
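
To see why the bogus check stayed hidden, here is a minimal user-space
sketch of the bit test; the register values and the MACCFG1_TX_FLOW
constant below are illustrative assumptions, not values copied from the
driver headers.

    #include <stdint.h>
    #include <stdio.h>

    /* Assumed bit position, for illustration only; the real constant
     * lives in drivers/net/ethernet/freescale/gianfar.h. */
    #define MACCFG1_TX_FLOW 0x00000010

    int main(void)
    {
        /* Hypothetical register snapshots with Tx flow control active. */
        uint32_t maccfg1 = 0x00000030; /* Rx/Tx flow control enable bits */
        uint32_t maccfg2 = 0x00007205; /* duplex and interface mode bits */

        /* Buggy read: masks the wrong register, so the "old value" is
         * (almost) always 0 and Tx flow control is never seen as on. */
        printf("tx_flow_oldval (buggy) = 0x%x\n", maccfg2 & MACCFG1_TX_FLOW);
        /* Fixed read: the enable bit is taken from maccfg1, where it lives. */
        printf("tx_flow_oldval (fixed) = 0x%x\n", maccfg1 & MACCFG1_TX_FLOW);
        return 0;
    }
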
diff --git a/queue-4.13/ip6_gre-update-mtu-properly-in-ip6gre_err.patch b/queue-4.13/ip6_gre-update-mtu-properly-in-ip6gre_err.patch
new file mode 100644 (file)
index 0000000..201195f
--- /dev/null
@@ -0,0 +1,46 @@
+From foo@baz Thu Sep 14 23:20:44 PDT 2017
+From: Xin Long <lucien.xin@gmail.com>
+Date: Tue, 5 Sep 2017 17:26:33 +0800
+Subject: ip6_gre: update mtu properly in ip6gre_err
+
+From: Xin Long <lucien.xin@gmail.com>
+
+
+[ Upstream commit 5c25f30c93fdc5bf25e62101aeaae7a4f9b421b3 ]
+
+Now when processing ICMPV6_PKT_TOOBIG, ip6gre_err only subtracts the
+offset of the gre header from the mtu info. The expected mtu of the
+gre device should also have the gre header subtracted. Otherwise, the
+next packets still can't be sent out.
+
+Jianlin found this issue when using the topo:
+  client(ip6gre)<---->(nic1)route(nic2)<----->(ip6gre)server
+
+and reducing nic2's mtu; both tcp and sctp performance with big-size
+data then dropped to 0.
+
+This patch fixes it by also subtracting the grehdr (tun->tun_hlen)
+from the mtu info when updating the gre device's mtu in ip6gre_err().
+It also needs to subtract ETH_HLEN if the gre dev's type is
+ARPHRD_ETHER.
+
+Reported-by: Jianlin Shi <jishi@redhat.com>
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/ip6_gre.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/net/ipv6/ip6_gre.c
++++ b/net/ipv6/ip6_gre.c
+@@ -432,7 +432,9 @@ static void ip6gre_err(struct sk_buff *s
+               }
+               break;
+       case ICMPV6_PKT_TOOBIG:
+-              mtu = be32_to_cpu(info) - offset;
++              mtu = be32_to_cpu(info) - offset - t->tun_hlen;
++              if (t->dev->type == ARPHRD_ETHER)
++                      mtu -= ETH_HLEN;
+               if (mtu < IPV6_MIN_MTU)
+                       mtu = IPV6_MIN_MTU;
+               t->dev->mtu = mtu;
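
To make the arithmetic concrete, the sketch below replays the fixed
computation with made-up numbers (a path MTU of 1400 reported in the
ICMP info field, a 48-byte offset, an 8-byte GRE header); none of these
values come from the patch itself.

    #include <stdio.h>

    #define IPV6_MIN_MTU 1280
    #define ETH_HLEN     14

    int main(void)
    {
        unsigned int info = 1400;  /* MTU from ICMPV6_PKT_TOOBIG (illustrative) */
        unsigned int offset = 48;  /* bytes preceding the GRE header (illustrative) */
        unsigned int tun_hlen = 8; /* GRE header length for this tunnel (illustrative) */
        int is_ether = 1;          /* dev->type == ARPHRD_ETHER */

        unsigned int mtu = info - offset - tun_hlen; /* fixed computation */
        if (is_ether)
            mtu -= ETH_HLEN;
        if (mtu < IPV6_MIN_MTU)
            mtu = IPV6_MIN_MTU;
        /* 1330 here; the old code computed 1352, still too big to pass. */
        printf("tunnel dev mtu = %u\n", mtu);
        return 0;
    }
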
diff --git a/queue-4.13/ip_tunnel-fix-setting-ttl-and-tos-value-in-collect_md-mode.patch b/queue-4.13/ip_tunnel-fix-setting-ttl-and-tos-value-in-collect_md-mode.patch
new file mode 100644 (file)
index 0000000..e512ac1
--- /dev/null
@@ -0,0 +1,36 @@
+From foo@baz Thu Sep 14 23:20:44 PDT 2017
+From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
+Date: Thu, 7 Sep 2017 14:08:34 +0800
+Subject: ip_tunnel: fix setting ttl and tos value in collect_md mode
+
+From: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
+
+
+[ Upstream commit 0f693f1995cf002432b70f43ce73f79bf8d0b6c9 ]
+
+The ttl and tos variables are declared and assigned, but are not used
+in the iptunnel_xmit() call.
+
+Fixes: cfc7381b3002 ("ip_tunnel: add collect_md mode to IPIP tunnel")
+Cc: Alexei Starovoitov <ast@fb.com>
+Signed-off-by: Haishuang Yan <yanhaishuang@cmss.chinamobile.com>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/ip_tunnel.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/ip_tunnel.c
++++ b/net/ipv4/ip_tunnel.c
+@@ -618,8 +618,8 @@ void ip_md_tunnel_xmit(struct sk_buff *s
+               ip_rt_put(rt);
+               goto tx_dropped;
+       }
+-      iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, key->tos,
+-                    key->ttl, df, !net_eq(tunnel->net, dev_net(dev)));
++      iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
++                    df, !net_eq(tunnel->net, dev_net(dev)));
+       return;
+ tx_error:
+       dev->stats.tx_errors++;
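
The substance of the fix is that the tos/ttl locals can differ from the
raw tunnel-key values once inherit/default handling has run. A toy
model of that selection follows; the "tos == 1 means inherit"
convention and the default-ttl fallback are assumptions for
illustration, not code copied from the kernel.

    #include <stdint.h>
    #include <stdio.h>

    struct tun_key { uint8_t tos, ttl; };

    int main(void)
    {
        struct tun_key key = { .tos = 1, .ttl = 0 }; /* inherit tos, default ttl */
        uint8_t inner_tos = 0x28;                    /* inner header DSCP (made up) */
        uint8_t default_ttl = 64;                    /* sysctl_ip_default_ttl */

        uint8_t tos = key.tos == 1 ? inner_tos : key.tos;
        uint8_t ttl = key.ttl == 0 ? default_ttl : key.ttl;

        /* The bug passed key.tos/key.ttl (1 and 0) straight to the xmit
         * path, discarding both adjustments. */
        printf("xmit with tos=0x%x ttl=%u (not tos=0x%x ttl=%u)\n",
               tos, ttl, key.tos, key.ttl);
        return 0;
    }
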
diff --git a/queue-4.13/ipv6-fix-memory-leak-with-multiple-tables-during-netns-destruction.patch b/queue-4.13/ipv6-fix-memory-leak-with-multiple-tables-during-netns-destruction.patch
new file mode 100644 (file)
index 0000000..718748a
--- /dev/null
@@ -0,0 +1,76 @@
+From foo@baz Thu Sep 14 23:20:44 PDT 2017
+From: Sabrina Dubroca <sd@queasysnail.net>
+Date: Fri, 8 Sep 2017 10:26:19 +0200
+Subject: ipv6: fix memory leak with multiple tables during netns destruction
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+
+[ Upstream commit ba1cc08d9488c94cb8d94f545305688b72a2a300 ]
+
+fib6_net_exit only frees the main and local tables. If another table was
+created with fib6_alloc_table, we leak it when the netns is destroyed.
+
+Fix this in the same way ip_fib_net_exit cleans up tables, by walking
+through the whole hashtable of fib6_tables. We can get rid of the
+special cases for local and main, since they're also part of the
+hashtable.
+
+Reproducer:
+    ip netns add x
+    ip -net x -6 rule add from 6003:1::/64 table 100
+    ip netns del x
+
+Reported-by: Jianlin Shi <jishi@redhat.com>
+Fixes: 58f09b78b730 ("[NETNS][IPV6] ip6_fib - make it per network namespace")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/ip6_fib.c |   25 +++++++++++++++++++------
+ 1 file changed, 19 insertions(+), 6 deletions(-)
+
+--- a/net/ipv6/ip6_fib.c
++++ b/net/ipv6/ip6_fib.c
+@@ -198,6 +198,12 @@ static void rt6_release(struct rt6_info
+       }
+ }
++static void fib6_free_table(struct fib6_table *table)
++{
++      inetpeer_invalidate_tree(&table->tb6_peers);
++      kfree(table);
++}
++
+ static void fib6_link_table(struct net *net, struct fib6_table *tb)
+ {
+       unsigned int h;
+@@ -1915,15 +1921,22 @@ out_timer:
+ static void fib6_net_exit(struct net *net)
+ {
++      unsigned int i;
++
+       rt6_ifdown(net, NULL);
+       del_timer_sync(&net->ipv6.ip6_fib_timer);
+-#ifdef CONFIG_IPV6_MULTIPLE_TABLES
+-      inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers);
+-      kfree(net->ipv6.fib6_local_tbl);
+-#endif
+-      inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers);
+-      kfree(net->ipv6.fib6_main_tbl);
++      for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
++              struct hlist_head *head = &net->ipv6.fib_table_hash[i];
++              struct hlist_node *tmp;
++              struct fib6_table *tb;
++
++              hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
++                      hlist_del(&tb->tb6_hlist);
++                      fib6_free_table(tb);
++              }
++      }
++
+       kfree(net->ipv6.fib_table_hash);
+       kfree(net->ipv6.rt6_stats);
+ }
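
The teardown loop above frees each table while iterating, which is why
it uses hlist_for_each_entry_safe(). A minimal user-space model of the
same save-the-successor pattern, with a plain singly-linked list
standing in for the kernel hlist:

    #include <stdio.h>
    #include <stdlib.h>

    struct table { int id; struct table *next; };

    /* Free every entry: the successor must be saved before free(),
     * which is what the _safe iterator variant does for hlists. */
    static void free_all(struct table **head)
    {
        struct table *t = *head, *tmp;

        while (t) {
            tmp = t->next; /* save successor before the entry dies */
            printf("freeing table %d\n", t->id);
            free(t);
            t = tmp;
        }
        *head = NULL;
    }

    int main(void)
    {
        struct table *head = NULL;
        int ids[] = { 254 /* main */, 255 /* local */, 100 /* user-created */ };

        for (unsigned i = 0; i < sizeof(ids) / sizeof(ids[0]); i++) {
            struct table *t = malloc(sizeof(*t));
            t->id = ids[i];
            t->next = head;
            head = t;
        }
        free_all(&head);
        return 0;
    }
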
diff --git a/queue-4.13/ipv6-fix-typo-in-fib6_net_exit.patch b/queue-4.13/ipv6-fix-typo-in-fib6_net_exit.patch
new file mode 100644 (file)
index 0000000..759a390
--- /dev/null
@@ -0,0 +1,31 @@
+From foo@baz Thu Sep 14 23:20:44 PDT 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Fri, 8 Sep 2017 15:48:47 -0700
+Subject: ipv6: fix typo in fib6_net_exit()
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 32a805baf0fb70b6dbedefcd7249ac7f580f9e3b ]
+
+IPv6 FIB should use FIB6_TABLE_HASHSZ, not FIB_TABLE_HASHSZ.
+
+Fixes: ba1cc08d9488 ("ipv6: fix memory leak with multiple tables during netns destruction")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/ip6_fib.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv6/ip6_fib.c
++++ b/net/ipv6/ip6_fib.c
+@@ -1926,7 +1926,7 @@ static void fib6_net_exit(struct net *ne
+       rt6_ifdown(net, NULL);
+       del_timer_sync(&net->ipv6.ip6_fib_timer);
+-      for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
++      for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
+               struct hlist_head *head = &net->ipv6.fib_table_hash[i];
+               struct hlist_node *tmp;
+               struct fib6_table *tb;
diff --git a/queue-4.13/revert-net-fix-percpu-memory-leaks.patch b/queue-4.13/revert-net-fix-percpu-memory-leaks.patch
new file mode 100644 (file)
index 0000000..1fa5b71
--- /dev/null
@@ -0,0 +1,151 @@
+From foo@baz Thu Sep 14 23:20:44 PDT 2017
+From: Jesper Dangaard Brouer <brouer@redhat.com>
+Date: Fri, 1 Sep 2017 11:26:13 +0200
+Subject: Revert "net: fix percpu memory leaks"
+
+From: Jesper Dangaard Brouer <brouer@redhat.com>
+
+
+[ Upstream commit 5a63643e583b6a9789d7a225ae076fb4e603991c ]
+
+This reverts commit 1d6119baf0610f813eb9d9580eb4fd16de5b4ceb.
+
+After reverting commit 6d7b857d541e ("net: use lib/percpu_counter API
+for fragmentation mem accounting"), there is no need for this
+fix-up patch.  As percpu_counter is no longer used, it can no
+longer leak memory.
+
+Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting")
+Fixes: 1d6119baf061 ("net: fix percpu memory leaks")
+Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/inet_frag.h                 |    7 +------
+ net/ieee802154/6lowpan/reassembly.c     |   11 +++--------
+ net/ipv4/ip_fragment.c                  |   12 +++---------
+ net/ipv6/netfilter/nf_conntrack_reasm.c |   12 +++---------
+ net/ipv6/reassembly.c                   |   12 +++---------
+ 5 files changed, 13 insertions(+), 41 deletions(-)
+
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -103,15 +103,10 @@ struct inet_frags {
+ int inet_frags_init(struct inet_frags *);
+ void inet_frags_fini(struct inet_frags *);
+-static inline int inet_frags_init_net(struct netns_frags *nf)
++static inline void inet_frags_init_net(struct netns_frags *nf)
+ {
+       atomic_set(&nf->mem, 0);
+-      return 0;
+ }
+-static inline void inet_frags_uninit_net(struct netns_frags *nf)
+-{
+-}
+-
+ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
+ void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f);
+--- a/net/ieee802154/6lowpan/reassembly.c
++++ b/net/ieee802154/6lowpan/reassembly.c
+@@ -580,19 +580,14 @@ static int __net_init lowpan_frags_init_
+ {
+       struct netns_ieee802154_lowpan *ieee802154_lowpan =
+               net_ieee802154_lowpan(net);
+-      int res;
+       ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+       ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+       ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT;
+-      res = inet_frags_init_net(&ieee802154_lowpan->frags);
+-      if (res)
+-              return res;
+-      res = lowpan_frags_ns_sysctl_register(net);
+-      if (res)
+-              inet_frags_uninit_net(&ieee802154_lowpan->frags);
+-      return res;
++      inet_frags_init_net(&ieee802154_lowpan->frags);
++
++      return lowpan_frags_ns_sysctl_register(net);
+ }
+ static void __net_exit lowpan_frags_exit_net(struct net *net)
+--- a/net/ipv4/ip_fragment.c
++++ b/net/ipv4/ip_fragment.c
+@@ -844,8 +844,6 @@ static void __init ip4_frags_ctl_registe
+ static int __net_init ipv4_frags_init_net(struct net *net)
+ {
+-      int res;
+-
+       /* Fragment cache limits.
+        *
+        * The fragment memory accounting code, (tries to) account for
+@@ -871,13 +869,9 @@ static int __net_init ipv4_frags_init_ne
+       net->ipv4.frags.max_dist = 64;
+-      res = inet_frags_init_net(&net->ipv4.frags);
+-      if (res)
+-              return res;
+-      res = ip4_frags_ns_ctl_register(net);
+-      if (res)
+-              inet_frags_uninit_net(&net->ipv4.frags);
+-      return res;
++      inet_frags_init_net(&net->ipv4.frags);
++
++      return ip4_frags_ns_ctl_register(net);
+ }
+ static void __net_exit ipv4_frags_exit_net(struct net *net)
+--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
++++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
+@@ -622,18 +622,12 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
+ static int nf_ct_net_init(struct net *net)
+ {
+-      int res;
+-
+       net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+       net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+       net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT;
+-      res = inet_frags_init_net(&net->nf_frag.frags);
+-      if (res)
+-              return res;
+-      res = nf_ct_frag6_sysctl_register(net);
+-      if (res)
+-              inet_frags_uninit_net(&net->nf_frag.frags);
+-      return res;
++      inet_frags_init_net(&net->nf_frag.frags);
++
++      return nf_ct_frag6_sysctl_register(net);
+ }
+ static void nf_ct_net_exit(struct net *net)
+--- a/net/ipv6/reassembly.c
++++ b/net/ipv6/reassembly.c
+@@ -714,19 +714,13 @@ static void ip6_frags_sysctl_unregister(
+ static int __net_init ipv6_frags_init_net(struct net *net)
+ {
+-      int res;
+-
+       net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH;
+       net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH;
+       net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT;
+-      res = inet_frags_init_net(&net->ipv6.frags);
+-      if (res)
+-              return res;
+-      res = ip6_frags_ns_sysctl_register(net);
+-      if (res)
+-              inet_frags_uninit_net(&net->ipv6.frags);
+-      return res;
++      inet_frags_init_net(&net->ipv6.frags);
++
++      return ip6_frags_ns_sysctl_register(net);
+ }
+ static void __net_exit ipv6_frags_exit_net(struct net *net)
diff --git a/queue-4.13/revert-net-use-lib-percpu_counter-api-for-fragmentation-mem-accounting.patch b/queue-4.13/revert-net-use-lib-percpu_counter-api-for-fragmentation-mem-accounting.patch
new file mode 100644 (file)
index 0000000..fbcd610
--- /dev/null
@@ -0,0 +1,141 @@
+From foo@baz Thu Sep 14 23:20:44 PDT 2017
+From: Jesper Dangaard Brouer <brouer@redhat.com>
+Date: Fri, 1 Sep 2017 11:26:08 +0200
+Subject: Revert "net: use lib/percpu_counter API for fragmentation mem accounting"
+
+From: Jesper Dangaard Brouer <brouer@redhat.com>
+
+
+[ Upstream commit fb452a1aa3fd4034d7999e309c5466ff2d7005aa ]
+
+This reverts commit 6d7b857d541ecd1d9bd997c97242d4ef94b19de2.
+
+There is a bug in the fragmentation code's use of the percpu_counter
+API that can cause issues on systems with many CPUs.
+
+The frag_mem_limit() just reads the global counter (fbc->count),
+without considering that other CPUs can each hold up to the batch
+size (130K) that hasn't been subtracted yet.  Due to the 3 MBytes
+lower thresh limit, this becomes dangerous at >=24 CPUs
+(3*1024*1024/130000=24).
+
+The correct API usage would be __percpu_counter_compare(), which does
+the right thing: it takes into account the number of (online) CPUs
+and the batch size, and falls back to __percpu_counter_sum() when
+needed.
+
+We choose to revert the use of the lib/percpu_counter API for frag
+memory accounting for several reasons:
+
+1) On systems with CPUs > 24, the heavier fully locked
+   __percpu_counter_sum() is always invoked, which will be more
+   expensive than the atomic_t that is reverted to.
+
+Given that systems with more than 24 CPUs are becoming common, this
+doesn't seem like a good option.  To mitigate this, the batch size
+could be decreased and the thresh increased.
+
+2) The add_frag_mem_limit+sub_frag_mem_limit pairs happen on the RX
+   CPU, before SKBs are pushed into sockets on remote CPUs.  Given
+   NICs can only hash on L2 part of the IP-header, the NIC-RXq's will
+   likely be limited.  Thus, there is a fair chance that the atomic
+   add+dec happen on the same CPU.
+
+A note on the revert: commit 1d6119baf061 ("net: fix percpu memory
+leaks") removed init_frag_mem_limit() and instead used
+inet_frags_init_net().  After this revert, inet_frags_uninit_net()
+becomes empty.
+
+Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting")
+Fixes: 1d6119baf061 ("net: fix percpu memory leaks")
+Signed-off-by: Jesper Dangaard Brouer <brouer@redhat.com>
+Acked-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/inet_frag.h  |   30 +++++++++---------------------
+ net/ipv4/inet_fragment.c |    4 +---
+ 2 files changed, 10 insertions(+), 24 deletions(-)
+
+--- a/include/net/inet_frag.h
++++ b/include/net/inet_frag.h
+@@ -1,14 +1,9 @@
+ #ifndef __NET_FRAG_H__
+ #define __NET_FRAG_H__
+-#include <linux/percpu_counter.h>
+-
+ struct netns_frags {
+-      /* The percpu_counter "mem" need to be cacheline aligned.
+-       *  mem.count must not share cacheline with other writers
+-       */
+-      struct percpu_counter   mem ____cacheline_aligned_in_smp;
+-
++      /* Keep atomic mem on separate cachelines in structs that include it */
++      atomic_t                mem ____cacheline_aligned_in_smp;
+       /* sysctls */
+       int                     timeout;
+       int                     high_thresh;
+@@ -110,11 +105,11 @@ void inet_frags_fini(struct inet_frags *
+ static inline int inet_frags_init_net(struct netns_frags *nf)
+ {
+-      return percpu_counter_init(&nf->mem, 0, GFP_KERNEL);
++      atomic_set(&nf->mem, 0);
++      return 0;
+ }
+ static inline void inet_frags_uninit_net(struct netns_frags *nf)
+ {
+-      percpu_counter_destroy(&nf->mem);
+ }
+ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f);
+@@ -140,31 +135,24 @@ static inline bool inet_frag_evicting(st
+ /* Memory Tracking Functions. */
+-/* The default percpu_counter batch size is not big enough to scale to
+- * fragmentation mem acct sizes.
+- * The mem size of a 64K fragment is approx:
+- *  (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes
+- */
+-static unsigned int frag_percpu_counter_batch = 130000;
+-
+ static inline int frag_mem_limit(struct netns_frags *nf)
+ {
+-      return percpu_counter_read(&nf->mem);
++      return atomic_read(&nf->mem);
+ }
+ static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
+ {
+-      percpu_counter_add_batch(&nf->mem, -i, frag_percpu_counter_batch);
++      atomic_sub(i, &nf->mem);
+ }
+ static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
+ {
+-      percpu_counter_add_batch(&nf->mem, i, frag_percpu_counter_batch);
++      atomic_add(i, &nf->mem);
+ }
+-static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
++static inline int sum_frag_mem_limit(struct netns_frags *nf)
+ {
+-      return percpu_counter_sum_positive(&nf->mem);
++      return atomic_read(&nf->mem);
+ }
+ /* RFC 3168 support :
+--- a/net/ipv4/inet_fragment.c
++++ b/net/ipv4/inet_fragment.c
+@@ -234,10 +234,8 @@ evict_again:
+       cond_resched();
+       if (read_seqretry(&f->rnd_seqlock, seq) ||
+-          percpu_counter_sum(&nf->mem))
++          sum_frag_mem_limit(nf))
+               goto evict_again;
+-
+-      percpu_counter_destroy(&nf->mem);
+ }
+ EXPORT_SYMBOL(inet_frags_exit_net);
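
The 24-CPU figure in the commit message can be checked directly: each
online CPU may hold up to one uncommitted batch, so a plain read of the
global count can be off by nr_cpus * batch. A small stand-alone
calculation using only the numbers quoted above:

    #include <stdio.h>

    int main(void)
    {
        const double batch = 130000.0;             /* frag_percpu_counter_batch */
        const double low_thresh = 3 * 1024 * 1024; /* 3 MBytes lower threshold */

        /* Worst-case drift grows linearly with the CPU count... */
        for (int cpus = 8; cpus <= 32; cpus += 8)
            printf("%2d CPUs: up to %.0f bytes unaccounted\n",
                   cpus, cpus * batch);

        /* ...and reaches the low threshold at about this many CPUs. */
        printf("threshold reached at ~%.1f CPUs\n", low_thresh / batch);
        return 0;
    }
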
diff --git a/queue-4.13/sctp-fix-missing-wake-ups-in-some-situations.patch b/queue-4.13/sctp-fix-missing-wake-ups-in-some-situations.patch
new file mode 100644 (file)
index 0000000..edd848f
--- /dev/null
@@ -0,0 +1,52 @@
+From foo@baz Thu Sep 14 23:20:44 PDT 2017
+From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Date: Fri, 8 Sep 2017 11:35:21 -0300
+Subject: sctp: fix missing wake ups in some situations
+
+From: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+
+
+[ Upstream commit 7906b00f5cd1cd484fced7fcda892176e3202c8a ]
+
+Commit fb586f25300f ("sctp: delay calls to sk_data_ready() as much as
+possible") minimized the number of wake ups that are triggered when
+the association receives a packet with multiple data chunks on it
+and/or when io_events are enabled. Commit 0970f5b36659 ("sctp: signal
+sk_data_ready earlier on data chunks reception") then moved the wake
+up to as soon as possible. The scheme thus relies on the state
+machine running later to clear the flag recording that the event was
+already generated.
+
+The issue is that there are 2 call paths that call
+sctp_ulpq_tail_event() outside of the state machine, causing the
+flag to linger and possibly omitting a needed wake up in the
+sequence.
+
+One of the call paths is when enabling SCTP_SENDER_DRY_EVENTS via
+setsockopt(SCTP_EVENTS), as noticed by Harald Welte. The other is when
+partial reliability triggers removal of chunks from the send queue when
+the application calls sendmsg().
+
+This commit fixes it by not setting the flag in case the socket is
+owned by the user, as it won't be cleaned later. This works for
+user-initiated calls and also for rx path processing.
+
+Fixes: fb586f25300f ("sctp: delay calls to sk_data_ready() as much as possible")
+Reported-by: Harald Welte <laforge@gnumonks.org>
+Signed-off-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sctp/ulpqueue.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/net/sctp/ulpqueue.c
++++ b/net/sctp/ulpqueue.c
+@@ -265,7 +265,8 @@ int sctp_ulpq_tail_event(struct sctp_ulp
+               sctp_ulpq_clear_pd(ulpq);
+       if (queue == &sk->sk_receive_queue && !sp->data_ready_signalled) {
+-              sp->data_ready_signalled = 1;
++              if (!sock_owned_by_user(sk))
++                      sp->data_ready_signalled = 1;
+               sk->sk_data_ready(sk);
+       }
+       return 1;
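
A toy model of the lingering-flag problem: a wake up is suppressed
whenever the flag is set, and only the state machine ever clears it.
The field name echoes the kernel's, but everything below is an
illustration, not kernel code.

    #include <stdbool.h>
    #include <stdio.h>

    static bool data_ready_signalled;

    static void tail_event(bool owned_by_user, bool fixed)
    {
        if (data_ready_signalled) {
            printf("wake up suppressed\n");
            return;
        }
        if (!fixed || !owned_by_user)
            data_ready_signalled = true; /* fixed code skips this when owned */
        printf("wake up delivered\n");
    }

    int main(void)
    {
        /* Buggy: a user-context call (e.g. setsockopt(SCTP_EVENTS)) sets
         * the flag, nothing clears it, and the next wake up is swallowed. */
        tail_event(true, false);
        tail_event(false, false); /* suppressed */

        data_ready_signalled = false;
        /* Fixed: the user-context call leaves the flag alone. */
        tail_event(true, true);
        tail_event(false, true);  /* delivered */
        return 0;
    }
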
diff --git a/queue-4.13/tcp-fix-a-request-socket-leak.patch b/queue-4.13/tcp-fix-a-request-socket-leak.patch
new file mode 100644 (file)
index 0000000..371fad1
--- /dev/null
@@ -0,0 +1,54 @@
+From foo@baz Thu Sep 14 23:20:44 PDT 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Fri, 8 Sep 2017 12:44:47 -0700
+Subject: tcp: fix a request socket leak
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 1f3b359f1004bd34b7b0bad70b93e3c7af92a37b ]
+
+While the cited commit fixed a possible deadlock, it added a leak
+of the request socket, since reqsk_put() must be called if the BPF
+filter decided the ACK packet must be dropped.
+
+Fixes: d624d276d1dd ("tcp: fix possible deadlock in TCP stack vs BPF filter")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_ipv4.c |    6 +++---
+ net/ipv6/tcp_ipv6.c |    6 +++---
+ 2 files changed, 6 insertions(+), 6 deletions(-)
+
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -1722,9 +1722,9 @@ process:
+                */
+               sock_hold(sk);
+               refcounted = true;
+-              if (tcp_filter(sk, skb))
+-                      goto discard_and_relse;
+-              nsk = tcp_check_req(sk, skb, req, false);
++              nsk = NULL;
++              if (!tcp_filter(sk, skb))
++                      nsk = tcp_check_req(sk, skb, req, false);
+               if (!nsk) {
+                       reqsk_put(req);
+                       goto discard_and_relse;
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -1456,9 +1456,9 @@ process:
+               }
+               sock_hold(sk);
+               refcounted = true;
+-              if (tcp_filter(sk, skb))
+-                      goto discard_and_relse;
+-              nsk = tcp_check_req(sk, skb, req, false);
++              nsk = NULL;
++              if (!tcp_filter(sk, skb))
++                      nsk = tcp_check_req(sk, skb, req, false);
+               if (!nsk) {
+                       reqsk_put(req);
+                       goto discard_and_relse;
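
The shape of the leak is easiest to see as control flow: in the old
code a filter drop jumped straight to discard_and_relse, skipping the
reqsk_put() that balances the reference taken on the request socket. A
toy model with a plain counter standing in for the refcount, purely
for illustration:

    #include <stdbool.h>
    #include <stdio.h>

    static int req_refcnt = 1; /* reference taken when req was looked up */

    static void handle(bool filter_drops, bool fixed)
    {
        bool have_nsk = false;

        if (fixed) {
            if (!filter_drops)
                have_nsk = true;      /* nsk = tcp_check_req(...) */
        } else if (filter_drops) {
            goto discard_and_relse;   /* old code: reqsk_put() skipped */
        } else {
            have_nsk = true;
        }
        if (!have_nsk)
            req_refcnt--;             /* reqsk_put(req) */
    discard_and_relse:
        printf("req refcount now %d (%s)\n", req_refcnt,
               req_refcnt == 0 ? "freed" : "leaked");
    }

    int main(void)
    {
        handle(true, false); /* old code, filter drop: reference leaked */
        req_refcnt = 1;
        handle(true, true);  /* fixed: nsk == NULL funnels into reqsk_put() */
        return 0;
    }
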
diff --git a/queue-4.13/udp-drop-head-states-only-when-all-skb-references-are-gone.patch b/queue-4.13/udp-drop-head-states-only-when-all-skb-references-are-gone.patch
new file mode 100644 (file)
index 0000000..69ed481
--- /dev/null
@@ -0,0 +1,82 @@
+From foo@baz Thu Sep 14 23:20:44 PDT 2017
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Wed, 6 Sep 2017 14:44:36 +0200
+Subject: udp: drop head states only when all skb references are gone
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+
+[ Upstream commit ca2c1418efe9f7fe37aa1f355efdf4eb293673ce ]
+
+After commit 0ddf3fb2c43d ("udp: preserve skb->dst if required
+for IP options processing") we clear the skb head states as soon
+as the skb carrying them is first processed.
+
+Since the same skb can be processed several times when MSG_PEEK
+is used, we can end up lacking the required head states, and
+eventually oopsing.
+
+Fix this by clearing the skb head state only when processing the
+last skb reference.
+
+Reported-by: Eric Dumazet <edumazet@google.com>
+Fixes: 0ddf3fb2c43d ("udp: preserve skb->dst if required for IP options processing")
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/skbuff.h |    2 +-
+ net/core/skbuff.c      |    9 +++------
+ net/ipv4/udp.c         |    5 ++++-
+ 3 files changed, 8 insertions(+), 8 deletions(-)
+
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -885,7 +885,7 @@ void kfree_skb(struct sk_buff *skb);
+ void kfree_skb_list(struct sk_buff *segs);
+ void skb_tx_error(struct sk_buff *skb);
+ void consume_skb(struct sk_buff *skb);
+-void consume_stateless_skb(struct sk_buff *skb);
++void __consume_stateless_skb(struct sk_buff *skb);
+ void  __kfree_skb(struct sk_buff *skb);
+ extern struct kmem_cache *skbuff_head_cache;
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -753,14 +753,11 @@ EXPORT_SYMBOL(consume_skb);
+  *    consume_stateless_skb - free an skbuff, assuming it is stateless
+  *    @skb: buffer to free
+  *
+- *    Works like consume_skb(), but this variant assumes that all the head
+- *    states have been already dropped.
++ *    Alike consume_skb(), but this variant assumes that this is the last
++ *    skb reference and all the head states have been already dropped
+  */
+-void consume_stateless_skb(struct sk_buff *skb)
++void __consume_stateless_skb(struct sk_buff *skb)
+ {
+-      if (!skb_unref(skb))
+-              return;
+-
+       trace_consume_skb(skb);
+       if (likely(skb->head))
+               skb_release_data(skb);
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -1386,12 +1386,15 @@ void skb_consume_udp(struct sock *sk, st
+               unlock_sock_fast(sk, slow);
+       }
++      if (!skb_unref(skb))
++              return;
++
+       /* In the more common cases we cleared the head states previously,
+        * see __udp_queue_rcv_skb().
+        */
+       if (unlikely(udp_skb_has_head_state(skb)))
+               skb_release_head_state(skb);
+-      consume_stateless_skb(skb);
++      __consume_stateless_skb(skb);
+ }
+ EXPORT_SYMBOL_GPL(skb_consume_udp);
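
A toy model of the fix: head states are released only by whoever drops
the last reference, so an skb still held elsewhere (e.g. via MSG_PEEK)
keeps its dst intact. The struct and flags below are stand-ins for
illustration, not the real sk_buff.

    #include <stdio.h>

    struct skb { int users; int has_head_state; };

    static int skb_unref(struct skb *skb)
    {
        return --skb->users == 0; /* true only for the last reference */
    }

    static void skb_consume_udp(struct skb *skb)
    {
        if (!skb_unref(skb))
            return; /* other users remain: keep head states untouched */
        if (skb->has_head_state) {
            skb->has_head_state = 0;
            printf("head states released\n");
        }
        printf("skb freed\n");
    }

    int main(void)
    {
        struct skb skb = { .users = 2, .has_head_state = 1 };

        skb_consume_udp(&skb); /* first pass (peeked): nothing released */
        skb_consume_udp(&skb); /* last reference: states dropped, skb freed */
        return 0;
    }
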
diff --git a/queue-4.13/vhost_net-correctly-check-tx-avail-during-rx-busy-polling.patch b/queue-4.13/vhost_net-correctly-check-tx-avail-during-rx-busy-polling.patch
new file mode 100644 (file)
index 0000000..0df738e
--- /dev/null
@@ -0,0 +1,47 @@
+From foo@baz Thu Sep 14 23:20:44 PDT 2017
+From: Jason Wang <jasowang@redhat.com>
+Date: Tue, 5 Sep 2017 09:22:05 +0800
+Subject: vhost_net: correctly check tx avail during rx busy polling
+
+From: Jason Wang <jasowang@redhat.com>
+
+
+[ Upstream commit 8b949bef9172ca69d918e93509a4ecb03d0355e0 ]
+
+In the past we checked tx avail through vhost_enable_notify(), which
+is wrong since it only checks whether the guest has filled more
+available buffers since the last avail idx synchronization, which was
+just done by vhost_vq_avail_empty() before. What we really want is to
+check for pending buffers in the avail ring. Fix this by calling
+vhost_vq_avail_empty() instead.
+
+This issue can be noticed by running a netperf TCP_RR benchmark as a
+client from the guest (but not the host). With this fix, TCP_RR from
+guest to localhost restores from 1375.91 trans per sec to 55235.28
+trans per sec on my laptop (Intel(R) Core(TM) i7-5600U CPU @ 2.60GHz).
+
+Fixes: 030881372460 ("vhost_net: basic polling support")
+Signed-off-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/vhost/net.c |    7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/drivers/vhost/net.c
++++ b/drivers/vhost/net.c
+@@ -634,8 +634,13 @@ static int vhost_net_rx_peek_head_len(st
+               preempt_enable();
+-              if (vhost_enable_notify(&net->dev, vq))
++              if (!vhost_vq_avail_empty(&net->dev, vq))
+                       vhost_poll_queue(&vq->poll);
++              else if (unlikely(vhost_enable_notify(&net->dev, vq))) {
++                      vhost_disable_notify(&net->dev, vq);
++                      vhost_poll_queue(&vq->poll);
++              }
++
+               mutex_unlock(&vq->mutex);
+               len = peek_head_len(rvq, sk);
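
A toy model of the fixed decision above: look at the avail ring first;
only if it seems empty, re-enable notification, and if buffers raced in
meanwhile, disable it again and poll. The booleans stand in for ring
state and return values, purely for illustration.

    #include <stdbool.h>
    #include <stdio.h>

    static void check_tx(bool ring_empty, bool raced_in)
    {
        if (!ring_empty) {
            /* Buffers already pending: the old code could miss this,
             * since vhost_enable_notify() only reports *new* buffers. */
            printf("pending tx buffers: queue tx poll\n");
        } else if (raced_in) {
            /* vhost_enable_notify() saw buffers appear meanwhile. */
            printf("race lost: disable notify, queue tx poll\n");
        } else {
            printf("ring truly empty: leave notification enabled\n");
        }
    }

    int main(void)
    {
        check_tx(false, false); /* the case the old check missed */
        check_tx(true, true);
        check_tx(true, false);
        return 0;
    }
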