4.9-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Wed, 15 Nov 2017 16:31:22 +0000 (17:31 +0100)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Wed, 15 Nov 2017 16:31:22 +0000 (17:31 +0100)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 15 Nov 2017 16:31:22 +0000 (17:31 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 15 Nov 2017 16:31:22 +0000 (17:31 +0100)
diff --git a/queue-4.9/gso-fix-payload-length-when-gso_size-is-zero.patch b/queue-4.9/gso-fix-payload-length-when-gso_size-is-zero.patch

new file mode 100644 (file)

index 0000000..90599b6
--- /dev/null
+++ b/queue-4.9/gso-fix-payload-length-when-gso_size-is-zero.patch
@@ -0,0 +1,62 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Alexey Kodanev <alexey.kodanev@oracle.com>
+Date: Fri, 6 Oct 2017 19:02:35 +0300
+Subject: gso: fix payload length when gso_size is zero
+
+From: Alexey Kodanev <alexey.kodanev@oracle.com>
+
+
+[ Upstream commit 3d0241d57c7b25bb75ac9d7a62753642264fdbce ]
+
+When gso_size is reset to zero for the tail segment in skb_segment(), later
+in ipv6_gso_segment(), __skb_udp_tunnel_segment() and gre_gso_segment()
+we will get incorrect results (payload length, pcsum) for that segment.
+inet_gso_segment() already has a check for gso_size before calculating
+payload.
+
+The issue was found with LTP vxlan & gre tests over ixgbe NIC.
+
+Fixes: 07b26c9454a2 ("gso: Support partial splitting at the frag_list pointer")
+Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
+Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/gre_offload.c |    2 +-
+ net/ipv4/udp_offload.c |    2 +-
+ net/ipv6/ip6_offload.c |    2 +-
+ 3 files changed, 3 insertions(+), 3 deletions(-)
+
+--- a/net/ipv4/gre_offload.c
++++ b/net/ipv4/gre_offload.c
+@@ -98,7 +98,7 @@ static struct sk_buff *gre_gso_segment(s
+               greh = (struct gre_base_hdr *)skb_transport_header(skb);
+               pcsum = (__sum16 *)(greh + 1);
+ 
+-              if (gso_partial) {
++              if (gso_partial && skb_is_gso(skb)) {
+                       unsigned int partial_adj;
+ 
+                       /* Adjust checksum to account for the fact that
+--- a/net/ipv4/udp_offload.c
++++ b/net/ipv4/udp_offload.c
+@@ -122,7 +122,7 @@ static struct sk_buff *__skb_udp_tunnel_
+                * will be using a length value equal to only one MSS sized
+                * segment instead of the entire frame.
+                */
+-              if (gso_partial) {
++              if (gso_partial && skb_is_gso(skb)) {
+                       uh->len = htons(skb_shinfo(skb)->gso_size +
+                                       SKB_GSO_CB(skb)->data_offset +
+                                       skb->head - (unsigned char *)uh);
+--- a/net/ipv6/ip6_offload.c
++++ b/net/ipv6/ip6_offload.c
+@@ -105,7 +105,7 @@ static struct sk_buff *ipv6_gso_segment(
+ 
+       for (skb = segs; skb; skb = skb->next) {
+               ipv6h = (struct ipv6hdr *)(skb_mac_header(skb) + nhoff);
+-              if (gso_partial)
++              if (gso_partial && skb_is_gso(skb))
+                       payload_len = skb_shinfo(skb)->gso_size +
+                                     SKB_GSO_CB(skb)->data_offset +
+                                     skb->head - (unsigned char *)(ipv6h + 1);
diff --git a/queue-4.9/ip6_gre-only-increase-err_count-for-some-certain-type-icmpv6-in-ip6gre_err.patch b/queue-4.9/ip6_gre-only-increase-err_count-for-some-certain-type-icmpv6-in-ip6gre_err.patch

new file mode 100644 (file)

index 0000000..95cd5bd
--- /dev/null
+++ b/queue-4.9/ip6_gre-only-increase-err_count-for-some-certain-type-icmpv6-in-ip6gre_err.patch
@@ -0,0 +1,64 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Xin Long <lucien.xin@gmail.com>
+Date: Thu, 26 Oct 2017 19:23:27 +0800
+Subject: ip6_gre: only increase err_count for some certain type icmpv6 in ip6gre_err
+
+From: Xin Long <lucien.xin@gmail.com>
+
+
+[ Upstream commit f8d20b46ce55cf40afb30dcef6d9288f7ef46d9b ]
+
+The similar fix in patch 'ipip: only increase err_count for some
+certain type icmp in ipip_err' is needed for ip6gre_err.
+
+In Jianlin's case, udp netperf broke even when receiving a TooBig
+icmpv6 packet.
+
+Fixes: c12b395a4664 ("gre: Support GRE over IPv6")
+Reported-by: Jianlin Shi <jishi@redhat.com>
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/ip6_gre.c |   11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+--- a/net/ipv6/ip6_gre.c
++++ b/net/ipv6/ip6_gre.c
+@@ -408,13 +408,16 @@ static void ip6gre_err(struct sk_buff *s
+       case ICMPV6_DEST_UNREACH:
+               net_dbg_ratelimited("%s: Path to destination invalid or inactive!\n",
+                                   t->parms.name);
+-              break;
++              if (code != ICMPV6_PORT_UNREACH)
++                      break;
++              return;
+       case ICMPV6_TIME_EXCEED:
+               if (code == ICMPV6_EXC_HOPLIMIT) {
+                       net_dbg_ratelimited("%s: Too small hop limit or routing loop in tunnel!\n",
+                                           t->parms.name);
++                      break;
+               }
+-              break;
++              return;
+       case ICMPV6_PARAMPROB:
+               teli = 0;
+               if (code == ICMPV6_HDR_FIELD)
+@@ -430,7 +433,7 @@ static void ip6gre_err(struct sk_buff *s
+                       net_dbg_ratelimited("%s: Recipient unable to parse tunneled packet!\n",
+                                           t->parms.name);
+               }
+-              break;
++              return;
+       case ICMPV6_PKT_TOOBIG:
+               mtu = be32_to_cpu(info) - offset - t->tun_hlen;
+               if (t->dev->type == ARPHRD_ETHER)
+@@ -438,7 +441,7 @@ static void ip6gre_err(struct sk_buff *s
+               if (mtu < IPV6_MIN_MTU)
+                       mtu = IPV6_MIN_MTU;
+               t->dev->mtu = mtu;
+-              break;
++              return;
+       }
+ 
+       if (time_before(jiffies, t->err_time + IP6TUNNEL_ERR_TIMEO))
diff --git a/queue-4.9/ip6_gre-update-dst-pmtu-if-dev-mtu-has-been-updated-by-toobig-in-__gre6_xmit.patch b/queue-4.9/ip6_gre-update-dst-pmtu-if-dev-mtu-has-been-updated-by-toobig-in-__gre6_xmit.patch

new file mode 100644 (file)

index 0000000..6999bbe
--- /dev/null
+++ b/queue-4.9/ip6_gre-update-dst-pmtu-if-dev-mtu-has-been-updated-by-toobig-in-__gre6_xmit.patch
@@ -0,0 +1,68 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Xin Long <lucien.xin@gmail.com>
+Date: Thu, 26 Oct 2017 19:27:17 +0800
+Subject: ip6_gre: update dst pmtu if dev mtu has been updated by toobig in __gre6_xmit
+
+From: Xin Long <lucien.xin@gmail.com>
+
+
+[ Upstream commit 8aec4959d832bae0889a8e2f348973b5e4abffef ]
+
+When receiving a Toobig icmpv6 packet, ip6gre_err would just set
+tunnel dev's mtu, that's not enough. For skb_dst(skb)'s pmtu may
+still be using the old value, it has no chance to be updated with
+tunnel dev's mtu.
+
+Jianlin found this issue by reducing route's mtu while running
+netperf, the performance went to 0.
+
+ip6ip6 and ip4ip6 tunnel can work well with this, as they lookup
+the upper dst and update_pmtu its pmtu or icmpv6_send a Toobig
+to upper socket after setting tunnel dev's mtu.
+
+We couldn't do that for ip6_gre, as gre's inner packet could be
+any protocol, it's difficult to handle them (like lookup upper
+dst) in a good way.
+
+So this patch is to fix it by updating skb_dst(skb)'s pmtu when
+dev->mtu < skb_dst(skb)'s pmtu in tx path. It's safe to do this
+update there, as usually dev->mtu <= skb_dst(skb)'s pmtu and no
+performance regression can be caused by this.
+
+Fixes: c12b395a4664 ("gre: Support GRE over IPv6")
+Reported-by: Jianlin Shi <jishi@redhat.com>
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/ip6_gre.c |    9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/net/ipv6/ip6_gre.c
++++ b/net/ipv6/ip6_gre.c
+@@ -508,8 +508,8 @@ static netdev_tx_t __gre6_xmit(struct sk
+                              __u32 *pmtu, __be16 proto)
+ {
+       struct ip6_tnl *tunnel = netdev_priv(dev);
+-      __be16 protocol = (dev->type == ARPHRD_ETHER) ?
+-                        htons(ETH_P_TEB) : proto;
++      struct dst_entry *dst = skb_dst(skb);
++      __be16 protocol;
+ 
+       if (dev->type == ARPHRD_ETHER)
+               IPCB(skb)->flags = 0;
+@@ -523,9 +523,14 @@ static netdev_tx_t __gre6_xmit(struct sk
+               tunnel->o_seqno++;
+ 
+       /* Push GRE header. */
++      protocol = (dev->type == ARPHRD_ETHER) ? htons(ETH_P_TEB) : proto;
+       gre_build_header(skb, tunnel->tun_hlen, tunnel->parms.o_flags,
+                        protocol, tunnel->parms.o_key, htonl(tunnel->o_seqno));
+ 
++      /* TooBig packet may have updated dst->dev's mtu */
++      if (dst && dst_mtu(dst) > dst->dev->mtu)
++              dst->ops->update_pmtu(dst, NULL, skb, dst->dev->mtu);
++
+       return ip6_tnl_xmit(skb, dev, dsfield, fl6, encap_limit, pmtu,
+                           NEXTHDR_GRE);
+ }
diff --git a/queue-4.9/ipip-only-increase-err_count-for-some-certain-type-icmp-in-ipip_err.patch b/queue-4.9/ipip-only-increase-err_count-for-some-certain-type-icmp-in-ipip_err.patch

new file mode 100644 (file)

index 0000000..9024736
--- /dev/null
+++ b/queue-4.9/ipip-only-increase-err_count-for-some-certain-type-icmp-in-ipip_err.patch
@@ -0,0 +1,128 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Xin Long <lucien.xin@gmail.com>
+Date: Thu, 26 Oct 2017 19:19:56 +0800
+Subject: ipip: only increase err_count for some certain type icmp in ipip_err
+
+From: Xin Long <lucien.xin@gmail.com>
+
+
+[ Upstream commit f3594f0a7ea36661d7fd942facd7f31a64245f1a ]
+
+t->err_count is used to count the link failure on tunnel and an err
+will be reported to user socket in tx path if t->err_count is not 0.
+udp socket could even return EHOSTUNREACH to users.
+
+Since commit fd58156e456d ("IPIP: Use ip-tunneling code.") removed
+the 'switch check' for icmp type in ipip_err(), err_count would be
+increased by the icmp packet with ICMP_EXC_FRAGTIME code. A link
+failure would be reported out due to this.
+
+In Jianlin's case, when receiving an ICMP_EXC_FRAGTIME icmp packet,
+udp netperf failed with the err:
+  send_data: data send error: No route to host (errno 113)
+
+We expect this error reported from tunnel to socket when receiving
+some certain type icmp, but not ICMP_EXC_FRAGTIME, ICMP_SR_FAILED
+or ICMP_PARAMETERPROB ones.
+
+This patch is to bring 'switch check' for icmp type back to ipip_err
+so that it only reports link failure for the right type icmp, just as
+in ipgre_err() and ipip6_err().
+
+Fixes: fd58156e456d ("IPIP: Use ip-tunneling code.")
+Reported-by: Jianlin Shi <jishi@redhat.com>
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/ipip.c |   59 +++++++++++++++++++++++++++++++++++++++-----------------
+ 1 file changed, 42 insertions(+), 17 deletions(-)
+
+--- a/net/ipv4/ipip.c
++++ b/net/ipv4/ipip.c
+@@ -128,43 +128,68 @@ static struct rtnl_link_ops ipip_link_op
+ 
+ static int ipip_err(struct sk_buff *skb, u32 info)
+ {
+-
+-/* All the routers (except for Linux) return only
+-   8 bytes of packet payload. It means, that precise relaying of
+-   ICMP in the real Internet is absolutely infeasible.
+- */
++      /* All the routers (except for Linux) return only
++       * 8 bytes of packet payload. It means, that precise relaying of
++       * ICMP in the real Internet is absolutely infeasible.
++       */
+       struct net *net = dev_net(skb->dev);
+       struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+       const struct iphdr *iph = (const struct iphdr *)skb->data;
+-      struct ip_tunnel *t;
+-      int err;
+       const int type = icmp_hdr(skb)->type;
+       const int code = icmp_hdr(skb)->code;
++      struct ip_tunnel *t;
++      int err = 0;
++
++      switch (type) {
++      case ICMP_DEST_UNREACH:
++              switch (code) {
++              case ICMP_SR_FAILED:
++                      /* Impossible event. */
++                      goto out;
++              default:
++                      /* All others are translated to HOST_UNREACH.
++                       * rfc2003 contains "deep thoughts" about NET_UNREACH,
++                       * I believe they are just ether pollution. --ANK
++                       */
++                      break;
++              }
++              break;
++
++      case ICMP_TIME_EXCEEDED:
++              if (code != ICMP_EXC_TTL)
++                      goto out;
++              break;
++
++      case ICMP_REDIRECT:
++              break;
++
++      default:
++              goto out;
++      }
+ 
+-      err = -ENOENT;
+       t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
+                            iph->daddr, iph->saddr, 0);
+-      if (!t)
++      if (!t) {
++              err = -ENOENT;
+               goto out;
++      }
+ 
+       if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
+-              ipv4_update_pmtu(skb, dev_net(skb->dev), info,
+-                               t->parms.link, 0, iph->protocol, 0);
+-              err = 0;
++              ipv4_update_pmtu(skb, net, info, t->parms.link, 0,
++                               iph->protocol, 0);
+               goto out;
+       }
+ 
+       if (type == ICMP_REDIRECT) {
+-              ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
+-                            iph->protocol, 0);
+-              err = 0;
++              ipv4_redirect(skb, net, t->parms.link, 0, iph->protocol, 0);
+               goto out;
+       }
+ 
+-      if (t->parms.iph.daddr == 0)
++      if (t->parms.iph.daddr == 0) {
++              err = -ENOENT;
+               goto out;
++      }
+ 
+-      err = 0;
+       if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
+               goto out;
+ 
diff --git a/queue-4.9/ipv6-addrconf-increment-ifp-refcount-before-ipv6_del_addr.patch b/queue-4.9/ipv6-addrconf-increment-ifp-refcount-before-ipv6_del_addr.patch

new file mode 100644 (file)

index 0000000..4cbe114
--- /dev/null
+++ b/queue-4.9/ipv6-addrconf-increment-ifp-refcount-before-ipv6_del_addr.patch
@@ -0,0 +1,95 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 30 Oct 2017 22:47:09 -0700
+Subject: ipv6: addrconf: increment ifp refcount before ipv6_del_addr()
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit e669b86945478b3d90d2d87e3793a6eed06d332f ]
+
+In the (unlikely) event fixup_permanent_addr() returns a failure,
+addrconf_permanent_addr() calls ipv6_del_addr() without the
+mandatory call to in6_ifa_hold(), leading to a refcount error,
+spotted by syzkaller :
+
+WARNING: CPU: 1 PID: 3142 at lib/refcount.c:227 refcount_dec+0x4c/0x50
+lib/refcount.c:227
+Kernel panic - not syncing: panic_on_warn set ...
+
+CPU: 1 PID: 3142 Comm: ip Not tainted 4.14.0-rc4-next-20171009+ #33
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
+Google 01/01/2011
+Call Trace:
+ __dump_stack lib/dump_stack.c:16 [inline]
+ dump_stack+0x194/0x257 lib/dump_stack.c:52
+ panic+0x1e4/0x41c kernel/panic.c:181
+ __warn+0x1c4/0x1e0 kernel/panic.c:544
+ report_bug+0x211/0x2d0 lib/bug.c:183
+ fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:178
+ do_trap_no_signal arch/x86/kernel/traps.c:212 [inline]
+ do_trap+0x260/0x390 arch/x86/kernel/traps.c:261
+ do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:298
+ do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:311
+ invalid_op+0x18/0x20 arch/x86/entry/entry_64.S:905
+RIP: 0010:refcount_dec+0x4c/0x50 lib/refcount.c:227
+RSP: 0018:ffff8801ca49e680 EFLAGS: 00010286
+RAX: 000000000000002c RBX: ffff8801d07cfcdc RCX: 0000000000000000
+RDX: 000000000000002c RSI: 1ffff10039493c90 RDI: ffffed0039493cc4
+RBP: ffff8801ca49e688 R08: ffff8801ca49dd70 R09: 0000000000000000
+R10: ffff8801ca49df58 R11: 0000000000000000 R12: 1ffff10039493cd9
+R13: ffff8801ca49e6e8 R14: ffff8801ca49e7e8 R15: ffff8801d07cfcdc
+ __in6_ifa_put include/net/addrconf.h:369 [inline]
+ ipv6_del_addr+0x42b/0xb60 net/ipv6/addrconf.c:1208
+ addrconf_permanent_addr net/ipv6/addrconf.c:3327 [inline]
+ addrconf_notify+0x1c66/0x2190 net/ipv6/addrconf.c:3393
+ notifier_call_chain+0x136/0x2c0 kernel/notifier.c:93
+ __raw_notifier_call_chain kernel/notifier.c:394 [inline]
+ raw_notifier_call_chain+0x2d/0x40 kernel/notifier.c:401
+ call_netdevice_notifiers_info+0x32/0x60 net/core/dev.c:1697
+ call_netdevice_notifiers net/core/dev.c:1715 [inline]
+ __dev_notify_flags+0x15d/0x430 net/core/dev.c:6843
+ dev_change_flags+0xf5/0x140 net/core/dev.c:6879
+ do_setlink+0xa1b/0x38e0 net/core/rtnetlink.c:2113
+ rtnl_newlink+0xf0d/0x1a40 net/core/rtnetlink.c:2661
+ rtnetlink_rcv_msg+0x733/0x1090 net/core/rtnetlink.c:4301
+ netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2408
+ rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:4313
+ netlink_unicast_kernel net/netlink/af_netlink.c:1273 [inline]
+ netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1299
+ netlink_sendmsg+0xa4a/0xe70 net/netlink/af_netlink.c:1862
+ sock_sendmsg_nosec net/socket.c:633 [inline]
+ sock_sendmsg+0xca/0x110 net/socket.c:643
+ ___sys_sendmsg+0x75b/0x8a0 net/socket.c:2049
+ __sys_sendmsg+0xe5/0x210 net/socket.c:2083
+ SYSC_sendmsg net/socket.c:2094 [inline]
+ SyS_sendmsg+0x2d/0x50 net/socket.c:2090
+ entry_SYSCALL_64_fastpath+0x1f/0xbe
+RIP: 0033:0x7fa9174d3320
+RSP: 002b:00007ffe302ae9e8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e
+RAX: ffffffffffffffda RBX: 00007ffe302b2ae0 RCX: 00007fa9174d3320
+RDX: 0000000000000000 RSI: 00007ffe302aea20 RDI: 0000000000000016
+RBP: 0000000000000082 R08: 0000000000000000 R09: 000000000000000f
+R10: 0000000000000000 R11: 0000000000000246 R12: 00007ffe302b32a0
+R13: 0000000000000000 R14: 00007ffe302b2ab8 R15: 00007ffe302b32b8
+
+Fixes: f1705ec197e7 ("net: ipv6: Make address flushing on ifdown optional")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: David Ahern <dsahern@gmail.com>
+Acked-by: David Ahern <dsahern@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/addrconf.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ipv6/addrconf.c
++++ b/net/ipv6/addrconf.c
+@@ -3299,6 +3299,7 @@ static void addrconf_permanent_addr(stru
+               if ((ifp->flags & IFA_F_PERMANENT) &&
+                   fixup_permanent_addr(idev, ifp) < 0) {
+                       write_unlock_bh(&idev->lock);
++                      in6_ifa_hold(ifp);
+                       ipv6_del_addr(ifp);
+                       write_lock_bh(&idev->lock);
+ 
diff --git a/queue-4.9/ipv6-flowlabel-do-not-leave-opt-tot_len-with-garbage.patch b/queue-4.9/ipv6-flowlabel-do-not-leave-opt-tot_len-with-garbage.patch

new file mode 100644 (file)

index 0000000..61b0013
--- /dev/null
+++ b/queue-4.9/ipv6-flowlabel-do-not-leave-opt-tot_len-with-garbage.patch
@@ -0,0 +1,104 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Sat, 21 Oct 2017 12:26:23 -0700
+Subject: ipv6: flowlabel: do not leave opt->tot_len with garbage
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 864e2a1f8aac05effac6063ce316b480facb46ff ]
+
+When syzkaller team brought us a C repro for the crash [1] that
+had been reported many times in the past, I finally could find
+the root cause.
+
+If FlowLabel info is merged by fl6_merge_options(), we leave
+part of the opt_space storage provided by udp/raw/l2tp with random value
+in opt_space.tot_len, unless a control message was provided at sendmsg()
+time.
+
+Then ip6_setup_cork() would use this random value to perform a kzalloc()
+call. Undefined behavior and crashes.
+
+Fix is to properly set tot_len in fl6_merge_options()
+
+At the same time, we can also avoid consuming memory and cpu cycles
+to clear it, if every option is copied via a kmemdup(). This is the
+change in ip6_setup_cork().
+
+[1]
+kasan: CONFIG_KASAN_INLINE enabled
+kasan: GPF could be caused by NULL-ptr deref or user memory access
+general protection fault: 0000 [#1] SMP KASAN
+Dumping ftrace buffer:
+   (ftrace buffer empty)
+Modules linked in:
+CPU: 0 PID: 6613 Comm: syz-executor0 Not tainted 4.14.0-rc4+ #127
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+task: ffff8801cb64a100 task.stack: ffff8801cc350000
+RIP: 0010:ip6_setup_cork+0x274/0x15c0 net/ipv6/ip6_output.c:1168
+RSP: 0018:ffff8801cc357550 EFLAGS: 00010203
+RAX: dffffc0000000000 RBX: ffff8801cc357748 RCX: 0000000000000010
+RDX: 0000000000000002 RSI: ffffffff842bd1d9 RDI: 0000000000000014
+RBP: ffff8801cc357620 R08: ffff8801cb17f380 R09: ffff8801cc357b10
+R10: ffff8801cb64a100 R11: 0000000000000000 R12: ffff8801cc357ab0
+R13: ffff8801cc357b10 R14: 0000000000000000 R15: ffff8801c3bbf0c0
+FS:  00007f9c5c459700(0000) GS:ffff8801db200000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 0000000020324000 CR3: 00000001d1cf2000 CR4: 00000000001406f0
+DR0: 0000000020001010 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000600
+Call Trace:
+ ip6_make_skb+0x282/0x530 net/ipv6/ip6_output.c:1729
+ udpv6_sendmsg+0x2769/0x3380 net/ipv6/udp.c:1340
+ inet_sendmsg+0x11f/0x5e0 net/ipv4/af_inet.c:762
+ sock_sendmsg_nosec net/socket.c:633 [inline]
+ sock_sendmsg+0xca/0x110 net/socket.c:643
+ SYSC_sendto+0x358/0x5a0 net/socket.c:1750
+ SyS_sendto+0x40/0x50 net/socket.c:1718
+ entry_SYSCALL_64_fastpath+0x1f/0xbe
+RIP: 0033:0x4520a9
+RSP: 002b:00007f9c5c458c08 EFLAGS: 00000216 ORIG_RAX: 000000000000002c
+RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 00000000004520a9
+RDX: 0000000000000001 RSI: 0000000020fd1000 RDI: 0000000000000016
+RBP: 0000000000000086 R08: 0000000020e0afe4 R09: 000000000000001c
+R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004bb1ee
+R13: 00000000ffffffff R14: 0000000000000016 R15: 0000000000000029
+Code: e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85 ea 0f 00 00 48 8d 79 04 48 b8 00 00 00 00 00 fc ff df 45 8b 74 24 04 48 89 fa 48 c1 ea 03 <0f> b6 14 02 48 89 f8 83 e0 07 83 c0 03 38 d0 7c 08 84 d2 0f 85
+RIP: ip6_setup_cork+0x274/0x15c0 net/ipv6/ip6_output.c:1168 RSP: ffff8801cc357550
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/ip6_flowlabel.c |    1 +
+ net/ipv6/ip6_output.c    |    4 ++--
+ 2 files changed, 3 insertions(+), 2 deletions(-)
+
+--- a/net/ipv6/ip6_flowlabel.c
++++ b/net/ipv6/ip6_flowlabel.c
+@@ -315,6 +315,7 @@ struct ipv6_txoptions *fl6_merge_options
+       }
+       opt_space->dst1opt = fopt->dst1opt;
+       opt_space->opt_flen = fopt->opt_flen;
++      opt_space->tot_len = fopt->tot_len;
+       return opt_space;
+ }
+ EXPORT_SYMBOL_GPL(fl6_merge_options);
+--- a/net/ipv6/ip6_output.c
++++ b/net/ipv6/ip6_output.c
+@@ -1215,11 +1215,11 @@ static int ip6_setup_cork(struct sock *s
+               if (WARN_ON(v6_cork->opt))
+                       return -EINVAL;
+ 
+-              v6_cork->opt = kzalloc(opt->tot_len, sk->sk_allocation);
++              v6_cork->opt = kzalloc(sizeof(*opt), sk->sk_allocation);
+               if (unlikely(!v6_cork->opt))
+                       return -ENOBUFS;
+ 
+-              v6_cork->opt->tot_len = opt->tot_len;
++              v6_cork->opt->tot_len = sizeof(*opt);
+               v6_cork->opt->opt_flen = opt->opt_flen;
+               v6_cork->opt->opt_nflen = opt->opt_nflen;
+ 
diff --git a/queue-4.9/l2tp-check-ps-sock-before-running-pppol2tp_session_ioctl.patch b/queue-4.9/l2tp-check-ps-sock-before-running-pppol2tp_session_ioctl.patch

new file mode 100644 (file)

index 0000000..1f45649
--- /dev/null
+++ b/queue-4.9/l2tp-check-ps-sock-before-running-pppol2tp_session_ioctl.patch
@@ -0,0 +1,36 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Guillaume Nault <g.nault@alphalink.fr>
+Date: Fri, 13 Oct 2017 19:22:35 +0200
+Subject: l2tp: check ps->sock before running pppol2tp_session_ioctl()
+
+From: Guillaume Nault <g.nault@alphalink.fr>
+
+
+[ Upstream commit 5903f594935a3841137c86b9d5b75143a5b7121c ]
+
+When pppol2tp_session_ioctl() is called by pppol2tp_tunnel_ioctl(),
+the session may be unconnected. That is, it was created by
+pppol2tp_session_create() and hasn't been connected with
+pppol2tp_connect(). In this case, ps->sock is NULL, so we need to check
+for this case in order to avoid dereferencing a NULL pointer.
+
+Fixes: 309795f4bec2 ("l2tp: Add netlink control API for L2TP")
+Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/l2tp/l2tp_ppp.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/net/l2tp/l2tp_ppp.c
++++ b/net/l2tp/l2tp_ppp.c
+@@ -993,6 +993,9 @@ static int pppol2tp_session_ioctl(struct
+                session->name, cmd, arg);
+ 
+       sk = ps->sock;
++      if (!sk)
++              return -EBADR;
++
+       sock_hold(sk);
+ 
+       switch (cmd) {
diff --git a/queue-4.9/net-call-cgroup_sk_alloc-earlier-in-sk_clone_lock.patch b/queue-4.9/net-call-cgroup_sk_alloc-earlier-in-sk_clone_lock.patch

new file mode 100644 (file)

index 0000000..a0775f9
--- /dev/null
+++ b/queue-4.9/net-call-cgroup_sk_alloc-earlier-in-sk_clone_lock.patch
@@ -0,0 +1,44 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Tue, 10 Oct 2017 19:12:33 -0700
+Subject: net: call cgroup_sk_alloc() earlier in sk_clone_lock()
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit c0576e3975084d4699b7bfef578613fb8e1144f6 ]
+
+If for some reason, the newly allocated child needs to be freed,
+we will call cgroup_put() (via sk_free_unlock_clone()) while the
+corresponding cgroup_get() was not yet done, and we will free memory
+too soon.
+
+Fixes: d979a39d7242 ("cgroup: duplicate cgroup reference when cloning sockets")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Tejun Heo <tj@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/sock.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -1526,6 +1526,7 @@ struct sock *sk_clone_lock(const struct
+               newsk->sk_userlocks     = sk->sk_userlocks & ~SOCK_BINDPORT_LOCK;
+ 
+               sock_reset_flag(newsk, SOCK_DONE);
++              cgroup_sk_alloc(&newsk->sk_cgrp_data);
+               skb_queue_head_init(&newsk->sk_error_queue);
+ 
+               filter = rcu_dereference_protected(newsk->sk_filter, 1);
+@@ -1560,8 +1561,6 @@ struct sock *sk_clone_lock(const struct
+               atomic64_set(&newsk->sk_cookie, 0);
+ 
+               mem_cgroup_sk_alloc(newsk);
+-              cgroup_sk_alloc(&newsk->sk_cgrp_data);
+-
+               /*
+                * Before updating sk_refcnt, we must commit prior changes to memory
+                * (Documentation/RCU/rculist_nulls.txt for details)
diff --git a/queue-4.9/net-unix-don-t-show-information-about-sockets-from-other-namespaces.patch b/queue-4.9/net-unix-don-t-show-information-about-sockets-from-other-namespaces.patch

new file mode 100644 (file)

index 0000000..eed98ef
--- /dev/null
+++ b/queue-4.9/net-unix-don-t-show-information-about-sockets-from-other-namespaces.patch
@@ -0,0 +1,39 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Andrei Vagin <avagin@openvz.org>
+Date: Wed, 25 Oct 2017 10:16:42 -0700
+Subject: net/unix: don't show information about sockets from other namespaces
+
+From: Andrei Vagin <avagin@openvz.org>
+
+
+[ Upstream commit 0f5da659d8f1810f44de14acf2c80cd6499623a0 ]
+
+socket_diag shows information only about sockets from a namespace where
+a diag socket lives.
+
+But if we request information about one unix socket, the kernel doesn't
+check that its netns is matched with a diag socket namespace, so any
+user can get information about any unix socket in a system. This looks
+like a bug.
+
+v2: add a Fixes tag
+
+Fixes: 51d7cccf0723 ("net: make sock diag per-namespace")
+Signed-off-by: Andrei Vagin <avagin@openvz.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/diag.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/net/unix/diag.c
++++ b/net/unix/diag.c
+@@ -257,6 +257,8 @@ static int unix_diag_get_exact(struct sk
+       err = -ENOENT;
+       if (sk == NULL)
+               goto out_nosk;
++      if (!net_eq(sock_net(sk), net))
++              goto out;
+ 
+       err = sock_diag_check_cookie(sk, req->udiag_cookie);
+       if (err)
diff --git a/queue-4.9/net_sched-avoid-matching-qdisc-with-zero-handle.patch b/queue-4.9/net_sched-avoid-matching-qdisc-with-zero-handle.patch

new file mode 100644 (file)

index 0000000..4bf7b23
--- /dev/null
+++ b/queue-4.9/net_sched-avoid-matching-qdisc-with-zero-handle.patch
@@ -0,0 +1,49 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Cong Wang <xiyou.wangcong@gmail.com>
+Date: Fri, 27 Oct 2017 22:08:56 -0700
+Subject: net_sched: avoid matching qdisc with zero handle
+
+From: Cong Wang <xiyou.wangcong@gmail.com>
+
+
+[ Upstream commit 50317fce2cc70a2bbbc4b42c31bbad510382a53c ]
+
+Davide found the following script triggers a NULL pointer
+dereference:
+
+ip l a name eth0 type dummy
+tc q a dev eth0 parent :1 handle 1: htb
+
+This is because for a freshly created netdevice noop_qdisc
+is attached and when passing 'parent :1', kernel actually
+tries to match the major handle which is 0 and noop_qdisc
+has handle 0 so is matched by mistake. Commit 69012ae425d7
+tries to fix a similar bug but still misses this case.
+
+Handle 0 is not a valid one, should be just skipped. In
+fact, kernel uses it as TC_H_UNSPEC.
+
+Fixes: 69012ae425d7 ("net: sched: fix handling of singleton qdiscs with qdisc_hash")
+Fixes: 59cc1f61f09c ("net: sched:convert qdisc linked list to hashtable")
+Reported-by: Davide Caratti <dcaratti@redhat.com>
+Cc: Jiri Kosina <jkosina@suse.cz>
+Cc: Eric Dumazet <edumazet@google.com>
+Cc: Jamal Hadi Salim <jhs@mojatatu.com>
+Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sched/sch_api.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/net/sched/sch_api.c
++++ b/net/sched/sch_api.c
+@@ -296,6 +296,8 @@ struct Qdisc *qdisc_lookup(struct net_de
+ {
+       struct Qdisc *q;
+ 
++      if (!handle)
++              return NULL;
+       q = qdisc_match_from_root(dev->qdisc, handle);
+       if (q)
+               goto out;
diff --git a/queue-4.9/netlink-do-not-set-cb_running-if-dump-s-start-errs.patch b/queue-4.9/netlink-do-not-set-cb_running-if-dump-s-start-errs.patch

new file mode 100644 (file)

index 0000000..77f7776
--- /dev/null
+++ b/queue-4.9/netlink-do-not-set-cb_running-if-dump-s-start-errs.patch
@@ -0,0 +1,61 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+Date: Mon, 9 Oct 2017 14:14:51 +0200
+Subject: netlink: do not set cb_running if dump's start() errs
+
+From: "Jason A. Donenfeld" <Jason@zx2c4.com>
+
+
+[ Upstream commit 41c87425a1ac9b633e0fcc78eb1f19640c8fb5a0 ]
+
+It turns out that multiple places can call netlink_dump(), which means
+it's still possible to dereference partially initialized values in
+dump() that were the result of a faulty returned start().
+
+This fixes the issue by calling start() _before_ setting cb_running to
+true, so that there's no chance at all of hitting the dump() function
+through any indirect paths.
+
+It also moves the call to start() to be when the mutex is held. This has
+the nice side effect of serializing invocations to start(), which is
+likely desirable anyway. It also prevents any possible other races that
+might come out of this logic.
+
+In testing this with several different pieces of tricky code to trigger
+these issues, this commit fixes all avenues that I'm aware of.
+
+Signed-off-by: Jason A. Donenfeld <Jason@zx2c4.com>
+Cc: Johannes Berg <johannes@sipsolutions.net>
+Reviewed-by: Johannes Berg <johannes@sipsolutions.net>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/netlink/af_netlink.c |   13 +++++++------
+ 1 file changed, 7 insertions(+), 6 deletions(-)
+
+--- a/net/netlink/af_netlink.c
++++ b/net/netlink/af_netlink.c
+@@ -2207,16 +2207,17 @@ int __netlink_dump_start(struct sock *ss
+       cb->min_dump_alloc = control->min_dump_alloc;
+       cb->skb = skb;
+ 
++      if (cb->start) {
++              ret = cb->start(cb);
++              if (ret)
++                      goto error_unlock;
++      }
++
+       nlk->cb_running = true;
+ 
+       mutex_unlock(nlk->cb_mutex);
+ 
+-      ret = 0;
+-      if (cb->start)
+-              ret = cb->start(cb);
+-
+-      if (!ret)
+-              ret = netlink_dump(sk);
++      ret = netlink_dump(sk);
+ 
+       sock_put(sk);
+ 
diff --git a/queue-4.9/packet-avoid-panic-in-packet_getsockopt.patch b/queue-4.9/packet-avoid-panic-in-packet_getsockopt.patch

new file mode 100644 (file)

index 0000000..ff8c826
--- /dev/null
+++ b/queue-4.9/packet-avoid-panic-in-packet_getsockopt.patch
@@ -0,0 +1,86 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Wed, 18 Oct 2017 16:14:52 -0700
+Subject: packet: avoid panic in packet_getsockopt()
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 509c7a1ecc8601f94ffba8a00889fefb239c00c6 ]
+
+syzkaller got crashes in packet_getsockopt() processing
+PACKET_ROLLOVER_STATS command while another thread was managing
+to change po->rollover.
+
+Using RCU will fix this bug. We might later add proper RCU annotations
+for sparse sake.
+
+In v2: I replaced kfree(rollover) in fanout_add() to kfree_rcu()
+variant, as spotted by John.
+
+Fixes: a9b6391814d5 ("packet: rollover statistics")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Willem de Bruijn <willemb@google.com>
+Cc: John Sperbeck <jsperbeck@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/packet/af_packet.c |   24 ++++++++++++++++--------
+ 1 file changed, 16 insertions(+), 8 deletions(-)
+
+--- a/net/packet/af_packet.c
++++ b/net/packet/af_packet.c
+@@ -1720,7 +1720,7 @@ static int fanout_add(struct sock *sk, u
+ 
+ out:
+       if (err && rollover) {
+-              kfree(rollover);
++              kfree_rcu(rollover, rcu);
+               po->rollover = NULL;
+       }
+       mutex_unlock(&fanout_mutex);
+@@ -1747,8 +1747,10 @@ static struct packet_fanout *fanout_rele
+               else
+                       f = NULL;
+ 
+-              if (po->rollover)
++              if (po->rollover) {
+                       kfree_rcu(po->rollover, rcu);
++                      po->rollover = NULL;
++              }
+       }
+       mutex_unlock(&fanout_mutex);
+ 
+@@ -3851,6 +3853,7 @@ static int packet_getsockopt(struct sock
+       void *data = &val;
+       union tpacket_stats_u st;
+       struct tpacket_rollover_stats rstats;
++      struct packet_rollover *rollover;
+ 
+       if (level != SOL_PACKET)
+               return -ENOPROTOOPT;
+@@ -3929,13 +3932,18 @@ static int packet_getsockopt(struct sock
+                      0);
+               break;
+       case PACKET_ROLLOVER_STATS:
+-              if (!po->rollover)
++              rcu_read_lock();
++              rollover = rcu_dereference(po->rollover);
++              if (rollover) {
++                      rstats.tp_all = atomic_long_read(&rollover->num);
++                      rstats.tp_huge = atomic_long_read(&rollover->num_huge);
++                      rstats.tp_failed = atomic_long_read(&rollover->num_failed);
++                      data = &rstats;
++                      lv = sizeof(rstats);
++              }
++              rcu_read_unlock();
++              if (!rollover)
+                       return -EINVAL;
+-              rstats.tp_all = atomic_long_read(&po->rollover->num);
+-              rstats.tp_huge = atomic_long_read(&po->rollover->num_huge);
+-              rstats.tp_failed = atomic_long_read(&po->rollover->num_failed);
+-              data = &rstats;
+-              lv = sizeof(rstats);
+               break;
+       case PACKET_TX_HAS_OFF:
+               val = po->tp_tx_has_off;
diff --git a/queue-4.9/ppp-fix-race-in-ppp-device-destruction.patch b/queue-4.9/ppp-fix-race-in-ppp-device-destruction.patch

new file mode 100644 (file)

index 0000000..eb7923f
--- /dev/null
+++ b/queue-4.9/ppp-fix-race-in-ppp-device-destruction.patch
@@ -0,0 +1,113 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Guillaume Nault <g.nault@alphalink.fr>
+Date: Fri, 6 Oct 2017 17:05:49 +0200
+Subject: ppp: fix race in ppp device destruction
+
+From: Guillaume Nault <g.nault@alphalink.fr>
+
+
+[ Upstream commit 6151b8b37b119e8e3a8401b080d532520c95faf4 ]
+
+ppp_release() tries to ensure that netdevices are unregistered before
+decrementing the unit refcount and running ppp_destroy_interface().
+
+This is all fine as long as the device is unregistered by
+ppp_release(): the unregister_netdevice() call, followed by
+rtnl_unlock(), guarantee that the unregistration process completes
+before rtnl_unlock() returns.
+
+However, the device may be unregistered by other means (like
+ppp_nl_dellink()). If this happens right before ppp_release() calling
+rtnl_lock(), then ppp_release() has to wait for the concurrent
+unregistration code to release the lock.
+But rtnl_unlock() releases the lock before completing the device
+unregistration process. This allows ppp_release() to proceed and
+eventually call ppp_destroy_interface() before the unregistration
+process completes. Calling free_netdev() on this partially unregistered
+device will BUG():
+
+ ------------[ cut here ]------------
+ kernel BUG at net/core/dev.c:8141!
+ invalid opcode: 0000 [#1] SMP
+
+ CPU: 1 PID: 1557 Comm: pppd Not tainted 4.14.0-rc2+ #4
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1.fc26 04/01/2014
+
+ Call Trace:
+  ppp_destroy_interface+0xd8/0xe0 [ppp_generic]
+  ppp_disconnect_channel+0xda/0x110 [ppp_generic]
+  ppp_unregister_channel+0x5e/0x110 [ppp_generic]
+  pppox_unbind_sock+0x23/0x30 [pppox]
+  pppoe_connect+0x130/0x440 [pppoe]
+  SYSC_connect+0x98/0x110
+  ? do_fcntl+0x2c0/0x5d0
+  SyS_connect+0xe/0x10
+  entry_SYSCALL_64_fastpath+0x1a/0xa5
+
+ RIP: free_netdev+0x107/0x110 RSP: ffffc28a40573d88
+ ---[ end trace ed294ff0cc40eeff ]---
+
+We could set the ->needs_free_netdev flag on PPP devices and move the
+ppp_destroy_interface() logic in the ->priv_destructor() callback. But
+that'd be quite intrusive as we'd first need to unlink from the other
+channels and units that depend on the device (the ones that used the
+PPPIOCCONNECT and PPPIOCATTACH ioctls).
+
+Instead, we can just let the netdevice hold a reference on its
+ppp_file. This reference is dropped in ->priv_destructor(), at the very
+end of the unregistration process, so that neither ppp_release() nor
+ppp_disconnect_channel() can call ppp_destroy_interface() in the interim.
+
+Reported-by: Beniamino Galvani <bgalvani@redhat.com>
+Fixes: 8cb775bc0a34 ("ppp: fix device unregistration upon netns deletion")
+Signed-off-by: Guillaume Nault <g.nault@alphalink.fr>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ppp/ppp_generic.c |   20 ++++++++++++++++++++
+ 1 file changed, 20 insertions(+)
+
+--- a/drivers/net/ppp/ppp_generic.c
++++ b/drivers/net/ppp/ppp_generic.c
+@@ -1338,7 +1338,17 @@ ppp_get_stats64(struct net_device *dev,
+ 
+ static int ppp_dev_init(struct net_device *dev)
+ {
++      struct ppp *ppp;
++
+       netdev_lockdep_set_classes(dev);
++
++      ppp = netdev_priv(dev);
++      /* Let the netdevice take a reference on the ppp file. This ensures
++       * that ppp_destroy_interface() won't run before the device gets
++       * unregistered.
++       */
++      atomic_inc(&ppp->file.refcnt);
++
+       return 0;
+ }
+ 
+@@ -1361,6 +1371,15 @@ static void ppp_dev_uninit(struct net_de
+       wake_up_interruptible(&ppp->file.rwait);
+ }
+ 
++static void ppp_dev_priv_destructor(struct net_device *dev)
++{
++      struct ppp *ppp;
++
++      ppp = netdev_priv(dev);
++      if (atomic_dec_and_test(&ppp->file.refcnt))
++              ppp_destroy_interface(ppp);
++}
++
+ static const struct net_device_ops ppp_netdev_ops = {
+       .ndo_init        = ppp_dev_init,
+       .ndo_uninit      = ppp_dev_uninit,
+@@ -1386,6 +1405,7 @@ static void ppp_setup(struct net_device
+       dev->tx_queue_len = 3;
+       dev->type = ARPHRD_PPP;
+       dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
++      dev->destructor = ppp_dev_priv_destructor;
+       netif_keep_dst(dev);
+ }
+ 
diff --git a/queue-4.9/sctp-add-the-missing-sock_owned_by_user-check-in-sctp_icmp_redirect.patch b/queue-4.9/sctp-add-the-missing-sock_owned_by_user-check-in-sctp_icmp_redirect.patch

new file mode 100644 (file)

index 0000000..5028b92
--- /dev/null
+++ b/queue-4.9/sctp-add-the-missing-sock_owned_by_user-check-in-sctp_icmp_redirect.patch
@@ -0,0 +1,48 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Xin Long <lucien.xin@gmail.com>
+Date: Wed, 18 Oct 2017 21:37:49 +0800
+Subject: sctp: add the missing sock_owned_by_user check in sctp_icmp_redirect
+
+From: Xin Long <lucien.xin@gmail.com>
+
+
+[ Upstream commit 1cc276cec9ec574d41cf47dfc0f51406b6f26ab4 ]
+
+Now sctp processes icmp redirect packet in sctp_icmp_redirect where
+it calls sctp_transport_dst_check in which tp->dst can be released.
+
+The problem is before calling sctp_transport_dst_check, it doesn't
+check sock_owned_by_user, which means tp->dst could be freed while
+a process is accessing it with owning the socket.
+
+A use-after-free issue could be triggered by this.
+
+This patch is to fix it by checking sock_owned_by_user before calling
+sctp_transport_dst_check in sctp_icmp_redirect, so that it would not
+release tp->dst if users still hold sock lock.
+
+Besides, the same issue fixed in commit 45caeaa5ac0b ("dccp/tcp: fix
+routing redirect race") on sctp also needs this check.
+
+Fixes: 55be7a9c6074 ("ipv4: Add redirect support to all protocol icmp error handlers")
+Reported-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Acked-by: Neil Horman <nhorman@tuxdriver.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sctp/input.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/sctp/input.c
++++ b/net/sctp/input.c
+@@ -421,7 +421,7 @@ void sctp_icmp_redirect(struct sock *sk,
+ {
+       struct dst_entry *dst;
+ 
+-      if (!t)
++      if (sock_owned_by_user(sk) || !t)
+               return;
+       dst = sctp_transport_dst_check(t);
+       if (dst)
diff --git a/queue-4.9/sctp-full-support-for-ipv6-ip_nonlocal_bind-ip_freebind.patch b/queue-4.9/sctp-full-support-for-ipv6-ip_nonlocal_bind-ip_freebind.patch

new file mode 100644 (file)

index 0000000..c090bfe
--- /dev/null
+++ b/queue-4.9/sctp-full-support-for-ipv6-ip_nonlocal_bind-ip_freebind.patch
@@ -0,0 +1,55 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Laszlo Toth <laszlth@gmail.com>
+Date: Mon, 23 Oct 2017 19:19:33 +0200
+Subject: sctp: full support for ipv6 ip_nonlocal_bind & IP_FREEBIND
+
+From: Laszlo Toth <laszlth@gmail.com>
+
+
+[ Upstream commit b71d21c274eff20a9db8158882b545b141b73ab8 ]
+
+Commit 9b9742022888 ("sctp: support ipv6 nonlocal bind")
+introduced support for the above options as v4 sctp did,
+so patched sctp_v6_available().
+
+In the v4 implementation it's enough, because
+sctp_inet_bind_verify() just returns with sctp_v4_available().
+However sctp_inet6_bind_verify() has an extra check before that
+for link-local scope_id, which won't respect the above options.
+
+Added the checks before calling ipv6_chk_addr(), but
+not before the validation of scope_id.
+
+before (w/ both options):
+ ./v6test fe80::10 sctp
+ bind failed, errno: 99 (Cannot assign requested address)
+ ./v6test fe80::10 tcp
+ bind success, errno: 0 (Success)
+
+after (w/ both options):
+ ./v6test fe80::10 sctp
+ bind success, errno: 0 (Success)
+
+Signed-off-by: Laszlo Toth <laszlth@gmail.com>
+Reviewed-by: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sctp/ipv6.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/net/sctp/ipv6.c
++++ b/net/sctp/ipv6.c
+@@ -881,8 +881,10 @@ static int sctp_inet6_bind_verify(struct
+                       net = sock_net(&opt->inet.sk);
+                       rcu_read_lock();
+                       dev = dev_get_by_index_rcu(net, addr->v6.sin6_scope_id);
+-                      if (!dev ||
+-                          !ipv6_chk_addr(net, &addr->v6.sin6_addr, dev, 0)) {
++                      if (!dev || !(opt->inet.freebind ||
++                                    net->ipv6.sysctl.ip_nonlocal_bind ||
++                                    ipv6_chk_addr(net, &addr->v6.sin6_addr,
++                                                  dev, 0))) {
+                               rcu_read_unlock();
+                               return 0;
+                       }
diff --git a/queue-4.9/sctp-reset-owner-sk-for-data-chunks-on-out-queues-when-migrating-a-sock.patch b/queue-4.9/sctp-reset-owner-sk-for-data-chunks-on-out-queues-when-migrating-a-sock.patch

new file mode 100644 (file)

index 0000000..60f863c
--- /dev/null
+++ b/queue-4.9/sctp-reset-owner-sk-for-data-chunks-on-out-queues-when-migrating-a-sock.patch
@@ -0,0 +1,100 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Xin Long <lucien.xin@gmail.com>
+Date: Sat, 28 Oct 2017 02:13:29 +0800
+Subject: sctp: reset owner sk for data chunks on out queues when migrating a sock
+
+From: Xin Long <lucien.xin@gmail.com>
+
+
+[ Upstream commit d04adf1b355181e737b6b1e23d801b07f0b7c4c0 ]
+
+Now when migrating sock to another one in sctp_sock_migrate(), it only
+resets owner sk for the data in receive queues, not the chunks on out
+queues.
+
+It would cause that data chunks length on the sock is not consistent
+with sk sk_wmem_alloc. When closing the sock or freeing these chunks,
+the old sk would never be freed, and the new sock may crash due to
+the overflow sk_wmem_alloc.
+
+syzbot found this issue with this series:
+
+  r0 = socket$inet_sctp()
+  sendto$inet(r0)
+  listen(r0)
+  accept4(r0)
+  close(r0)
+
+Although listen() should have returned error when one TCP-style socket
+is in connecting (I may fix this one in another patch), it could also
+be reproduced by peeling off an assoc.
+
+This issue is there since very beginning.
+
+This patch is to reset owner sk for the chunks on out queues so that
+sk sk_wmem_alloc has correct value after accept one sock or peeloff
+an assoc to one sock.
+
+Note that when resetting owner sk for chunks on outqueue, it has to
+sctp_clear_owner_w/skb_orphan chunks before changing assoc->base.sk
+first and then sctp_set_owner_w them after changing assoc->base.sk,
+because sctp_wfree and its callees are using assoc->base.sk.
+
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sctp/socket.c |   32 ++++++++++++++++++++++++++++++++
+ 1 file changed, 32 insertions(+)
+
+--- a/net/sctp/socket.c
++++ b/net/sctp/socket.c
+@@ -168,6 +168,36 @@ static inline void sctp_set_owner_w(stru
+       sk_mem_charge(sk, chunk->skb->truesize);
+ }
+ 
++static void sctp_clear_owner_w(struct sctp_chunk *chunk)
++{
++      skb_orphan(chunk->skb);
++}
++
++static void sctp_for_each_tx_datachunk(struct sctp_association *asoc,
++                                     void (*cb)(struct sctp_chunk *))
++
++{
++      struct sctp_outq *q = &asoc->outqueue;
++      struct sctp_transport *t;
++      struct sctp_chunk *chunk;
++
++      list_for_each_entry(t, &asoc->peer.transport_addr_list, transports)
++              list_for_each_entry(chunk, &t->transmitted, transmitted_list)
++                      cb(chunk);
++
++      list_for_each_entry(chunk, &q->retransmit, list)
++              cb(chunk);
++
++      list_for_each_entry(chunk, &q->sacked, list)
++              cb(chunk);
++
++      list_for_each_entry(chunk, &q->abandoned, list)
++              cb(chunk);
++
++      list_for_each_entry(chunk, &q->out_chunk_list, list)
++              cb(chunk);
++}
++
+ /* Verify that this is a valid address. */
+ static inline int sctp_verify_addr(struct sock *sk, union sctp_addr *addr,
+                                  int len)
+@@ -7826,7 +7856,9 @@ static void sctp_sock_migrate(struct soc
+        * paths won't try to lock it and then oldsk.
+        */
+       lock_sock_nested(newsk, SINGLE_DEPTH_NESTING);
++      sctp_for_each_tx_datachunk(assoc, sctp_clear_owner_w);
+       sctp_assoc_migrate(assoc, newsk);
++      sctp_for_each_tx_datachunk(assoc, sctp_set_owner_w);
+ 
+       /* If the association on the newsk is already closed before accept()
+        * is called, set RCV_SHUTDOWN flag.
diff --git a/queue-4.9/series b/queue-4.9/series

new file mode 100644 (file)

index 0000000..bf4e3af
--- /dev/null
+++ b/queue-4.9/series
@@ -0,0 +1,25 @@
+gso-fix-payload-length-when-gso_size-is-zero.patch
+tun-tap-sanitize-tunsetsndbuf-input.patch
+ipv6-addrconf-increment-ifp-refcount-before-ipv6_del_addr.patch
+netlink-do-not-set-cb_running-if-dump-s-start-errs.patch
+net-call-cgroup_sk_alloc-earlier-in-sk_clone_lock.patch
+tcp-fix-tcp_mtu_probe-vs-highest_sack.patch
+l2tp-check-ps-sock-before-running-pppol2tp_session_ioctl.patch
+tun-call-dev_get_valid_name-before-register_netdevice.patch
+sctp-add-the-missing-sock_owned_by_user-check-in-sctp_icmp_redirect.patch
+tcp-dccp-fix-ireq-opt-races.patch
+packet-avoid-panic-in-packet_getsockopt.patch
+soreuseport-fix-initialization-race.patch
+ipv6-flowlabel-do-not-leave-opt-tot_len-with-garbage.patch
+sctp-full-support-for-ipv6-ip_nonlocal_bind-ip_freebind.patch
+tcp-dccp-fix-lockdep-splat-in-inet_csk_route_req.patch
+tcp-dccp-fix-other-lockdep-splats-accessing-ireq_opt.patch
+net-unix-don-t-show-information-about-sockets-from-other-namespaces.patch
+tap-double-free-in-error-path-in-tap_open.patch
+ipip-only-increase-err_count-for-some-certain-type-icmp-in-ipip_err.patch
+ip6_gre-only-increase-err_count-for-some-certain-type-icmpv6-in-ip6gre_err.patch
+ip6_gre-update-dst-pmtu-if-dev-mtu-has-been-updated-by-toobig-in-__gre6_xmit.patch
+tun-allow-positive-return-values-on-dev_get_valid_name-call.patch
+sctp-reset-owner-sk-for-data-chunks-on-out-queues-when-migrating-a-sock.patch
+net_sched-avoid-matching-qdisc-with-zero-handle.patch
+ppp-fix-race-in-ppp-device-destruction.patch
diff --git a/queue-4.9/soreuseport-fix-initialization-race.patch b/queue-4.9/soreuseport-fix-initialization-race.patch

new file mode 100644 (file)

index 0000000..4d4fbac
--- /dev/null
+++ b/queue-4.9/soreuseport-fix-initialization-race.patch
@@ -0,0 +1,91 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Craig Gallek <kraig@google.com>
+Date: Thu, 19 Oct 2017 15:00:29 -0400
+Subject: soreuseport: fix initialization race
+
+From: Craig Gallek <kraig@google.com>
+
+
+[ Upstream commit 1b5f962e71bfad6284574655c406597535c3ea7a ]
+
+Syzkaller stumbled upon a way to trigger
+WARNING: CPU: 1 PID: 13881 at net/core/sock_reuseport.c:41
+reuseport_alloc+0x306/0x3b0 net/core/sock_reuseport.c:39
+
+There are two initialization paths for the sock_reuseport structure in a
+socket: Through the udp/tcp bind paths of SO_REUSEPORT sockets or through
+SO_ATTACH_REUSEPORT_[CE]BPF before bind.  The existing implementation
+assumed that the socket lock protected both of these paths when it actually
+only protects the SO_ATTACH_REUSEPORT path.  Syzkaller triggered this
+double allocation by running these paths concurrently.
+
+This patch moves the check for double allocation into the reuseport_alloc
+function which is protected by a global spin lock.
+
+Fixes: e32ea7e74727 ("soreuseport: fast reuseport UDP socket selection")
+Fixes: c125e80b8868 ("soreuseport: fast reuseport TCP socket selection")
+Signed-off-by: Craig Gallek <kraig@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/sock_reuseport.c  |   12 +++++++++---
+ net/ipv4/inet_hashtables.c |    5 +----
+ net/ipv4/udp.c             |    5 +----
+ 3 files changed, 11 insertions(+), 11 deletions(-)
+
+--- a/net/core/sock_reuseport.c
++++ b/net/core/sock_reuseport.c
+@@ -36,9 +36,14 @@ int reuseport_alloc(struct sock *sk)
+        * soft irq of receive path or setsockopt from process context
+        */
+       spin_lock_bh(&reuseport_lock);
+-      WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
+-                                          lockdep_is_held(&reuseport_lock)),
+-                "multiple allocations for the same socket");
++
++      /* Allocation attempts can occur concurrently via the setsockopt path
++       * and the bind/hash path.  Nothing to do when we lose the race.
++       */
++      if (rcu_dereference_protected(sk->sk_reuseport_cb,
++                                    lockdep_is_held(&reuseport_lock)))
++              goto out;
++
+       reuse = __reuseport_alloc(INIT_SOCKS);
+       if (!reuse) {
+               spin_unlock_bh(&reuseport_lock);
+@@ -49,6 +54,7 @@ int reuseport_alloc(struct sock *sk)
+       reuse->num_socks = 1;
+       rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+ 
++out:
+       spin_unlock_bh(&reuseport_lock);
+ 
+       return 0;
+--- a/net/ipv4/inet_hashtables.c
++++ b/net/ipv4/inet_hashtables.c
+@@ -455,10 +455,7 @@ static int inet_reuseport_add_sock(struc
+                       return reuseport_add_sock(sk, sk2);
+       }
+ 
+-      /* Initial allocation may have already happened via setsockopt */
+-      if (!rcu_access_pointer(sk->sk_reuseport_cb))
+-              return reuseport_alloc(sk);
+-      return 0;
++      return reuseport_alloc(sk);
+ }
+ 
+ int __inet_hash(struct sock *sk, struct sock *osk,
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -222,10 +222,7 @@ static int udp_reuseport_add_sock(struct
+               }
+       }
+ 
+-      /* Initial allocation may have already happened via setsockopt */
+-      if (!rcu_access_pointer(sk->sk_reuseport_cb))
+-              return reuseport_alloc(sk);
+-      return 0;
++      return reuseport_alloc(sk);
+ }
+ 
+ /**
diff --git a/queue-4.9/tap-double-free-in-error-path-in-tap_open.patch b/queue-4.9/tap-double-free-in-error-path-in-tap_open.patch

new file mode 100644 (file)

index 0000000..5162d1d
--- /dev/null
+++ b/queue-4.9/tap-double-free-in-error-path-in-tap_open.patch
@@ -0,0 +1,66 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Girish Moodalbail <girish.moodalbail@oracle.com>
+Date: Wed, 25 Oct 2017 00:23:04 -0700
+Subject: tap: double-free in error path in tap_open()
+
+From: Girish Moodalbail <girish.moodalbail@oracle.com>
+
+
+[ Upstream commit 78e0ea6791d7baafb8a0ca82b1bd0c7b3453c919 ]
+
+Double free of skb_array in tap module is causing kernel panic. When
+tap_set_queue() fails we free skb_array right away by calling
+skb_array_cleanup(). However, later on skb_array_cleanup() is called
+again by tap_sock_destruct through sock_put(). This patch fixes that
+issue.
+
+Fixes: 362899b8725b35e3 (macvtap: switch to use skb array)
+Signed-off-by: Girish Moodalbail <girish.moodalbail@oracle.com>
+Acked-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/macvtap.c |   18 +++++++++---------
+ 1 file changed, 9 insertions(+), 9 deletions(-)
+
+--- a/drivers/net/macvtap.c
++++ b/drivers/net/macvtap.c
+@@ -559,6 +559,10 @@ static int macvtap_open(struct inode *in
+                                            &macvtap_proto, 0);
+       if (!q)
+               goto err;
++      if (skb_array_init(&q->skb_array, dev->tx_queue_len, GFP_KERNEL)) {
++              sk_free(&q->sk);
++              goto err;
++      }
+ 
+       RCU_INIT_POINTER(q->sock.wq, &q->wq);
+       init_waitqueue_head(&q->wq.wait);
+@@ -582,22 +586,18 @@ static int macvtap_open(struct inode *in
+       if ((dev->features & NETIF_F_HIGHDMA) && (dev->features & NETIF_F_SG))
+               sock_set_flag(&q->sk, SOCK_ZEROCOPY);
+ 
+-      err = -ENOMEM;
+-      if (skb_array_init(&q->skb_array, dev->tx_queue_len, GFP_KERNEL))
+-              goto err_array;
+-
+       err = macvtap_set_queue(dev, file, q);
+-      if (err)
+-              goto err_queue;
++      if (err) {
++              /* macvtap_sock_destruct() will take care of freeing skb_array */
++              goto err_put;
++      }
+ 
+       dev_put(dev);
+ 
+       rtnl_unlock();
+       return err;
+ 
+-err_queue:
+-      skb_array_cleanup(&q->skb_array);
+-err_array:
++err_put:
+       sock_put(&q->sk);
+ err:
+       if (dev)
diff --git a/queue-4.9/tcp-dccp-fix-ireq-opt-races.patch b/queue-4.9/tcp-dccp-fix-ireq-opt-races.patch

new file mode 100644 (file)

index 0000000..47418a0
--- /dev/null
+++ b/queue-4.9/tcp-dccp-fix-ireq-opt-races.patch
@@ -0,0 +1,408 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Fri, 20 Oct 2017 09:04:13 -0700
+Subject: tcp/dccp: fix ireq->opt races
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit c92e8c02fe664155ac4234516e32544bec0f113d ]
+
+syzkaller found another bug in DCCP/TCP stacks [1]
+
+For the reasons explained in commit ce1050089c96 ("tcp/dccp: fix
+ireq->pktopts race"), we need to make sure we do not access
+ireq->opt unless we own the request sock.
+
+Note the opt field is renamed to ireq_opt to ease grep games.
+
+[1]
+BUG: KASAN: use-after-free in ip_queue_xmit+0x1687/0x18e0 net/ipv4/ip_output.c:474
+Read of size 1 at addr ffff8801c951039c by task syz-executor5/3295
+
+CPU: 1 PID: 3295 Comm: syz-executor5 Not tainted 4.14.0-rc4+ #80
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+Call Trace:
+ __dump_stack lib/dump_stack.c:16 [inline]
+ dump_stack+0x194/0x257 lib/dump_stack.c:52
+ print_address_description+0x73/0x250 mm/kasan/report.c:252
+ kasan_report_error mm/kasan/report.c:351 [inline]
+ kasan_report+0x25b/0x340 mm/kasan/report.c:409
+ __asan_report_load1_noabort+0x14/0x20 mm/kasan/report.c:427
+ ip_queue_xmit+0x1687/0x18e0 net/ipv4/ip_output.c:474
+ tcp_transmit_skb+0x1ab7/0x3840 net/ipv4/tcp_output.c:1135
+ tcp_send_ack.part.37+0x3bb/0x650 net/ipv4/tcp_output.c:3587
+ tcp_send_ack+0x49/0x60 net/ipv4/tcp_output.c:3557
+ __tcp_ack_snd_check+0x2c6/0x4b0 net/ipv4/tcp_input.c:5072
+ tcp_ack_snd_check net/ipv4/tcp_input.c:5085 [inline]
+ tcp_rcv_state_process+0x2eff/0x4850 net/ipv4/tcp_input.c:6071
+ tcp_child_process+0x342/0x990 net/ipv4/tcp_minisocks.c:816
+ tcp_v4_rcv+0x1827/0x2f80 net/ipv4/tcp_ipv4.c:1682
+ ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216
+ NF_HOOK include/linux/netfilter.h:249 [inline]
+ ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257
+ dst_input include/net/dst.h:464 [inline]
+ ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397
+ NF_HOOK include/linux/netfilter.h:249 [inline]
+ ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493
+ __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476
+ __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514
+ netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587
+ netif_receive_skb+0xae/0x390 net/core/dev.c:4611
+ tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372
+ tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766
+ tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792
+ call_write_iter include/linux/fs.h:1770 [inline]
+ new_sync_write fs/read_write.c:468 [inline]
+ __vfs_write+0x68a/0x970 fs/read_write.c:481
+ vfs_write+0x18f/0x510 fs/read_write.c:543
+ SYSC_write fs/read_write.c:588 [inline]
+ SyS_write+0xef/0x220 fs/read_write.c:580
+ entry_SYSCALL_64_fastpath+0x1f/0xbe
+RIP: 0033:0x40c341
+RSP: 002b:00007f469523ec10 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
+RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 000000000040c341
+RDX: 0000000000000037 RSI: 0000000020004000 RDI: 0000000000000015
+RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000
+R10: 00000000000f4240 R11: 0000000000000293 R12: 00000000004b7fd1
+R13: 00000000ffffffff R14: 0000000020000000 R15: 0000000000025000
+
+Allocated by task 3295:
+ save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
+ save_stack+0x43/0xd0 mm/kasan/kasan.c:447
+ set_track mm/kasan/kasan.c:459 [inline]
+ kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551
+ __do_kmalloc mm/slab.c:3725 [inline]
+ __kmalloc+0x162/0x760 mm/slab.c:3734
+ kmalloc include/linux/slab.h:498 [inline]
+ tcp_v4_save_options include/net/tcp.h:1962 [inline]
+ tcp_v4_init_req+0x2d3/0x3e0 net/ipv4/tcp_ipv4.c:1271
+ tcp_conn_request+0xf6d/0x3410 net/ipv4/tcp_input.c:6283
+ tcp_v4_conn_request+0x157/0x210 net/ipv4/tcp_ipv4.c:1313
+ tcp_rcv_state_process+0x8ea/0x4850 net/ipv4/tcp_input.c:5857
+ tcp_v4_do_rcv+0x55c/0x7d0 net/ipv4/tcp_ipv4.c:1482
+ tcp_v4_rcv+0x2d10/0x2f80 net/ipv4/tcp_ipv4.c:1711
+ ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216
+ NF_HOOK include/linux/netfilter.h:249 [inline]
+ ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257
+ dst_input include/net/dst.h:464 [inline]
+ ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397
+ NF_HOOK include/linux/netfilter.h:249 [inline]
+ ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493
+ __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476
+ __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514
+ netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587
+ netif_receive_skb+0xae/0x390 net/core/dev.c:4611
+ tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372
+ tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766
+ tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792
+ call_write_iter include/linux/fs.h:1770 [inline]
+ new_sync_write fs/read_write.c:468 [inline]
+ __vfs_write+0x68a/0x970 fs/read_write.c:481
+ vfs_write+0x18f/0x510 fs/read_write.c:543
+ SYSC_write fs/read_write.c:588 [inline]
+ SyS_write+0xef/0x220 fs/read_write.c:580
+ entry_SYSCALL_64_fastpath+0x1f/0xbe
+
+Freed by task 3306:
+ save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59
+ save_stack+0x43/0xd0 mm/kasan/kasan.c:447
+ set_track mm/kasan/kasan.c:459 [inline]
+ kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524
+ __cache_free mm/slab.c:3503 [inline]
+ kfree+0xca/0x250 mm/slab.c:3820
+ inet_sock_destruct+0x59d/0x950 net/ipv4/af_inet.c:157
+ __sk_destruct+0xfd/0x910 net/core/sock.c:1560
+ sk_destruct+0x47/0x80 net/core/sock.c:1595
+ __sk_free+0x57/0x230 net/core/sock.c:1603
+ sk_free+0x2a/0x40 net/core/sock.c:1614
+ sock_put include/net/sock.h:1652 [inline]
+ inet_csk_complete_hashdance+0xd5/0xf0 net/ipv4/inet_connection_sock.c:959
+ tcp_check_req+0xf4d/0x1620 net/ipv4/tcp_minisocks.c:765
+ tcp_v4_rcv+0x17f6/0x2f80 net/ipv4/tcp_ipv4.c:1675
+ ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216
+ NF_HOOK include/linux/netfilter.h:249 [inline]
+ ip_local_deliver+0x1ce/0x6e0 net/ipv4/ip_input.c:257
+ dst_input include/net/dst.h:464 [inline]
+ ip_rcv_finish+0x887/0x19a0 net/ipv4/ip_input.c:397
+ NF_HOOK include/linux/netfilter.h:249 [inline]
+ ip_rcv+0xc3f/0x1820 net/ipv4/ip_input.c:493
+ __netif_receive_skb_core+0x1a3e/0x34b0 net/core/dev.c:4476
+ __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4514
+ netif_receive_skb_internal+0x10b/0x670 net/core/dev.c:4587
+ netif_receive_skb+0xae/0x390 net/core/dev.c:4611
+ tun_rx_batched.isra.50+0x5ed/0x860 drivers/net/tun.c:1372
+ tun_get_user+0x249c/0x36d0 drivers/net/tun.c:1766
+ tun_chr_write_iter+0xbf/0x160 drivers/net/tun.c:1792
+ call_write_iter include/linux/fs.h:1770 [inline]
+ new_sync_write fs/read_write.c:468 [inline]
+ __vfs_write+0x68a/0x970 fs/read_write.c:481
+ vfs_write+0x18f/0x510 fs/read_write.c:543
+ SYSC_write fs/read_write.c:588 [inline]
+ SyS_write+0xef/0x220 fs/read_write.c:580
+ entry_SYSCALL_64_fastpath+0x1f/0xbe
+
+Fixes: e994b2f0fb92 ("tcp: do not lock listener to process SYN packets")
+Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/inet_sock.h         |    2 +-
+ net/dccp/ipv4.c                 |   13 ++++++++-----
+ net/ipv4/cipso_ipv4.c           |   24 +++++++-----------------
+ net/ipv4/inet_connection_sock.c |    8 +++-----
+ net/ipv4/syncookies.c           |    2 +-
+ net/ipv4/tcp_input.c            |    2 +-
+ net/ipv4/tcp_ipv4.c             |   21 ++++++++++++---------
+ 7 files changed, 33 insertions(+), 39 deletions(-)
+
+--- a/include/net/inet_sock.h
++++ b/include/net/inet_sock.h
+@@ -96,7 +96,7 @@ struct inet_request_sock {
+       kmemcheck_bitfield_end(flags);
+       u32                     ir_mark;
+       union {
+-              struct ip_options_rcu   *opt;
++              struct ip_options_rcu __rcu     *ireq_opt;
+ #if IS_ENABLED(CONFIG_IPV6)
+               struct {
+                       struct ipv6_txoptions   *ipv6_opt;
+--- a/net/dccp/ipv4.c
++++ b/net/dccp/ipv4.c
+@@ -414,8 +414,7 @@ struct sock *dccp_v4_request_recv_sock(c
+       sk_daddr_set(newsk, ireq->ir_rmt_addr);
+       sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
+       newinet->inet_saddr     = ireq->ir_loc_addr;
+-      newinet->inet_opt       = ireq->opt;
+-      ireq->opt          = NULL;
++      RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt));
+       newinet->mc_index  = inet_iif(skb);
+       newinet->mc_ttl    = ip_hdr(skb)->ttl;
+       newinet->inet_id   = jiffies;
+@@ -430,7 +429,10 @@ struct sock *dccp_v4_request_recv_sock(c
+       if (__inet_inherit_port(sk, newsk) < 0)
+               goto put_and_exit;
+       *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+-
++      if (*own_req)
++              ireq->ireq_opt = NULL;
++      else
++              newinet->inet_opt = NULL;
+       return newsk;
+ 
+ exit_overflow:
+@@ -441,6 +443,7 @@ exit:
+       __NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENDROPS);
+       return NULL;
+ put_and_exit:
++      newinet->inet_opt = NULL;
+       inet_csk_prepare_forced_close(newsk);
+       dccp_done(newsk);
+       goto exit;
+@@ -492,7 +495,7 @@ static int dccp_v4_send_response(const s
+                                                             ireq->ir_rmt_addr);
+               err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+                                           ireq->ir_rmt_addr,
+-                                          ireq->opt);
++                                          rcu_dereference(ireq->ireq_opt));
+               err = net_xmit_eval(err);
+       }
+ 
+@@ -548,7 +551,7 @@ out:
+ static void dccp_v4_reqsk_destructor(struct request_sock *req)
+ {
+       dccp_feat_list_purge(&dccp_rsk(req)->dreq_featneg);
+-      kfree(inet_rsk(req)->opt);
++      kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
+ }
+ 
+ void dccp_syn_ack_timeout(const struct request_sock *req)
+--- a/net/ipv4/cipso_ipv4.c
++++ b/net/ipv4/cipso_ipv4.c
+@@ -1943,7 +1943,7 @@ int cipso_v4_req_setattr(struct request_
+       buf = NULL;
+ 
+       req_inet = inet_rsk(req);
+-      opt = xchg(&req_inet->opt, opt);
++      opt = xchg((__force struct ip_options_rcu **)&req_inet->ireq_opt, opt);
+       if (opt)
+               kfree_rcu(opt, rcu);
+ 
+@@ -1965,11 +1965,13 @@ req_setattr_failure:
+  * values on failure.
+  *
+  */
+-static int cipso_v4_delopt(struct ip_options_rcu **opt_ptr)
++static int cipso_v4_delopt(struct ip_options_rcu __rcu **opt_ptr)
+ {
++      struct ip_options_rcu *opt = rcu_dereference_protected(*opt_ptr, 1);
+       int hdr_delta = 0;
+-      struct ip_options_rcu *opt = *opt_ptr;
+ 
++      if (!opt || opt->opt.cipso == 0)
++              return 0;
+       if (opt->opt.srr || opt->opt.rr || opt->opt.ts || opt->opt.router_alert) {
+               u8 cipso_len;
+               u8 cipso_off;
+@@ -2031,14 +2033,10 @@ static int cipso_v4_delopt(struct ip_opt
+  */
+ void cipso_v4_sock_delattr(struct sock *sk)
+ {
+-      int hdr_delta;
+-      struct ip_options_rcu *opt;
+       struct inet_sock *sk_inet;
++      int hdr_delta;
+ 
+       sk_inet = inet_sk(sk);
+-      opt = rcu_dereference_protected(sk_inet->inet_opt, 1);
+-      if (!opt || opt->opt.cipso == 0)
+-              return;
+ 
+       hdr_delta = cipso_v4_delopt(&sk_inet->inet_opt);
+       if (sk_inet->is_icsk && hdr_delta > 0) {
+@@ -2058,15 +2056,7 @@ void cipso_v4_sock_delattr(struct sock *
+  */
+ void cipso_v4_req_delattr(struct request_sock *req)
+ {
+-      struct ip_options_rcu *opt;
+-      struct inet_request_sock *req_inet;
+-
+-      req_inet = inet_rsk(req);
+-      opt = req_inet->opt;
+-      if (!opt || opt->opt.cipso == 0)
+-              return;
+-
+-      cipso_v4_delopt(&req_inet->opt);
++      cipso_v4_delopt(&inet_rsk(req)->ireq_opt);
+ }
+ 
+ /**
+--- a/net/ipv4/inet_connection_sock.c
++++ b/net/ipv4/inet_connection_sock.c
+@@ -407,9 +407,10 @@ struct dst_entry *inet_csk_route_req(con
+ {
+       const struct inet_request_sock *ireq = inet_rsk(req);
+       struct net *net = read_pnet(&ireq->ireq_net);
+-      struct ip_options_rcu *opt = ireq->opt;
++      struct ip_options_rcu *opt;
+       struct rtable *rt;
+ 
++      opt = rcu_dereference(ireq->ireq_opt);
+       flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
+                          RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+                          sk->sk_protocol, inet_sk_flowi_flags(sk),
+@@ -443,10 +444,9 @@ struct dst_entry *inet_csk_route_child_s
+       struct flowi4 *fl4;
+       struct rtable *rt;
+ 
++      opt = rcu_dereference(ireq->ireq_opt);
+       fl4 = &newinet->cork.fl.u.ip4;
+ 
+-      rcu_read_lock();
+-      opt = rcu_dereference(newinet->inet_opt);
+       flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
+                          RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+                          sk->sk_protocol, inet_sk_flowi_flags(sk),
+@@ -459,13 +459,11 @@ struct dst_entry *inet_csk_route_child_s
+               goto no_route;
+       if (opt && opt->opt.is_strictroute && rt->rt_uses_gateway)
+               goto route_err;
+-      rcu_read_unlock();
+       return &rt->dst;
+ 
+ route_err:
+       ip_rt_put(rt);
+ no_route:
+-      rcu_read_unlock();
+       __IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES);
+       return NULL;
+ }
+--- a/net/ipv4/syncookies.c
++++ b/net/ipv4/syncookies.c
+@@ -354,7 +354,7 @@ struct sock *cookie_v4_check(struct sock
+       /* We throwed the options of the initial SYN away, so we hope
+        * the ACK carries the same options again (see RFC1122 4.2.3.8)
+        */
+-      ireq->opt = tcp_v4_save_options(skb);
++      RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(skb));
+ 
+       if (security_inet_conn_request(sk, skb, req)) {
+               reqsk_free(req);
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -6237,7 +6237,7 @@ struct request_sock *inet_reqsk_alloc(co
+               struct inet_request_sock *ireq = inet_rsk(req);
+ 
+               kmemcheck_annotate_bitfield(ireq, flags);
+-              ireq->opt = NULL;
++              ireq->ireq_opt = NULL;
+ #if IS_ENABLED(CONFIG_IPV6)
+               ireq->pktopts = NULL;
+ #endif
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -861,7 +861,7 @@ static int tcp_v4_send_synack(const stru
+ 
+               err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+                                           ireq->ir_rmt_addr,
+-                                          ireq->opt);
++                                          rcu_dereference(ireq->ireq_opt));
+               err = net_xmit_eval(err);
+       }
+ 
+@@ -873,7 +873,7 @@ static int tcp_v4_send_synack(const stru
+  */
+ static void tcp_v4_reqsk_destructor(struct request_sock *req)
+ {
+-      kfree(inet_rsk(req)->opt);
++      kfree(rcu_dereference_protected(inet_rsk(req)->ireq_opt, 1));
+ }
+ 
+ #ifdef CONFIG_TCP_MD5SIG
+@@ -1199,7 +1199,7 @@ static void tcp_v4_init_req(struct reque
+ 
+       sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+       sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
+-      ireq->opt = tcp_v4_save_options(skb);
++      RCU_INIT_POINTER(ireq->ireq_opt, tcp_v4_save_options(skb));
+ }
+ 
+ static struct dst_entry *tcp_v4_route_req(const struct sock *sk,
+@@ -1295,10 +1295,9 @@ struct sock *tcp_v4_syn_recv_sock(const
+       sk_daddr_set(newsk, ireq->ir_rmt_addr);
+       sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
+       newsk->sk_bound_dev_if = ireq->ir_iif;
+-      newinet->inet_saddr           = ireq->ir_loc_addr;
+-      inet_opt              = ireq->opt;
+-      rcu_assign_pointer(newinet->inet_opt, inet_opt);
+-      ireq->opt             = NULL;
++      newinet->inet_saddr   = ireq->ir_loc_addr;
++      inet_opt              = rcu_dereference(ireq->ireq_opt);
++      RCU_INIT_POINTER(newinet->inet_opt, inet_opt);
+       newinet->mc_index     = inet_iif(skb);
+       newinet->mc_ttl       = ip_hdr(skb)->ttl;
+       newinet->rcv_tos      = ip_hdr(skb)->tos;
+@@ -1346,9 +1345,12 @@ struct sock *tcp_v4_syn_recv_sock(const
+       if (__inet_inherit_port(sk, newsk) < 0)
+               goto put_and_exit;
+       *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+-      if (*own_req)
++      if (likely(*own_req)) {
+               tcp_move_syn(newtp, req);
+-
++              ireq->ireq_opt = NULL;
++      } else {
++              newinet->inet_opt = NULL;
++      }
+       return newsk;
+ 
+ exit_overflow:
+@@ -1359,6 +1361,7 @@ exit:
+       tcp_listendrop(sk);
+       return NULL;
+ put_and_exit:
++      newinet->inet_opt = NULL;
+       inet_csk_prepare_forced_close(newsk);
+       tcp_done(newsk);
+       goto exit;
diff --git a/queue-4.9/tcp-dccp-fix-lockdep-splat-in-inet_csk_route_req.patch b/queue-4.9/tcp-dccp-fix-lockdep-splat-in-inet_csk_route_req.patch

new file mode 100644 (file)

index 0000000..fff7df2
--- /dev/null
+++ b/queue-4.9/tcp-dccp-fix-lockdep-splat-in-inet_csk_route_req.patch
@@ -0,0 +1,49 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Sun, 22 Oct 2017 12:33:57 -0700
+Subject: tcp/dccp: fix lockdep splat in inet_csk_route_req()
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit a6ca7abe53633d08eea1c6756cb49c9b2d4c90bf ]
+
+This patch fixes the following lockdep splat in inet_csk_route_req()
+
+  lockdep_rcu_suspicious
+  inet_csk_route_req
+  tcp_v4_send_synack
+  tcp_rtx_synack
+  inet_rtx_syn_ack
+  tcp_fastopen_synack_time
+  tcp_retransmit_timer
+  tcp_write_timer_handler
+  tcp_write_timer
+  call_timer_fn
+
+Thread running inet_csk_route_req() owns a reference on the request
+socket, so we have the guarantee ireq->ireq_opt won't be changed or
+freed.
+
+lockdep can enforce this invariant for us.
+
+Fixes: c92e8c02fe66 ("tcp/dccp: fix ireq->opt races")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/inet_connection_sock.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/net/ipv4/inet_connection_sock.c
++++ b/net/ipv4/inet_connection_sock.c
+@@ -410,7 +410,8 @@ struct dst_entry *inet_csk_route_req(con
+       struct ip_options_rcu *opt;
+       struct rtable *rt;
+ 
+-      opt = rcu_dereference(ireq->ireq_opt);
++      opt = rcu_dereference_protected(ireq->ireq_opt,
++                                      atomic_read(&req->rsk_refcnt) > 0);
+       flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
+                          RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+                          sk->sk_protocol, inet_sk_flowi_flags(sk),
diff --git a/queue-4.9/tcp-dccp-fix-other-lockdep-splats-accessing-ireq_opt.patch b/queue-4.9/tcp-dccp-fix-other-lockdep-splats-accessing-ireq_opt.patch

new file mode 100644 (file)

index 0000000..7a9ebd4
--- /dev/null
+++ b/queue-4.9/tcp-dccp-fix-other-lockdep-splats-accessing-ireq_opt.patch
@@ -0,0 +1,113 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Tue, 24 Oct 2017 08:20:31 -0700
+Subject: tcp/dccp: fix other lockdep splats accessing ireq_opt
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 06f877d613be3621604c2520ec0351d9fbdca15f ]
+
+In my first attempt to fix the lockdep splat, I forgot we could
+enter inet_csk_route_req() with a freshly allocated request socket,
+for which refcount has not yet been elevated, due to complex
+SLAB_TYPESAFE_BY_RCU rules.
+
+We either are in rcu_read_lock() section _or_ we own a refcount on the
+request.
+
+Correct RCU verb to use here is rcu_dereference_check(), although it is
+not possible to prove we actually own a reference on a shared
+refcount :/
+
+In v2, I added the ireq_opt_deref() helper and used it in three places, to
+fix other possible splats.
+
+[   49.844590]  lockdep_rcu_suspicious+0xea/0xf3
+[   49.846487]  inet_csk_route_req+0x53/0x14d
+[   49.848334]  tcp_v4_route_req+0xe/0x10
+[   49.850174]  tcp_conn_request+0x31c/0x6a0
+[   49.851992]  ? __lock_acquire+0x614/0x822
+[   49.854015]  tcp_v4_conn_request+0x5a/0x79
+[   49.855957]  ? tcp_v4_conn_request+0x5a/0x79
+[   49.858052]  tcp_rcv_state_process+0x98/0xdcc
+[   49.859990]  ? sk_filter_trim_cap+0x2f6/0x307
+[   49.862085]  tcp_v4_do_rcv+0xfc/0x145
+[   49.864055]  ? tcp_v4_do_rcv+0xfc/0x145
+[   49.866173]  tcp_v4_rcv+0x5ab/0xaf9
+[   49.868029]  ip_local_deliver_finish+0x1af/0x2e7
+[   49.870064]  ip_local_deliver+0x1b2/0x1c5
+[   49.871775]  ? inet_del_offload+0x45/0x45
+[   49.873916]  ip_rcv_finish+0x3f7/0x471
+[   49.875476]  ip_rcv+0x3f1/0x42f
+[   49.876991]  ? ip_local_deliver_finish+0x2e7/0x2e7
+[   49.878791]  __netif_receive_skb_core+0x6d3/0x950
+[   49.880701]  ? process_backlog+0x7e/0x216
+[   49.882589]  __netif_receive_skb+0x1d/0x5e
+[   49.884122]  process_backlog+0x10c/0x216
+[   49.885812]  net_rx_action+0x147/0x3df
+
+Fixes: a6ca7abe53633 ("tcp/dccp: fix lockdep splat in inet_csk_route_req()")
+Fixes: c92e8c02fe66 ("tcp/dccp: fix ireq->opt races")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: kernel test robot <fengguang.wu@intel.com>
+Reported-by: Maciej Żenczykowski <maze@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/inet_sock.h         |    6 ++++++
+ net/dccp/ipv4.c                 |    2 +-
+ net/ipv4/inet_connection_sock.c |    4 ++--
+ net/ipv4/tcp_ipv4.c             |    2 +-
+ 4 files changed, 10 insertions(+), 4 deletions(-)
+
+--- a/include/net/inet_sock.h
++++ b/include/net/inet_sock.h
+@@ -132,6 +132,12 @@ static inline int inet_request_bound_dev
+       return sk->sk_bound_dev_if;
+ }
+ 
++static inline struct ip_options_rcu *ireq_opt_deref(const struct inet_request_sock *ireq)
++{
++      return rcu_dereference_check(ireq->ireq_opt,
++                                   atomic_read(&ireq->req.rsk_refcnt) > 0);
++}
++
+ struct inet_cork {
+       unsigned int            flags;
+       __be32                  addr;
+--- a/net/dccp/ipv4.c
++++ b/net/dccp/ipv4.c
+@@ -495,7 +495,7 @@ static int dccp_v4_send_response(const s
+                                                             ireq->ir_rmt_addr);
+               err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+                                           ireq->ir_rmt_addr,
+-                                          rcu_dereference(ireq->ireq_opt));
++                                          ireq_opt_deref(ireq));
+               err = net_xmit_eval(err);
+       }
+ 
+--- a/net/ipv4/inet_connection_sock.c
++++ b/net/ipv4/inet_connection_sock.c
+@@ -410,8 +410,8 @@ struct dst_entry *inet_csk_route_req(con
+       struct ip_options_rcu *opt;
+       struct rtable *rt;
+ 
+-      opt = rcu_dereference_protected(ireq->ireq_opt,
+-                                      atomic_read(&req->rsk_refcnt) > 0);
++      opt = ireq_opt_deref(ireq);
++
+       flowi4_init_output(fl4, ireq->ir_iif, ireq->ir_mark,
+                          RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
+                          sk->sk_protocol, inet_sk_flowi_flags(sk),
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -861,7 +861,7 @@ static int tcp_v4_send_synack(const stru
+ 
+               err = ip_build_and_send_pkt(skb, sk, ireq->ir_loc_addr,
+                                           ireq->ir_rmt_addr,
+-                                          rcu_dereference(ireq->ireq_opt));
++                                          ireq_opt_deref(ireq));
+               err = net_xmit_eval(err);
+       }
+ 
diff --git a/queue-4.9/tcp-fix-tcp_mtu_probe-vs-highest_sack.patch b/queue-4.9/tcp-fix-tcp_mtu_probe-vs-highest_sack.patch

new file mode 100644 (file)

index 0000000..9d976e8
--- /dev/null
+++ b/queue-4.9/tcp-fix-tcp_mtu_probe-vs-highest_sack.patch
@@ -0,0 +1,81 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 30 Oct 2017 23:08:20 -0700
+Subject: tcp: fix tcp_mtu_probe() vs highest_sack
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 2b7cda9c35d3b940eb9ce74b30bbd5eb30db493d ]
+
+Based on SNMP values provided by Roman, Yuchung made the observation
+that some crashes in tcp_sacktag_walk() might be caused by MTU probing.
+
+Looking at tcp_mtu_probe(), I found that when a new skb was placed
+in front of the write queue, we were not updating tcp highest sack.
+
+If one skb is freed because all its content was copied to the new skb
+(for MTU probing), then tp->highest_sack could point to a now freed skb.
+
+Bad things would then happen, including infinite loops.
+
+This patch renames tcp_highest_sack_combine() and uses it
+from tcp_mtu_probe() to fix the bug.
+
+Note that I also removed one test against tp->sacked_out,
+since we want to replace tp->highest_sack regardless of whatever
+condition, since keeping a stale pointer to freed skb is a recipe
+for disaster.
+
+Fixes: a47e5a988a57 ("[TCP]: Convert highest_sack to sk_buff to allow direct access")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
+Reported-by: Roman Gushchin <guro@fb.com>
+Reported-by: Oleksandr Natalenko <oleksandr@natalenko.name>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Acked-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/tcp.h     |    6 +++---
+ net/ipv4/tcp_output.c |    3 ++-
+ 2 files changed, 5 insertions(+), 4 deletions(-)
+
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -1681,12 +1681,12 @@ static inline void tcp_highest_sack_rese
+       tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk);
+ }
+ 
+-/* Called when old skb is about to be deleted (to be combined with new skb) */
+-static inline void tcp_highest_sack_combine(struct sock *sk,
++/* Called when old skb is about to be deleted and replaced by new skb */
++static inline void tcp_highest_sack_replace(struct sock *sk,
+                                           struct sk_buff *old,
+                                           struct sk_buff *new)
+ {
+-      if (tcp_sk(sk)->sacked_out && (old == tcp_sk(sk)->highest_sack))
++      if (old == tcp_highest_sack(sk))
+               tcp_sk(sk)->highest_sack = new;
+ }
+ 
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -1996,6 +1996,7 @@ static int tcp_mtu_probe(struct sock *sk
+       nskb->ip_summed = skb->ip_summed;
+ 
+       tcp_insert_write_queue_before(nskb, skb, sk);
++      tcp_highest_sack_replace(sk, skb, nskb);
+ 
+       len = 0;
+       tcp_for_write_queue_from_safe(skb, next, sk) {
+@@ -2535,7 +2536,7 @@ static void tcp_collapse_retrans(struct
+ 
+       BUG_ON(tcp_skb_pcount(skb) != 1 || tcp_skb_pcount(next_skb) != 1);
+ 
+-      tcp_highest_sack_combine(sk, next_skb, skb);
++      tcp_highest_sack_replace(sk, next_skb, skb);
+ 
+       tcp_unlink_write_queue(next_skb, sk);
+ 
diff --git a/queue-4.9/tun-allow-positive-return-values-on-dev_get_valid_name-call.patch b/queue-4.9/tun-allow-positive-return-values-on-dev_get_valid_name-call.patch

new file mode 100644 (file)

index 0000000..a7b4e6b
--- /dev/null
+++ b/queue-4.9/tun-allow-positive-return-values-on-dev_get_valid_name-call.patch
@@ -0,0 +1,36 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Julien Gomes <julien@arista.com>
+Date: Wed, 25 Oct 2017 11:50:50 -0700
+Subject: tun: allow positive return values on dev_get_valid_name() call
+
+From: Julien Gomes <julien@arista.com>
+
+
+[ Upstream commit 5c25f65fd1e42685f7ccd80e0621829c105785d9 ]
+
+If the name argument of dev_get_valid_name() contains "%d", it will try
+to assign it a unit number in __dev_alloc_name() and return either the
+unit number (>= 0) or an error code (< 0).
+Considering positive values as error values prevents tun device creations
+relying on this mechanism; therefore we should only consider negative values
+as errors here.
+
+Signed-off-by: Julien Gomes <julien@arista.com>
+Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/tun.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -1788,7 +1788,7 @@ static int tun_set_iff(struct net *net,
+               if (!dev)
+                       return -ENOMEM;
+               err = dev_get_valid_name(net, dev, name);
+-              if (err)
++              if (err < 0)
+                       goto err_free_dev;
+ 
+               dev_net_set(dev, net);
diff --git a/queue-4.9/tun-call-dev_get_valid_name-before-register_netdevice.patch b/queue-4.9/tun-call-dev_get_valid_name-before-register_netdevice.patch

new file mode 100644 (file)

index 0000000..9816331
--- /dev/null
+++ b/queue-4.9/tun-call-dev_get_valid_name-before-register_netdevice.patch
@@ -0,0 +1,82 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Cong Wang <xiyou.wangcong@gmail.com>
+Date: Fri, 13 Oct 2017 11:58:53 -0700
+Subject: tun: call dev_get_valid_name() before register_netdevice()
+
+From: Cong Wang <xiyou.wangcong@gmail.com>
+
+
+[ Upstream commit 0ad646c81b2182f7fa67ec0c8c825e0ee165696d ]
+
+register_netdevice() could fail early when we have an invalid
+dev name, in which case ->ndo_uninit() is not called. For tun
+device, this is a problem because a timer etc. are already
+initialized and it expects ->ndo_uninit() to clean them up.
+
+We could move these initializations into a ->ndo_init() so
+that register_netdevice() knows better, however this is still
+complicated due to the logic in tun_detach().
+
+Therefore, I choose to just call dev_get_valid_name() before
+register_netdevice(), which is quicker and much easier to audit.
+And for this specific case, it is already enough.
+
+Fixes: 96442e42429e ("tuntap: choose the txq based on rxq")
+Reported-by: Dmitry Alexeev <avekceeb@gmail.com>
+Cc: Jason Wang <jasowang@redhat.com>
+Cc: "Michael S. Tsirkin" <mst@redhat.com>
+Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/tun.c         |    3 +++
+ include/linux/netdevice.h |    3 +++
+ net/core/dev.c            |    6 +++---
+ 3 files changed, 9 insertions(+), 3 deletions(-)
+
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -1787,6 +1787,9 @@ static int tun_set_iff(struct net *net,
+ 
+               if (!dev)
+                       return -ENOMEM;
++              err = dev_get_valid_name(net, dev, name);
++              if (err)
++                      goto err_free_dev;
+ 
+               dev_net_set(dev, net);
+               dev->rtnl_link_ops = &tun_link_ops;
+--- a/include/linux/netdevice.h
++++ b/include/linux/netdevice.h
+@@ -3742,6 +3742,9 @@ struct net_device *alloc_netdev_mqs(int
+                                   unsigned char name_assign_type,
+                                   void (*setup)(struct net_device *),
+                                   unsigned int txqs, unsigned int rxqs);
++int dev_get_valid_name(struct net *net, struct net_device *dev,
++                     const char *name);
++
+ #define alloc_netdev(sizeof_priv, name, name_assign_type, setup) \
+       alloc_netdev_mqs(sizeof_priv, name, name_assign_type, setup, 1, 1)
+ 
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -1115,9 +1115,8 @@ static int dev_alloc_name_ns(struct net
+       return ret;
+ }
+ 
+-static int dev_get_valid_name(struct net *net,
+-                            struct net_device *dev,
+-                            const char *name)
++int dev_get_valid_name(struct net *net, struct net_device *dev,
++                     const char *name)
+ {
+       BUG_ON(!net);
+ 
+@@ -1133,6 +1132,7 @@ static int dev_get_valid_name(struct net
+ 
+       return 0;
+ }
++EXPORT_SYMBOL(dev_get_valid_name);
+ 
+ /**
+  *    dev_change_name - change name of a device
diff --git a/queue-4.9/tun-tap-sanitize-tunsetsndbuf-input.patch b/queue-4.9/tun-tap-sanitize-tunsetsndbuf-input.patch

new file mode 100644 (file)

index 0000000..afb057f
--- /dev/null
+++ b/queue-4.9/tun-tap-sanitize-tunsetsndbuf-input.patch
@@ -0,0 +1,88 @@
+From foo@baz Wed Nov 15 17:24:03 CET 2017
+From: Craig Gallek <kraig@google.com>
+Date: Mon, 30 Oct 2017 18:50:11 -0400
+Subject: tun/tap: sanitize TUNSETSNDBUF input
+
+From: Craig Gallek <kraig@google.com>
+
+
+[ Upstream commit 93161922c658c714715686cd0cf69b090cb9bf1d ]
+
+Syzkaller found several variants of the lockup below by setting negative
+values with the TUNSETSNDBUF ioctl.  This patch adds a sanity check
+to both the tun and tap versions of this ioctl.
+
+  watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [repro:2389]
+  Modules linked in:
+  irq event stamp: 329692056
+  hardirqs last  enabled at (329692055): [<ffffffff824b8381>] _raw_spin_unlock_irqrestore+0x31/0x75
+  hardirqs last disabled at (329692056): [<ffffffff824b9e58>] apic_timer_interrupt+0x98/0xb0
+  softirqs last  enabled at (35659740): [<ffffffff824bc958>] __do_softirq+0x328/0x48c
+  softirqs last disabled at (35659731): [<ffffffff811c796c>] irq_exit+0xbc/0xd0
+  CPU: 0 PID: 2389 Comm: repro Not tainted 4.14.0-rc7 #23
+  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
+  task: ffff880009452140 task.stack: ffff880006a20000
+  RIP: 0010:_raw_spin_lock_irqsave+0x11/0x80
+  RSP: 0018:ffff880006a27c50 EFLAGS: 00000282 ORIG_RAX: ffffffffffffff10
+  RAX: ffff880009ac68d0 RBX: ffff880006a27ce0 RCX: 0000000000000000
+  RDX: 0000000000000001 RSI: ffff880006a27ce0 RDI: ffff880009ac6900
+  RBP: ffff880006a27c60 R08: 0000000000000000 R09: 0000000000000000
+  R10: 0000000000000001 R11: 000000000063ff00 R12: ffff880009ac6900
+  R13: ffff880006a27cf8 R14: 0000000000000001 R15: ffff880006a27cf8
+  FS:  00007f4be4838700(0000) GS:ffff88000cc00000(0000) knlGS:0000000000000000
+  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+  CR2: 0000000020101000 CR3: 0000000009616000 CR4: 00000000000006f0
+  Call Trace:
+   prepare_to_wait+0x26/0xc0
+   sock_alloc_send_pskb+0x14e/0x270
+   ? remove_wait_queue+0x60/0x60
+   tun_get_user+0x2cc/0x19d0
+   ? __tun_get+0x60/0x1b0
+   tun_chr_write_iter+0x57/0x86
+   __vfs_write+0x156/0x1e0
+   vfs_write+0xf7/0x230
+   SyS_write+0x57/0xd0
+   entry_SYSCALL_64_fastpath+0x1f/0xbe
+  RIP: 0033:0x7f4be4356df9
+  RSP: 002b:00007ffc18101c08 EFLAGS: 00000293 ORIG_RAX: 0000000000000001
+  RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f4be4356df9
+  RDX: 0000000000000046 RSI: 0000000020101000 RDI: 0000000000000005
+  RBP: 00007ffc18101c40 R08: 0000000000000001 R09: 0000000000000001
+  R10: 0000000000000001 R11: 0000000000000293 R12: 0000559c75f64780
+  R13: 00007ffc18101d30 R14: 0000000000000000 R15: 0000000000000000
+
+Fixes: 33dccbb050bb ("tun: Limit amount of queued packets per device")
+Fixes: 20d29d7a916a ("net: macvtap driver")
+Signed-off-by: Craig Gallek <kraig@google.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/macvtap.c |    2 ++
+ drivers/net/tun.c     |    4 ++++
+ 2 files changed, 6 insertions(+)
+
+--- a/drivers/net/macvtap.c
++++ b/drivers/net/macvtap.c
+@@ -1077,6 +1077,8 @@ static long macvtap_ioctl(struct file *f
+       case TUNSETSNDBUF:
+               if (get_user(s, sp))
+                       return -EFAULT;
++              if (s <= 0)
++                      return -EINVAL;
+ 
+               q->sk.sk_sndbuf = s;
+               return 0;
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -2180,6 +2180,10 @@ static long __tun_chr_ioctl(struct file
+                       ret = -EFAULT;
+                       break;
+               }
++              if (sndbuf <= 0) {
++                      ret = -EINVAL;
++                      break;
++              }
+ 
+               tun->sndbuf = sndbuf;
+               tun_set_sndbuf(tun);
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Wed, 15 Nov 2017 16:31:22 +0000 (17:31 +0100)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Wed, 15 Nov 2017 16:31:22 +0000 (17:31 +0100)
queue-4.9/gso-fix-payload-length-when-gso_size-is-zero.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/ip6_gre-only-increase-err_count-for-some-certain-type-icmpv6-in-ip6gre_err.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/ip6_gre-update-dst-pmtu-if-dev-mtu-has-been-updated-by-toobig-in-__gre6_xmit.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/ipip-only-increase-err_count-for-some-certain-type-icmp-in-ipip_err.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/ipv6-addrconf-increment-ifp-refcount-before-ipv6_del_addr.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/ipv6-flowlabel-do-not-leave-opt-tot_len-with-garbage.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/l2tp-check-ps-sock-before-running-pppol2tp_session_ioctl.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net-call-cgroup_sk_alloc-earlier-in-sk_clone_lock.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net-unix-don-t-show-information-about-sockets-from-other-namespaces.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/net_sched-avoid-matching-qdisc-with-zero-handle.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/netlink-do-not-set-cb_running-if-dump-s-start-errs.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/packet-avoid-panic-in-packet_getsockopt.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/ppp-fix-race-in-ppp-device-destruction.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/sctp-add-the-missing-sock_owned_by_user-check-in-sctp_icmp_redirect.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/sctp-full-support-for-ipv6-ip_nonlocal_bind-ip_freebind.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/sctp-reset-owner-sk-for-data-chunks-on-out-queues-when-migrating-a-sock.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/series	[new file with mode: 0644]	patch \| blob
queue-4.9/soreuseport-fix-initialization-race.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/tap-double-free-in-error-path-in-tap_open.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/tcp-dccp-fix-ireq-opt-races.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/tcp-dccp-fix-lockdep-splat-in-inet_csk_route_req.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/tcp-dccp-fix-other-lockdep-splats-accessing-ireq_opt.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/tcp-fix-tcp_mtu_probe-vs-highest_sack.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/tun-allow-positive-return-values-on-dev_get_valid_name-call.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/tun-call-dev_get_valid_name-before-register_netdevice.patch	[new file with mode: 0644]	patch \| blob
queue-4.9/tun-tap-sanitize-tunsetsndbuf-input.patch	[new file with mode: 0644]	patch \| blob