4.9-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Thu, 29 Jun 2017 16:59:01 +0000 (18:59 +0200)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Thu, 29 Jun 2017 16:59:01 +0000 (18:59 +0200)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 29 Jun 2017 16:59:01 +0000 (18:59 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 29 Jun 2017 16:59:01 +0000 (18:59 +0200)
diff --git a/queue-4.9/af_unix-add-sockaddr-length-checks-before-accessing-sa_family-in-bind-and-connect-handlers.patch b/queue-4.9/af_unix-add-sockaddr-length-checks-before-accessing-sa_family-in-bind-and-connect-handlers.patch

new file mode 100644 (file)

index 0000000..de4a0fc
--- /dev/null
+++ b/queue-4.9/af_unix-add-sockaddr-length-checks-before-accessing-sa_family-in-bind-and-connect-handlers.patch
@@ -0,0 +1,47 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Mateusz Jurczyk <mjurczyk@google.com>
+Date: Thu, 8 Jun 2017 11:13:36 +0200
+Subject: af_unix: Add sockaddr length checks before accessing sa_family in bind and connect handlers
+
+From: Mateusz Jurczyk <mjurczyk@google.com>
+
+
+[ Upstream commit defbcf2decc903a28d8398aa477b6881e711e3ea ]
+
+Verify that the caller-provided sockaddr structure is large enough to
+contain the sa_family field, before accessing it in bind() and connect()
+handlers of the AF_UNIX socket. Since neither syscall enforces a minimum
+size of the corresponding memory region, very short sockaddrs (zero or
+one byte long) result in operating on uninitialized memory while
+referencing .sa_family.
+
+Signed-off-by: Mateusz Jurczyk <mjurczyk@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/af_unix.c |    7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -998,7 +998,8 @@ static int unix_bind(struct socket *sock
+       struct path path = { NULL, NULL };
+ 
+       err = -EINVAL;
+-      if (sunaddr->sun_family != AF_UNIX)
++      if (addr_len < offsetofend(struct sockaddr_un, sun_family) ||
++          sunaddr->sun_family != AF_UNIX)
+               goto out;
+ 
+       if (addr_len == sizeof(short)) {
+@@ -1109,6 +1110,10 @@ static int unix_dgram_connect(struct soc
+       unsigned int hash;
+       int err;
+ 
++      err = -EINVAL;
++      if (alen < offsetofend(struct sockaddr, sa_family))
++              goto out;
++
+       if (addr->sa_family != AF_UNSPEC) {
+               err = unix_mkname(sunaddr, alen, &hash);
+               if (err < 0)
diff --git a/queue-4.9/decnet-always-not-take-dst-__refcnt-when-inserting-dst-into-hash-table.patch b/queue-4.9/decnet-always-not-take-dst-__refcnt-when-inserting-dst-into-hash-table.patch

new file mode 100644 (file)

index 0000000..cb504b6
--- /dev/null
+++ b/queue-4.9/decnet-always-not-take-dst-__refcnt-when-inserting-dst-into-hash-table.patch
@@ -0,0 +1,89 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Wei Wang <weiwan@google.com>
+Date: Fri, 16 Jun 2017 10:46:37 -0700
+Subject: decnet: always not take dst->__refcnt when inserting dst into hash table
+
+From: Wei Wang <weiwan@google.com>
+
+
+[ Upstream commit 76371d2e3ad1f84426a30ebcd8c3b9b98f4c724f ]
+
+In the existing dn_route.c code, dn_route_output_slow() takes
+dst->__refcnt before calling dn_insert_route() while dn_route_input_slow()
+does not take dst->__refcnt before calling dn_insert_route().
+This makes the whole routing code very buggy.
+In dn_dst_check_expire(), dnrt_free() is called when rt expires. This
+makes the routes inserted by dn_route_output_slow() not able to be
+freed as the refcnt is not released.
+In dn_dst_gc(), dnrt_drop() is called to release rt which could
+potentially cause the dst->__refcnt to be dropped to -1.
+In dn_run_flush(), dst_free() is called to release all the dst. Again,
+it makes the dst inserted by dn_route_output_slow() not able to be
+released and also, it does not wait on the rcu and could potentially
+cause crash in the path where other users still refer to this dst.
+
+This patch makes sure both input and output path do not take
+dst->__refcnt before calling dn_insert_route() and also makes sure
+dnrt_free()/dst_free() is called when removing dst from the hash table.
+The only difference between those 2 calls is that dnrt_free() waits on
+the rcu while dst_free() does not.
+
+Signed-off-by: Wei Wang <weiwan@google.com>
+Acked-by: Martin KaFai Lau <kafai@fb.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/decnet/dn_route.c |   14 ++++----------
+ 1 file changed, 4 insertions(+), 10 deletions(-)
+
+--- a/net/decnet/dn_route.c
++++ b/net/decnet/dn_route.c
+@@ -188,12 +188,6 @@ static inline void dnrt_free(struct dn_r
+       call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
+ }
+ 
+-static inline void dnrt_drop(struct dn_route *rt)
+-{
+-      dst_release(&rt->dst);
+-      call_rcu_bh(&rt->dst.rcu_head, dst_rcu_free);
+-}
+-
+ static void dn_dst_check_expire(unsigned long dummy)
+ {
+       int i;
+@@ -248,7 +242,7 @@ static int dn_dst_gc(struct dst_ops *ops
+                       }
+                       *rtp = rt->dst.dn_next;
+                       rt->dst.dn_next = NULL;
+-                      dnrt_drop(rt);
++                      dnrt_free(rt);
+                       break;
+               }
+               spin_unlock_bh(&dn_rt_hash_table[i].lock);
+@@ -350,7 +344,7 @@ static int dn_insert_route(struct dn_rou
+                       dst_use(&rth->dst, now);
+                       spin_unlock_bh(&dn_rt_hash_table[hash].lock);
+ 
+-                      dnrt_drop(rt);
++                      dst_free(&rt->dst);
+                       *rp = rth;
+                       return 0;
+               }
+@@ -380,7 +374,7 @@ static void dn_run_flush(unsigned long d
+               for(; rt; rt = next) {
+                       next = rcu_dereference_raw(rt->dst.dn_next);
+                       RCU_INIT_POINTER(rt->dst.dn_next, NULL);
+-                      dst_free((struct dst_entry *)rt);
++                      dnrt_free(rt);
+               }
+ 
+ nothing_to_declare:
+@@ -1187,7 +1181,7 @@ make_route:
+       if (dev_out->flags & IFF_LOOPBACK)
+               flags |= RTCF_LOCAL;
+ 
+-      rt = dst_alloc(&dn_dst_ops, dev_out, 1, DST_OBSOLETE_NONE, DST_HOST);
++      rt = dst_alloc(&dn_dst_ops, dev_out, 0, DST_OBSOLETE_NONE, DST_HOST);
+       if (rt == NULL)
+               goto e_nobufs;
+ 
diff --git a/queue-4.9/decnet-dn_rtmsg-improve-input-length-sanitization-in-dnrmg_receive_user_skb.patch b/queue-4.9/decnet-dn_rtmsg-improve-input-length-sanitization-in-dnrmg_receive_user_skb.patch

new file mode 100644 (file)

index 0000000..105f478
--- /dev/null
+++ b/queue-4.9/decnet-dn_rtmsg-improve-input-length-sanitization-in-dnrmg_receive_user_skb.patch
@@ -0,0 +1,42 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Mateusz Jurczyk <mjurczyk@google.com>
+Date: Wed, 7 Jun 2017 16:14:29 +0200
+Subject: decnet: dn_rtmsg: Improve input length sanitization in dnrmg_receive_user_skb
+
+From: Mateusz Jurczyk <mjurczyk@google.com>
+
+
+[ Upstream commit dd0da17b209ed91f39872766634ca967c170ada1 ]
+
+Verify that the length of the socket buffer is sufficient to cover the
+nlmsghdr structure before accessing the nlh->nlmsg_len field for further
+input sanitization. If the client only supplies 1-3 bytes of data in
+sk_buff, then nlh->nlmsg_len remains partially uninitialized and
+contains leftover memory from the corresponding kernel allocation.
+Operating on such data may result in indeterminate evaluation of the
+nlmsg_len < sizeof(*nlh) expression.
+
+The bug was discovered by a runtime instrumentation designed to detect
+use of uninitialized memory in the kernel. The patch prevents this and
+other similar tools (e.g. KMSAN) from flagging this behavior in the future.
+
+Signed-off-by: Mateusz Jurczyk <mjurczyk@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/decnet/netfilter/dn_rtmsg.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/net/decnet/netfilter/dn_rtmsg.c
++++ b/net/decnet/netfilter/dn_rtmsg.c
+@@ -102,7 +102,9 @@ static inline void dnrmg_receive_user_sk
+ {
+       struct nlmsghdr *nlh = nlmsg_hdr(skb);
+ 
+-      if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len)
++      if (skb->len < sizeof(*nlh) ||
++          nlh->nlmsg_len < sizeof(*nlh) ||
++          skb->len < nlh->nlmsg_len)
+               return;
+ 
+       if (!netlink_capable(skb, CAP_NET_ADMIN))
diff --git a/queue-4.9/fix-an-intermittent-pr_emerg-warning-about-lo-becoming-free.patch b/queue-4.9/fix-an-intermittent-pr_emerg-warning-about-lo-becoming-free.patch

new file mode 100644 (file)

index 0000000..295b50f
--- /dev/null
+++ b/queue-4.9/fix-an-intermittent-pr_emerg-warning-about-lo-becoming-free.patch
@@ -0,0 +1,76 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Krister Johansen <kjlx@templeofstupid.com>
+Date: Thu, 8 Jun 2017 13:12:38 -0700
+Subject: Fix an intermittent pr_emerg warning about lo becoming free.
+
+From: Krister Johansen <kjlx@templeofstupid.com>
+
+
+[ Upstream commit f186ce61bb8235d80068c390dc2aad7ca427a4c2 ]
+
+It looks like this:
+
+Message from syslogd@flamingo at Apr 26 00:45:00 ...
+ kernel:unregister_netdevice: waiting for lo to become free. Usage count = 4
+
+They seem to coincide with net namespace teardown.
+
+The message is emitted by netdev_wait_allrefs().
+
+Forced a kdump in netdev_run_todo, but found that the refcount on the lo
+device was already 0 at the time we got to the panic.
+
+Used bcc to check the blocking in netdev_run_todo.  The only places
+where we're off cpu there are in the rcu_barrier() and msleep() calls.
+That behavior is expected.  The msleep time coincides with the amount of
+time we spend waiting for the refcount to reach zero; the rcu_barrier()
+wait times are not excessive.
+
+After looking through the list of callbacks that the netdevice notifiers
+invoke in this path, it appears that the dst_dev_event is the most
+interesting.  The dst_ifdown path places a hold on the loopback_dev as
+part of releasing the dev associated with the original dst cache entry.
+Most of our notifier callbacks are straight-forward, but this one a)
+looks complex, and b) places a hold on the network interface in
+question.
+
+I constructed a new bcc script that watches various events in the
+lifetime of a dst cache entry.  Note that dst_ifdown will take a hold on
+the loopback device until the invalidated dst entry gets freed.
+
+[      __dst_free] on DST: ffff883ccabb7900 IF tap1008300eth0 invoked at 1282115677036183
+    __dst_free
+    rcu_nocb_kthread
+    kthread
+    ret_from_fork
+Acked-by: Eric Dumazet <edumazet@google.com>
+
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/dst.c |   14 ++++++++++++++
+ 1 file changed, 14 insertions(+)
+
+--- a/net/core/dst.c
++++ b/net/core/dst.c
+@@ -470,6 +470,20 @@ static int dst_dev_event(struct notifier
+               spin_lock_bh(&dst_garbage.lock);
+               dst = dst_garbage.list;
+               dst_garbage.list = NULL;
++              /* The code in dst_ifdown places a hold on the loopback device.
++               * If the gc entry processing is set to expire after a lengthy
++               * interval, this hold can cause netdev_wait_allrefs() to hang
++               * out and wait for a long time -- until the loopback
++               * interface is released.  If we're really unlucky, it'll emit
++               * pr_emerg messages to console too.  Reset the interval here,
++               * so dst cleanups occur in a more timely fashion.
++               */
++              if (dst_garbage.timer_inc > DST_GC_INC) {
++                      dst_garbage.timer_inc = DST_GC_INC;
++                      dst_garbage.timer_expires = DST_GC_MIN;
++                      mod_delayed_work(system_wq, &dst_gc_work,
++                                       dst_garbage.timer_expires);
++              }
+               spin_unlock_bh(&dst_garbage.lock);
+ 
+               if (last)
diff --git a/queue-4.9/igmp-acquire-pmc-lock-for-ip_mc_clear_src.patch b/queue-4.9/igmp-acquire-pmc-lock-for-ip_mc_clear_src.patch

new file mode 100644 (file)

index 0000000..5f9dff4
--- /dev/null
+++ b/queue-4.9/igmp-acquire-pmc-lock-for-ip_mc_clear_src.patch
@@ -0,0 +1,82 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: WANG Cong <xiyou.wangcong@gmail.com>
+Date: Mon, 12 Jun 2017 09:52:26 -0700
+Subject: igmp: acquire pmc lock for ip_mc_clear_src()
+
+From: WANG Cong <xiyou.wangcong@gmail.com>
+
+
+[ Upstream commit c38b7d327aafd1e3ad7ff53eefac990673b65667 ]
+
+Andrey reported a use-after-free in add_grec():
+
+        for (psf = *psf_list; psf; psf = psf_next) {
+               ...
+                psf_next = psf->sf_next;
+
+where the struct ip_sf_list's were already freed by:
+
+ kfree+0xe8/0x2b0 mm/slub.c:3882
+ ip_mc_clear_src+0x69/0x1c0 net/ipv4/igmp.c:2078
+ ip_mc_dec_group+0x19a/0x470 net/ipv4/igmp.c:1618
+ ip_mc_drop_socket+0x145/0x230 net/ipv4/igmp.c:2609
+ inet_release+0x4e/0x1c0 net/ipv4/af_inet.c:411
+ sock_release+0x8d/0x1e0 net/socket.c:597
+ sock_close+0x16/0x20 net/socket.c:1072
+
+This happens because we don't hold pmc->lock in ip_mc_clear_src()
+and a parallel mr_ifc_timer timer could jump in and access them.
+
+The RCU lock is there but it is merely for pmc itself, this
+spinlock could actually ensure we don't access them in parallel.
+
+Thanks to Eric and Long for discussion on this bug.
+
+Reported-by: Andrey Konovalov <andreyknvl@google.com>
+Cc: Eric Dumazet <edumazet@google.com>
+Cc: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
+Reviewed-by: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/igmp.c |   21 +++++++++++++--------
+ 1 file changed, 13 insertions(+), 8 deletions(-)
+
+--- a/net/ipv4/igmp.c
++++ b/net/ipv4/igmp.c
+@@ -2071,21 +2071,26 @@ static int ip_mc_add_src(struct in_devic
+ 
+ static void ip_mc_clear_src(struct ip_mc_list *pmc)
+ {
+-      struct ip_sf_list *psf, *nextpsf;
++      struct ip_sf_list *psf, *nextpsf, *tomb, *sources;
+ 
+-      for (psf = pmc->tomb; psf; psf = nextpsf) {
++      spin_lock_bh(&pmc->lock);
++      tomb = pmc->tomb;
++      pmc->tomb = NULL;
++      sources = pmc->sources;
++      pmc->sources = NULL;
++      pmc->sfmode = MCAST_EXCLUDE;
++      pmc->sfcount[MCAST_INCLUDE] = 0;
++      pmc->sfcount[MCAST_EXCLUDE] = 1;
++      spin_unlock_bh(&pmc->lock);
++
++      for (psf = tomb; psf; psf = nextpsf) {
+               nextpsf = psf->sf_next;
+               kfree(psf);
+       }
+-      pmc->tomb = NULL;
+-      for (psf = pmc->sources; psf; psf = nextpsf) {
++      for (psf = sources; psf; psf = nextpsf) {
+               nextpsf = psf->sf_next;
+               kfree(psf);
+       }
+-      pmc->sources = NULL;
+-      pmc->sfmode = MCAST_EXCLUDE;
+-      pmc->sfcount[MCAST_INCLUDE] = 0;
+-      pmc->sfcount[MCAST_EXCLUDE] = 1;
+ }
+ 
+ /* Join a multicast group
diff --git a/queue-4.9/igmp-add-a-missing-spin_lock_init.patch b/queue-4.9/igmp-add-a-missing-spin_lock_init.patch

new file mode 100644 (file)

index 0000000..003ea27
--- /dev/null
+++ b/queue-4.9/igmp-add-a-missing-spin_lock_init.patch
@@ -0,0 +1,57 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: WANG Cong <xiyou.wangcong@gmail.com>
+Date: Tue, 20 Jun 2017 10:46:27 -0700
+Subject: igmp: add a missing spin_lock_init()
+
+From: WANG Cong <xiyou.wangcong@gmail.com>
+
+
+[ Upstream commit b4846fc3c8559649277e3e4e6b5cec5348a8d208 ]
+
+Andrey reported a lockdep warning on non-initialized
+spinlock:
+
+ INFO: trying to register non-static key.
+ the code is fine but needs lockdep annotation.
+ turning off the locking correctness validator.
+ CPU: 1 PID: 4099 Comm: a.out Not tainted 4.12.0-rc6+ #9
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011
+ Call Trace:
+  __dump_stack lib/dump_stack.c:16
+  dump_stack+0x292/0x395 lib/dump_stack.c:52
+  register_lock_class+0x717/0x1aa0 kernel/locking/lockdep.c:755
+  ? 0xffffffffa0000000
+  __lock_acquire+0x269/0x3690 kernel/locking/lockdep.c:3255
+  lock_acquire+0x22d/0x560 kernel/locking/lockdep.c:3855
+  __raw_spin_lock_bh ./include/linux/spinlock_api_smp.h:135
+  _raw_spin_lock_bh+0x36/0x50 kernel/locking/spinlock.c:175
+  spin_lock_bh ./include/linux/spinlock.h:304
+  ip_mc_clear_src+0x27/0x1e0 net/ipv4/igmp.c:2076
+  igmpv3_clear_delrec+0xee/0x4f0 net/ipv4/igmp.c:1194
+  ip_mc_destroy_dev+0x4e/0x190 net/ipv4/igmp.c:1736
+
+We miss a spin_lock_init() in igmpv3_add_delrec(), probably
+because previously we never use it on this code path. Since
+we already unlink it from the global mc_tomb list, it is
+probably safe not to acquire this spinlock here. It does no
+harm to have it, though, and it avoids conditional locking.
+
+Fixes: c38b7d327aaf ("igmp: acquire pmc lock for ip_mc_clear_src()")
+Reported-by: Andrey Konovalov <andreyknvl@google.com>
+Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/igmp.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ipv4/igmp.c
++++ b/net/ipv4/igmp.c
+@@ -1112,6 +1112,7 @@ static void igmpv3_add_delrec(struct in_
+       pmc = kzalloc(sizeof(*pmc), GFP_KERNEL);
+       if (!pmc)
+               return;
++      spin_lock_init(&pmc->lock);
+       spin_lock_bh(&im->lock);
+       pmc->interface = im->interface;
+       in_dev_hold(in_dev);
diff --git a/queue-4.9/ipv6-do-not-leak-throw-route-references.patch b/queue-4.9/ipv6-do-not-leak-throw-route-references.patch

new file mode 100644 (file)

index 0000000..14fea90
--- /dev/null
+++ b/queue-4.9/ipv6-do-not-leak-throw-route-references.patch
@@ -0,0 +1,95 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Serhey Popovych <serhe.popovych@gmail.com>
+Date: Tue, 20 Jun 2017 13:29:25 +0300
+Subject: ipv6: Do not leak throw route references
+
+From: Serhey Popovych <serhe.popovych@gmail.com>
+
+
+[ Upstream commit 07f615574f8ac499875b21c1142f26308234a92c ]
+
+While commit 73ba57bfae4a ("ipv6: fix backtracking for throw routes")
+does good job on error propagation to the fib_rules_lookup()
+in fib rules core framework that also corrects throw routes
+handling, it does not solve route reference leakage problem
+happened when we return -EAGAIN to the fib_rules_lookup()
+and leave routing table entry referenced in arg->result.
+
+If rule with matched throw route isn't last matched in the
+list we overwrite arg->result losing reference on throw
+route stored previously forever.
+
+We also partially revert commit ab997ad40839 ("ipv6: fix the
+incorrect return value of throw route") since we never return
+routing table entry with dst.error == -EAGAIN when
+CONFIG_IPV6_MULTIPLE_TABLES is on. Also there is no point
+to check for the RTF_REJECT flag since it is always set for throw
+routes.
+
+Fixes: 73ba57bfae4a ("ipv6: fix backtracking for throw routes")
+Signed-off-by: Serhey Popovych <serhe.popovych@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/fib6_rules.c |   22 ++++++----------------
+ net/ipv6/ip6_fib.c    |    3 +--
+ 2 files changed, 7 insertions(+), 18 deletions(-)
+
+--- a/net/ipv6/fib6_rules.c
++++ b/net/ipv6/fib6_rules.c
+@@ -32,7 +32,6 @@ struct fib6_rule {
+ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
+                                  int flags, pol_lookup_t lookup)
+ {
+-      struct rt6_info *rt;
+       struct fib_lookup_arg arg = {
+               .lookup_ptr = lookup,
+               .flags = FIB_LOOKUP_NOREF,
+@@ -44,21 +43,11 @@ struct dst_entry *fib6_rule_lookup(struc
+       fib_rules_lookup(net->ipv6.fib6_rules_ops,
+                        flowi6_to_flowi(fl6), flags, &arg);
+ 
+-      rt = arg.result;
++      if (arg.result)
++              return arg.result;
+ 
+-      if (!rt) {
+-              dst_hold(&net->ipv6.ip6_null_entry->dst);
+-              return &net->ipv6.ip6_null_entry->dst;
+-      }
+-
+-      if (rt->rt6i_flags & RTF_REJECT &&
+-          rt->dst.error == -EAGAIN) {
+-              ip6_rt_put(rt);
+-              rt = net->ipv6.ip6_null_entry;
+-              dst_hold(&rt->dst);
+-      }
+-
+-      return &rt->dst;
++      dst_hold(&net->ipv6.ip6_null_entry->dst);
++      return &net->ipv6.ip6_null_entry->dst;
+ }
+ 
+ static int fib6_rule_action(struct fib_rule *rule, struct flowi *flp,
+@@ -121,7 +110,8 @@ static int fib6_rule_action(struct fib_r
+                       flp6->saddr = saddr;
+               }
+               err = rt->dst.error;
+-              goto out;
++              if (err != -EAGAIN)
++                      goto out;
+       }
+ again:
+       ip6_rt_put(rt);
+--- a/net/ipv6/ip6_fib.c
++++ b/net/ipv6/ip6_fib.c
+@@ -289,8 +289,7 @@ struct dst_entry *fib6_rule_lookup(struc
+       struct rt6_info *rt;
+ 
+       rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, flags);
+-      if (rt->rt6i_flags & RTF_REJECT &&
+-          rt->dst.error == -EAGAIN) {
++      if (rt->dst.error == -EAGAIN) {
+               ip6_rt_put(rt);
+               rt = net->ipv6.ip6_null_entry;
+               dst_hold(&rt->dst);
diff --git a/queue-4.9/ipv6-fix-calling-in6_ifa_hold-incorrectly-for-dad-work.patch b/queue-4.9/ipv6-fix-calling-in6_ifa_hold-incorrectly-for-dad-work.patch

new file mode 100644 (file)

index 0000000..7c77e31
--- /dev/null
+++ b/queue-4.9/ipv6-fix-calling-in6_ifa_hold-incorrectly-for-dad-work.patch
@@ -0,0 +1,64 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Xin Long <lucien.xin@gmail.com>
+Date: Thu, 15 Jun 2017 16:33:58 +0800
+Subject: ipv6: fix calling in6_ifa_hold incorrectly for dad work
+
+From: Xin Long <lucien.xin@gmail.com>
+
+
+[ Upstream commit f8a894b218138888542a5058d0e902378fd0d4ec ]
+
+Now when starting the dad work in addrconf_mod_dad_work, if the dad work
+is idle and queued, it needs to hold ifa.
+
+The problem is that there is a gap at [1]: if the pending dad work is
+removed elsewhere during that gap, ifa is not held even though the dad
+work is still idle and queued.
+
+        if (!delayed_work_pending(&ifp->dad_work))
+                in6_ifa_hold(ifp);
+                    <--------------[1]
+        mod_delayed_work(addrconf_wq, &ifp->dad_work, delay);
+
+A use-after-free issue can be caused by this.
+
+Chen Wei found this issue when WARN_ON(!hlist_unhashed(&ifp->addr_lst)) in
+inet6_ifa_finish_destroy was hit because of it.
+
+As Hannes suggested, this patch fixes it by holding ifa first in
+addrconf_mod_dad_work, then calling mod_delayed_work and putting ifa if
+the dad_work is already in queue.
+
+Note that this patch did not choose to fix it with:
+
+  if (!mod_delayed_work(delay))
+          in6_ifa_hold(ifp);
+
+With that approach, when delay == 0, dad_work would be scheduled immediately,
+so all addrconf_mod_dad_work(0) calls would have to be moved under ifp->lock.
+
+Reported-by: Wei Chen <weichen@redhat.com>
+Suggested-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
+Acked-by: Hannes Frederic Sowa <hannes@stressinduktion.org>
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/addrconf.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/net/ipv6/addrconf.c
++++ b/net/ipv6/addrconf.c
+@@ -315,9 +315,9 @@ static void addrconf_mod_rs_timer(struct
+ static void addrconf_mod_dad_work(struct inet6_ifaddr *ifp,
+                                  unsigned long delay)
+ {
+-      if (!delayed_work_pending(&ifp->dad_work))
+-              in6_ifa_hold(ifp);
+-      mod_delayed_work(addrconf_wq, &ifp->dad_work, delay);
++      in6_ifa_hold(ifp);
++      if (mod_delayed_work(addrconf_wq, &ifp->dad_work, delay))
++              in6_ifa_put(ifp);
+ }
+ 
+ static int snmp6_alloc_dev(struct inet6_dev *idev)
diff --git a/queue-4.9/net-8021q-fix-one-possible-panic-caused-by-bug_on-in-free_netdev.patch b/queue-4.9/net-8021q-fix-one-possible-panic-caused-by-bug_on-in-free_netdev.patch

new file mode 100644 (file)

index 0000000..91859c0
--- /dev/null
+++ b/queue-4.9/net-8021q-fix-one-possible-panic-caused-by-bug_on-in-free_netdev.patch
@@ -0,0 +1,73 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Gao Feng <gfree.wind@vip.163.com>
+Date: Fri, 16 Jun 2017 15:00:02 +0800
+Subject: net: 8021q: Fix one possible panic caused by BUG_ON in free_netdev
+
+From: Gao Feng <gfree.wind@vip.163.com>
+
+
+[ Upstream commit 9745e362add89432d2c951272a99b0a5fe4348a9 ]
+
+The register_vlan_device would invoke free_netdev directly, when
+register_vlan_dev failed. It would trigger the BUG_ON in free_netdev
+if the dev was already registered. In this case, the netdev would be
+freed in netdev_run_todo later.
+
+So add one condition check now. Only when dev is not registered, then
+free it directly.
+
+The following is the part coredump when netdev_upper_dev_link failed
+in register_vlan_dev. I removed the lines which are too long.
+
+[  411.237457] ------------[ cut here ]------------
+[  411.237458] kernel BUG at net/core/dev.c:7998!
+[  411.237484] invalid opcode: 0000 [#1] SMP
+[  411.237705]  [last unloaded: 8021q]
+[  411.237718] CPU: 1 PID: 12845 Comm: vconfig Tainted: G            E   4.12.0-rc5+ #6
+[  411.237737] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 07/02/2015
+[  411.237764] task: ffff9cbeb6685580 task.stack: ffffa7d2807d8000
+[  411.237782] RIP: 0010:free_netdev+0x116/0x120
+[  411.237794] RSP: 0018:ffffa7d2807dbdb0 EFLAGS: 00010297
+[  411.237808] RAX: 0000000000000002 RBX: ffff9cbeb6ba8fd8 RCX: 0000000000001878
+[  411.237826] RDX: 0000000000000001 RSI: 0000000000000282 RDI: 0000000000000000
+[  411.237844] RBP: ffffa7d2807dbdc8 R08: 0002986100029841 R09: 0002982100029801
+[  411.237861] R10: 0004000100029980 R11: 0004000100029980 R12: ffff9cbeb6ba9000
+[  411.238761] R13: ffff9cbeb6ba9060 R14: ffff9cbe60f1a000 R15: ffff9cbeb6ba9000
+[  411.239518] FS:  00007fb690d81700(0000) GS:ffff9cbebb640000(0000) knlGS:0000000000000000
+[  411.239949] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[  411.240454] CR2: 00007f7115624000 CR3: 0000000077cdf000 CR4: 00000000003406e0
+[  411.240936] Call Trace:
+[  411.241462]  vlan_ioctl_handler+0x3f1/0x400 [8021q]
+[  411.241910]  sock_ioctl+0x18b/0x2c0
+[  411.242394]  do_vfs_ioctl+0xa1/0x5d0
+[  411.242853]  ? sock_alloc_file+0xa6/0x130
+[  411.243465]  SyS_ioctl+0x79/0x90
+[  411.243900]  entry_SYSCALL_64_fastpath+0x1e/0xa9
+[  411.244425] RIP: 0033:0x7fb69089a357
+[  411.244863] RSP: 002b:00007ffcd04e0fc8 EFLAGS: 00000202 ORIG_RAX: 0000000000000010
+[  411.245445] RAX: ffffffffffffffda RBX: 00007ffcd04e2884 RCX: 00007fb69089a357
+[  411.245903] RDX: 00007ffcd04e0fd0 RSI: 0000000000008983 RDI: 0000000000000003
+[  411.246527] RBP: 00007ffcd04e0fd0 R08: 0000000000000000 R09: 1999999999999999
+[  411.246976] R10: 000000000000053f R11: 0000000000000202 R12: 0000000000000004
+[  411.247414] R13: 00007ffcd04e1128 R14: 00007ffcd04e2888 R15: 0000000000000001
+[  411.249129] RIP: free_netdev+0x116/0x120 RSP: ffffa7d2807dbdb0
+
+Signed-off-by: Gao Feng <gfree.wind@vip.163.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/8021q/vlan.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/net/8021q/vlan.c
++++ b/net/8021q/vlan.c
+@@ -277,7 +277,8 @@ static int register_vlan_device(struct n
+       return 0;
+ 
+ out_free_newdev:
+-      free_netdev(new_dev);
++      if (new_dev->reg_state == NETREG_UNINITIALIZED)
++              free_netdev(new_dev);
+       return err;
+ }
+ 
diff --git a/queue-4.9/net-caif-fix-a-sleep-in-atomic-bug-in-cfpkt_create_pfx.patch b/queue-4.9/net-caif-fix-a-sleep-in-atomic-bug-in-cfpkt_create_pfx.patch

new file mode 100644 (file)

index 0000000..9b5883d
--- /dev/null
+++ b/queue-4.9/net-caif-fix-a-sleep-in-atomic-bug-in-cfpkt_create_pfx.patch
@@ -0,0 +1,50 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Jia-Ju Bai <baijiaju1990@163.com>
+Date: Sat, 10 Jun 2017 16:49:39 +0800
+Subject: net: caif: Fix a sleep-in-atomic bug in cfpkt_create_pfx
+
+From: Jia-Ju Bai <baijiaju1990@163.com>
+
+
+[ Upstream commit f146e872eb12ebbe92d8e583b2637e0741440db3 ]
+
+The kernel may sleep under a rcu read lock in cfpkt_create_pfx, and the
+function call path is:
+cfcnfg_linkup_rsp (acquire the lock by rcu_read_lock)
+  cfctrl_linkdown_req
+    cfpkt_create
+      cfpkt_create_pfx
+        alloc_skb(GFP_KERNEL) --> may sleep
+cfserl_receive (acquire the lock by rcu_read_lock)
+  cfpkt_split
+    cfpkt_create_pfx
+      alloc_skb(GFP_KERNEL) --> may sleep
+
+cfpkt_create_pfx uses "in_interrupt" to decide between "GFP_KERNEL" and
+"GFP_ATOMIC". In this situation, "GFP_KERNEL" is used because the function
+is called under an rcu read lock rather than in interrupt context.
+
+To fix it, only "GFP_ATOMIC" is used in cfpkt_create_pfx.
+
+Signed-off-by: Jia-Ju Bai <baijiaju1990@163.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/caif/cfpkt_skbuff.c |    6 +-----
+ 1 file changed, 1 insertion(+), 5 deletions(-)
+
+--- a/net/caif/cfpkt_skbuff.c
++++ b/net/caif/cfpkt_skbuff.c
+@@ -81,11 +81,7 @@ static struct cfpkt *cfpkt_create_pfx(u1
+ {
+       struct sk_buff *skb;
+ 
+-      if (likely(in_interrupt()))
+-              skb = alloc_skb(len + pfx, GFP_ATOMIC);
+-      else
+-              skb = alloc_skb(len + pfx, GFP_KERNEL);
+-
++      skb = alloc_skb(len + pfx, GFP_ATOMIC);
+       if (unlikely(skb == NULL))
+               return NULL;
+ 
diff --git a/queue-4.9/net-don-t-call-strlen-on-non-terminated-string-in-dev_set_alias.patch b/queue-4.9/net-don-t-call-strlen-on-non-terminated-string-in-dev_set_alias.patch

new file mode 100644 (file)

index 0000000..3cf3bf1
--- /dev/null
+++ b/queue-4.9/net-don-t-call-strlen-on-non-terminated-string-in-dev_set_alias.patch
@@ -0,0 +1,34 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Alexander Potapenko <glider@google.com>
+Date: Tue, 6 Jun 2017 15:56:54 +0200
+Subject: net: don't call strlen on non-terminated string in dev_set_alias()
+
+From: Alexander Potapenko <glider@google.com>
+
+
+[ Upstream commit c28294b941232931fbd714099798eb7aa7e865d7 ]
+
+KMSAN reported a use of uninitialized memory in dev_set_alias(),
+which was caused by calling strlcpy() (which in turn called strlen())
+on the user-supplied non-terminated string.
+
+Signed-off-by: Alexander Potapenko <glider@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/dev.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -1250,8 +1250,9 @@ int dev_set_alias(struct net_device *dev
+       if (!new_ifalias)
+               return -ENOMEM;
+       dev->ifalias = new_ifalias;
++      memcpy(dev->ifalias, alias, len);
++      dev->ifalias[len] = 0;
+ 
+-      strlcpy(dev->ifalias, alias, len+1);
+       return len;
+ }
+ 
diff --git a/queue-4.9/net-mlx5-wait-for-fw-readiness-before-initializing-command-interface.patch b/queue-4.9/net-mlx5-wait-for-fw-readiness-before-initializing-command-interface.patch

new file mode 100644 (file)

index 0000000..e18326f
--- /dev/null
+++ b/queue-4.9/net-mlx5-wait-for-fw-readiness-before-initializing-command-interface.patch
@@ -0,0 +1,56 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Eli Cohen <eli@mellanox.com>
+Date: Thu, 8 Jun 2017 11:33:16 -0500
+Subject: net/mlx5: Wait for FW readiness before initializing command interface
+
+From: Eli Cohen <eli@mellanox.com>
+
+
+[ Upstream commit 6c780a0267b8a1075f40b39851132eeaefefcff5 ]
+
+Before attempting to initialize the command interface we must wait till
+the fw_initializing bit is clear.
+
+If we fail to meet this condition the hardware will drop our
+configuration, specifically the descriptors page address.  This scenario
+can happen when the firmware is still executing an FLR flow and did not
+finish yet so the driver needs to wait for that to finish.
+
+Fixes: e3297246c2c8 ('net/mlx5_core: Wait for FW readiness on startup')
+Signed-off-by: Eli Cohen <eli@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/main.c |   14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
+@@ -155,8 +155,9 @@ static struct mlx5_profile profile[] = {
+       },
+ };
+ 
+-#define FW_INIT_TIMEOUT_MILI  2000
+-#define FW_INIT_WAIT_MS               2
++#define FW_INIT_TIMEOUT_MILI          2000
++#define FW_INIT_WAIT_MS                       2
++#define FW_PRE_INIT_TIMEOUT_MILI      10000
+ 
+ static int wait_fw_init(struct mlx5_core_dev *dev, u32 max_wait_mili)
+ {
+@@ -956,6 +957,15 @@ static int mlx5_load_one(struct mlx5_cor
+        */
+       dev->state = MLX5_DEVICE_STATE_UP;
+ 
++      /* wait for firmware to accept initialization segments configurations
++       */
++      err = wait_fw_init(dev, FW_PRE_INIT_TIMEOUT_MILI);
++      if (err) {
++              dev_err(&dev->pdev->dev, "Firmware over %d MS in pre-initializing state, aborting\n",
++                      FW_PRE_INIT_TIMEOUT_MILI);
++              goto out;
++      }
++
+       err = mlx5_cmd_init(dev);
+       if (err) {
+               dev_err(&pdev->dev, "Failed initializing command interface, aborting\n");
diff --git a/queue-4.9/net-mlx5e-added-bw-check-for-dim-decision-mechanism.patch b/queue-4.9/net-mlx5e-added-bw-check-for-dim-decision-mechanism.patch

new file mode 100644 (file)

index 0000000..19c32c2
--- /dev/null
+++ b/queue-4.9/net-mlx5e-added-bw-check-for-dim-decision-mechanism.patch
@@ -0,0 +1,130 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Tal Gilboa <talgi@mellanox.com>
+Date: Mon, 15 May 2017 14:13:16 +0300
+Subject: net/mlx5e: Added BW check for DIM decision mechanism
+
+From: Tal Gilboa <talgi@mellanox.com>
+
+
+[ Upstream commit c3164d2fc48fd4fa0477ab658b644559c3fe9073 ]
+
+DIM (Dynamically-tuned Interrupt Moderation) is a mechanism designed for
+changing the channel interrupt moderation values in order to reduce CPU
+overhead for all traffic types.
+Until now only interrupt and packet rate were sampled.
+We found a scenario in which we get a false indication since a change in
+DIM caused more aggregation and reduced packet rate while increasing BW.
+
+We now regard a change as successful iff:
+current_BW > (prev_BW + threshold) or
+current_BW ~= prev_BW and current_PR > (prev_PR + threshold) or
+current_BW ~= prev_BW and current_PR ~= prev_PR and
+    current_IR < (prev_IR - threshold)
+Where BW = Bandwidth, PR = Packet rate and IR = Interrupt rate
+
+Improvements (ConnectX-4Lx 25GbE, single RX queue, LRO off)
+    --------------------------------------------------
+    packet size | before[Mb/s] | after[Mb/s] | gain  |
+    2B          | 343.4        | 359.4       |  4.5% |
+    16B         | 2739.7       | 2814.8      |  2.7% |
+    64B         | 9739         | 10185.3     |  4.5% |
+
+Fixes: cb3c7fd4f839 ("net/mlx5e: Support adaptive RX coalescing")
+Signed-off-by: Tal Gilboa <talgi@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en.h       |    2 +
+ drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c |   41 +++++++++++----------
+ 2 files changed, 24 insertions(+), 19 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
+@@ -283,12 +283,14 @@ struct mlx5e_dma_info {
+ 
+ struct mlx5e_rx_am_stats {
+       int ppms; /* packets per msec */
++      int bpms; /* bytes per msec */
+       int epms; /* events per msec */
+ };
+ 
+ struct mlx5e_rx_am_sample {
+       ktime_t         time;
+       unsigned int    pkt_ctr;
++      unsigned int    byte_ctr;
+       u16             event_ctr;
+ };
+ 
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
+@@ -183,28 +183,27 @@ static void mlx5e_am_exit_parking(struct
+       mlx5e_am_step(am);
+ }
+ 
++#define IS_SIGNIFICANT_DIFF(val, ref) \
++      (((100 * abs((val) - (ref))) / (ref)) > 10) /* more than 10% difference */
++
+ static int mlx5e_am_stats_compare(struct mlx5e_rx_am_stats *curr,
+                                 struct mlx5e_rx_am_stats *prev)
+ {
+-      int diff;
+-
+-      if (!prev->ppms)
+-              return curr->ppms ? MLX5E_AM_STATS_BETTER :
+-                                  MLX5E_AM_STATS_SAME;
+-
+-      diff = curr->ppms - prev->ppms;
+-      if (((100 * abs(diff)) / prev->ppms) > 10) /* more than 10% diff */
+-              return (diff > 0) ? MLX5E_AM_STATS_BETTER :
+-                                  MLX5E_AM_STATS_WORSE;
+-
+-      if (!prev->epms)
+-              return curr->epms ? MLX5E_AM_STATS_WORSE :
++      if (!prev->bpms)
++              return curr->bpms ? MLX5E_AM_STATS_BETTER :
+                                   MLX5E_AM_STATS_SAME;
+ 
+-      diff = curr->epms - prev->epms;
+-      if (((100 * abs(diff)) / prev->epms) > 10) /* more than 10% diff */
+-              return (diff < 0) ? MLX5E_AM_STATS_BETTER :
+-                                  MLX5E_AM_STATS_WORSE;
++      if (IS_SIGNIFICANT_DIFF(curr->bpms, prev->bpms))
++              return (curr->bpms > prev->bpms) ? MLX5E_AM_STATS_BETTER :
++                                                 MLX5E_AM_STATS_WORSE;
++
++      if (IS_SIGNIFICANT_DIFF(curr->ppms, prev->ppms))
++              return (curr->ppms > prev->ppms) ? MLX5E_AM_STATS_BETTER :
++                                                 MLX5E_AM_STATS_WORSE;
++
++      if (IS_SIGNIFICANT_DIFF(curr->epms, prev->epms))
++              return (curr->epms < prev->epms) ? MLX5E_AM_STATS_BETTER :
++                                                 MLX5E_AM_STATS_WORSE;
+ 
+       return MLX5E_AM_STATS_SAME;
+ }
+@@ -266,6 +265,7 @@ static void mlx5e_am_sample(struct mlx5e
+ {
+       s->time      = ktime_get();
+       s->pkt_ctr   = rq->stats.packets;
++      s->byte_ctr  = rq->stats.bytes;
+       s->event_ctr = rq->cq.event_ctr;
+ }
+ 
+@@ -278,12 +278,15 @@ static void mlx5e_am_calc_stats(struct m
+       /* u32 holds up to 71 minutes, should be enough */
+       u32 delta_us = ktime_us_delta(end->time, start->time);
+       unsigned int npkts = end->pkt_ctr - start->pkt_ctr;
++      unsigned int nbytes = end->byte_ctr - start->byte_ctr;
+ 
+       if (!delta_us)
+               return;
+ 
+-      curr_stats->ppms =            (npkts * USEC_PER_MSEC) / delta_us;
+-      curr_stats->epms = (MLX5E_AM_NEVENTS * USEC_PER_MSEC) / delta_us;
++      curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us);
++      curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us);
++      curr_stats->epms = DIV_ROUND_UP(MLX5E_AM_NEVENTS * USEC_PER_MSEC,
++                                      delta_us);
+ }
+ 
+ void mlx5e_rx_am_work(struct work_struct *work)
diff --git a/queue-4.9/net-mlx5e-avoid-doing-a-cleanup-call-if-the-profile-doesn-t-have-it.patch b/queue-4.9/net-mlx5e-avoid-doing-a-cleanup-call-if-the-profile-doesn-t-have-it.patch

new file mode 100644 (file)

index 0000000..26d03db
--- /dev/null
+++ b/queue-4.9/net-mlx5e-avoid-doing-a-cleanup-call-if-the-profile-doesn-t-have-it.patch
@@ -0,0 +1,38 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Or Gerlitz <ogerlitz@mellanox.com>
+Date: Thu, 15 Jun 2017 20:08:32 +0300
+Subject: net/mlx5e: Avoid doing a cleanup call if the profile doesn't have it
+
+From: Or Gerlitz <ogerlitz@mellanox.com>
+
+
+[ Upstream commit 31ac93386d135a6c96de9c8bab406f5ccabf5a4d ]
+
+The error flow of mlx5e_create_netdev calls the cleanup call
+of the given profile without checking if it exists, fix that.
+
+Currently the VF reps don't register that callback and we crash
+if getting into error -- can be reproduced by the user doing ctrl^C
+while attempting to change the sriov mode from legacy to switchdev.
+
+Fixes: 26e59d8077a3 ('net/mlx5e: Implement mlx5e interface attach/detach callbacks')
+Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
+Reported-by: Sabrina Dubroca <sdubroca@redhat.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_main.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+@@ -3846,7 +3846,8 @@ struct net_device *mlx5e_create_netdev(s
+       return netdev;
+ 
+ err_cleanup_nic:
+-      profile->cleanup(priv);
++      if (profile->cleanup)
++              profile->cleanup(priv);
+       free_netdev(netdev);
+ 
+       return NULL;
diff --git a/queue-4.9/net-mlx5e-fix-timestamping-capabilities-reporting.patch b/queue-4.9/net-mlx5e-fix-timestamping-capabilities-reporting.patch

new file mode 100644 (file)

index 0000000..338185a
--- /dev/null
+++ b/queue-4.9/net-mlx5e-fix-timestamping-capabilities-reporting.patch
@@ -0,0 +1,40 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Maor Dickman <maord@mellanox.com>
+Date: Thu, 18 May 2017 15:15:08 +0300
+Subject: net/mlx5e: Fix timestamping capabilities reporting
+
+From: Maor Dickman <maord@mellanox.com>
+
+
+[ Upstream commit f0b381178b01b831f9907d72f467d6443afdea67 ]
+
+Misuse of the BIT() macro caused wrong flags to be reported for
+"Hardware Transmit Timestamp Modes" and "Hardware Receive
+Filter Modes".
+
+Fixes: ef9814deafd0 ('net/mlx5e: Add HW timestamping (TS) support')
+Signed-off-by: Maor Dickman <maord@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c |    8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_ethtool.c
+@@ -1183,11 +1183,11 @@ static int mlx5e_get_ts_info(struct net_
+                                SOF_TIMESTAMPING_RX_HARDWARE |
+                                SOF_TIMESTAMPING_RAW_HARDWARE;
+ 
+-      info->tx_types = (BIT(1) << HWTSTAMP_TX_OFF) |
+-                       (BIT(1) << HWTSTAMP_TX_ON);
++      info->tx_types = BIT(HWTSTAMP_TX_OFF) |
++                       BIT(HWTSTAMP_TX_ON);
+ 
+-      info->rx_filters = (BIT(1) << HWTSTAMP_FILTER_NONE) |
+-                         (BIT(1) << HWTSTAMP_FILTER_ALL);
++      info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) |
++                         BIT(HWTSTAMP_FILTER_ALL);
+ 
+       return 0;
+ }
diff --git a/queue-4.9/net-mlx5e-fix-wrong-indications-in-dim-due-to-counter-wraparound.patch b/queue-4.9/net-mlx5e-fix-wrong-indications-in-dim-due-to-counter-wraparound.patch

new file mode 100644 (file)

index 0000000..40dd9ef
--- /dev/null
+++ b/queue-4.9/net-mlx5e-fix-wrong-indications-in-dim-due-to-counter-wraparound.patch
@@ -0,0 +1,87 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Tal Gilboa <talgi@mellanox.com>
+Date: Mon, 29 May 2017 17:02:55 +0300
+Subject: net/mlx5e: Fix wrong indications in DIM due to counter wraparound
+
+From: Tal Gilboa <talgi@mellanox.com>
+
+
+[ Upstream commit 53acd76ce571e3b71f9205f2d49ab285a9f1aad8 ]
+
+DIM (Dynamically-tuned Interrupt Moderation) is a mechanism designed for
+changing the channel interrupt moderation values in order to reduce CPU
+overhead for all traffic types.
+Each iteration of the algorithm, DIM calculates the difference in
+throughput, packet rate and interrupt rate from last iteration in order
+to make a decision. DIM relies on counters for each metric. When these
+counters get to their type's max value they wrap around. In this case
+the delta between 'end' and 'start' samples is negative and when
+translated to unsigned integers - very high. This results in a false
+indication to the algorithm and might result in a wrong decision.
+
+The fix calculates the 'distance' between 'end' and 'start' samples in a
+cyclic way around the relevant type's max value. It can also be viewed as
+an absolute value around the type's max value instead of around 0.
+
+Testing shows higher stability in DIM profile selection and no wraparound
+issues.
+
+Fixes: cb3c7fd4f839 ("net/mlx5e: Support adaptive RX coalescing")
+Signed-off-by: Tal Gilboa <talgi@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en.h       |    8 ++++----
+ drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c |   10 +++++++---
+ 2 files changed, 11 insertions(+), 7 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
+@@ -288,10 +288,10 @@ struct mlx5e_rx_am_stats {
+ };
+ 
+ struct mlx5e_rx_am_sample {
+-      ktime_t         time;
+-      unsigned int    pkt_ctr;
+-      unsigned int    byte_ctr;
+-      u16             event_ctr;
++      ktime_t time;
++      u32     pkt_ctr;
++      u32     byte_ctr;
++      u16     event_ctr;
+ };
+ 
+ struct mlx5e_rx_am { /* Adaptive Moderation */
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx_am.c
+@@ -270,6 +270,8 @@ static void mlx5e_am_sample(struct mlx5e
+ }
+ 
+ #define MLX5E_AM_NEVENTS 64
++#define BITS_PER_TYPE(type) (sizeof(type) * BITS_PER_BYTE)
++#define BIT_GAP(bits, end, start) ((((end) - (start)) + BIT_ULL(bits)) & (BIT_ULL(bits) - 1))
+ 
+ static void mlx5e_am_calc_stats(struct mlx5e_rx_am_sample *start,
+                               struct mlx5e_rx_am_sample *end,
+@@ -277,8 +279,9 @@ static void mlx5e_am_calc_stats(struct m
+ {
+       /* u32 holds up to 71 minutes, should be enough */
+       u32 delta_us = ktime_us_delta(end->time, start->time);
+-      unsigned int npkts = end->pkt_ctr - start->pkt_ctr;
+-      unsigned int nbytes = end->byte_ctr - start->byte_ctr;
++      u32 npkts = BIT_GAP(BITS_PER_TYPE(u32), end->pkt_ctr, start->pkt_ctr);
++      u32 nbytes = BIT_GAP(BITS_PER_TYPE(u32), end->byte_ctr,
++                           start->byte_ctr);
+ 
+       if (!delta_us)
+               return;
+@@ -311,7 +314,8 @@ void mlx5e_rx_am(struct mlx5e_rq *rq)
+ 
+       switch (am->state) {
+       case MLX5E_AM_MEASURE_IN_PROGRESS:
+-              nevents = rq->cq.event_ctr - am->start_sample.event_ctr;
++              nevents = BIT_GAP(BITS_PER_TYPE(u16), rq->cq.event_ctr,
++                                am->start_sample.event_ctr);
+               if (nevents < MLX5E_AM_NEVENTS)
+                       break;
+               mlx5e_am_sample(rq, &end_sample);
diff --git a/queue-4.9/net-tipc-fix-a-sleep-in-atomic-bug-in-tipc_msg_reverse.patch b/queue-4.9/net-tipc-fix-a-sleep-in-atomic-bug-in-tipc_msg_reverse.patch

new file mode 100644 (file)

index 0000000..0fcd68f
--- /dev/null
+++ b/queue-4.9/net-tipc-fix-a-sleep-in-atomic-bug-in-tipc_msg_reverse.patch
@@ -0,0 +1,44 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Jia-Ju Bai <baijiaju1990@163.com>
+Date: Sat, 10 Jun 2017 17:03:35 +0800
+Subject: net: tipc: Fix a sleep-in-atomic bug in tipc_msg_reverse
+
+From: Jia-Ju Bai <baijiaju1990@163.com>
+
+
+[ Upstream commit 343eba69c6968190d8654b857aea952fed9a6749 ]
+
+The kernel may sleep under a rcu read lock in tipc_msg_reverse, and the
+function call path is:
+tipc_l2_rcv_msg (acquire the lock by rcu_read_lock)
+  tipc_rcv
+    tipc_sk_rcv
+      tipc_msg_reverse
+        pskb_expand_head(GFP_KERNEL) --> may sleep
+tipc_node_broadcast
+  tipc_node_xmit_skb
+    tipc_node_xmit
+      tipc_sk_rcv
+        tipc_msg_reverse
+          pskb_expand_head(GFP_KERNEL) --> may sleep
+
+To fix it, "GFP_KERNEL" is replaced with "GFP_ATOMIC".
+
+Signed-off-by: Jia-Ju Bai <baijiaju1990@163.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/tipc/msg.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/tipc/msg.c
++++ b/net/tipc/msg.c
+@@ -508,7 +508,7 @@ bool tipc_msg_reverse(u32 own_node,  str
+       }
+ 
+       if (skb_cloned(_skb) &&
+-          pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_KERNEL))
++          pskb_expand_head(_skb, BUF_HEADROOM, BUF_TAILROOM, GFP_ATOMIC))
+               goto exit;
+ 
+       /* Now reverse the concerned fields */
diff --git a/queue-4.9/net-vrf-make-add_fib_rules-per-network-namespace-flag.patch b/queue-4.9/net-vrf-make-add_fib_rules-per-network-namespace-flag.patch

new file mode 100644 (file)

index 0000000..983fb15
--- /dev/null
+++ b/queue-4.9/net-vrf-make-add_fib_rules-per-network-namespace-flag.patch
@@ -0,0 +1,110 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: David Ahern <dsahern@gmail.com>
+Date: Thu, 8 Jun 2017 11:31:11 -0600
+Subject: net: vrf: Make add_fib_rules per network namespace flag
+
+From: David Ahern <dsahern@gmail.com>
+
+
+[ Upstream commit 097d3c9508dc58286344e4a22b300098cf0c1566 ]
+
+Commit 1aa6c4f6b8cd8 ("net: vrf: Add l3mdev rules on first device create")
+adds the l3mdev FIB rule the first time a VRF device is created. However,
+it only creates the rule once and only in the namespace the first device
+is created - which may not be init_net. Fix by using the net_generic
+capability to make the add_fib_rules flag per network namespace.
+
+Fixes: 1aa6c4f6b8cd8 ("net: vrf: Add l3mdev rules on first device create")
+Reported-by: Petr Machata <petrm@mellanox.com>
+Signed-off-by: David Ahern <dsahern@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/vrf.c |   36 ++++++++++++++++++++++++++++++++----
+ 1 file changed, 32 insertions(+), 4 deletions(-)
+
+--- a/drivers/net/vrf.c
++++ b/drivers/net/vrf.c
+@@ -36,12 +36,14 @@
+ #include <net/addrconf.h>
+ #include <net/l3mdev.h>
+ #include <net/fib_rules.h>
++#include <net/netns/generic.h>
+ 
+ #define DRV_NAME      "vrf"
+ #define DRV_VERSION   "1.0"
+ 
+ #define FIB_RULE_PREF  1000       /* default preference for FIB rules */
+-static bool add_fib_rules = true;
++
++static unsigned int vrf_net_id;
+ 
+ struct net_vrf {
+       struct rtable __rcu     *rth;
+@@ -1237,6 +1239,8 @@ static int vrf_newlink(struct net *src_n
+                      struct nlattr *tb[], struct nlattr *data[])
+ {
+       struct net_vrf *vrf = netdev_priv(dev);
++      bool *add_fib_rules;
++      struct net *net;
+       int err;
+ 
+       if (!data || !data[IFLA_VRF_TABLE])
+@@ -1252,13 +1256,15 @@ static int vrf_newlink(struct net *src_n
+       if (err)
+               goto out;
+ 
+-      if (add_fib_rules) {
++      net = dev_net(dev);
++      add_fib_rules = net_generic(net, vrf_net_id);
++      if (*add_fib_rules) {
+               err = vrf_add_fib_rules(dev);
+               if (err) {
+                       unregister_netdevice(dev);
+                       goto out;
+               }
+-              add_fib_rules = false;
++              *add_fib_rules = false;
+       }
+ 
+ out:
+@@ -1341,16 +1347,38 @@ static struct notifier_block vrf_notifie
+       .notifier_call = vrf_device_event,
+ };
+ 
++/* Initialize per network namespace state */
++static int __net_init vrf_netns_init(struct net *net)
++{
++      bool *add_fib_rules = net_generic(net, vrf_net_id);
++
++      *add_fib_rules = true;
++
++      return 0;
++}
++
++static struct pernet_operations vrf_net_ops __net_initdata = {
++      .init = vrf_netns_init,
++      .id   = &vrf_net_id,
++      .size = sizeof(bool),
++};
++
+ static int __init vrf_init_module(void)
+ {
+       int rc;
+ 
+       register_netdevice_notifier(&vrf_notifier_block);
+ 
+-      rc = rtnl_link_register(&vrf_link_ops);
++      rc = register_pernet_subsys(&vrf_net_ops);
+       if (rc < 0)
+               goto error;
+ 
++      rc = rtnl_link_register(&vrf_link_ops);
++      if (rc < 0) {
++              unregister_pernet_subsys(&vrf_net_ops);
++              goto error;
++      }
++
+       return 0;
+ 
+ error:
diff --git a/queue-4.9/net-zero-ifla_vf_info-in-rtnl_fill_vfinfo.patch b/queue-4.9/net-zero-ifla_vf_info-in-rtnl_fill_vfinfo.patch

new file mode 100644 (file)

index 0000000..fedda9b
--- /dev/null
+++ b/queue-4.9/net-zero-ifla_vf_info-in-rtnl_fill_vfinfo.patch
@@ -0,0 +1,42 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: "Mintz, Yuval" <Yuval.Mintz@cavium.com>
+Date: Wed, 7 Jun 2017 21:00:33 +0300
+Subject: net: Zero ifla_vf_info in rtnl_fill_vfinfo()
+
+From: "Mintz, Yuval" <Yuval.Mintz@cavium.com>
+
+
+[ Upstream commit 0eed9cf58446b28b233388b7f224cbca268b6986 ]
+
+Some of the structure's fields are not initialized by rtnetlink.
+If the driver doesn't set those in ndo_get_vf_config(), they would
+leak memory to the user.
+
+Signed-off-by: Yuval Mintz <Yuval.Mintz@cavium.com>
+CC: Michal Schmidt <mschmidt@redhat.com>
+Reviewed-by: Greg Rose <gvrose8192@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/rtnetlink.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/net/core/rtnetlink.c
++++ b/net/core/rtnetlink.c
+@@ -1130,6 +1130,8 @@ static noinline_for_stack int rtnl_fill_
+       struct ifla_vf_mac vf_mac;
+       struct ifla_vf_info ivi;
+ 
++      memset(&ivi, 0, sizeof(ivi));
++
+       /* Not all SR-IOV capable drivers support the
+        * spoofcheck and "RSS query enable" query.  Preset to
+        * -1 so the user space tool can detect that the driver
+@@ -1138,7 +1140,6 @@ static noinline_for_stack int rtnl_fill_
+       ivi.spoofchk = -1;
+       ivi.rss_query_en = -1;
+       ivi.trusted = -1;
+-      memset(ivi.mac, 0, sizeof(ivi.mac));
+       /* The default value for VF link state is "auto"
+        * IFLA_VF_LINK_STATE_AUTO which equals zero
+        */
diff --git a/queue-4.9/proc-snmp6-use-correct-type-in-memset.patch b/queue-4.9/proc-snmp6-use-correct-type-in-memset.patch

new file mode 100644 (file)

index 0000000..2966bd6
--- /dev/null
+++ b/queue-4.9/proc-snmp6-use-correct-type-in-memset.patch
@@ -0,0 +1,32 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Christian Perle <christian.perle@secunet.com>
+Date: Mon, 12 Jun 2017 10:06:57 +0200
+Subject: proc: snmp6: Use correct type in memset
+
+From: Christian Perle <christian.perle@secunet.com>
+
+
+[ Upstream commit 3500cd73dff48f28f4ba80c171c4c80034d40f76 ]
+
+Reading /proc/net/snmp6 yields bogus values on 32 bit kernels.
+Use "u64" instead of "unsigned long" in sizeof().
+
+Fixes: 4a4857b1c81e ("proc: Reduce cache miss in snmp6_seq_show")
+Signed-off-by: Christian Perle <christian.perle@secunet.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/proc.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv6/proc.c
++++ b/net/ipv6/proc.c
+@@ -219,7 +219,7 @@ static void snmp6_seq_show_item64(struct
+       u64 buff64[SNMP_MIB_MAX];
+       int i;
+ 
+-      memset(buff64, 0, sizeof(unsigned long) * SNMP_MIB_MAX);
++      memset(buff64, 0, sizeof(u64) * SNMP_MIB_MAX);
+ 
+       snmp_get_cpu_field64_batch(buff64, itemlist, mib, syncpoff);
+       for (i = 0; itemlist[i].name; i++)
diff --git a/queue-4.9/rtnetlink-add-ifla_group-to-ifla_policy.patch b/queue-4.9/rtnetlink-add-ifla_group-to-ifla_policy.patch

new file mode 100644 (file)

index 0000000..fd54224
--- /dev/null
+++ b/queue-4.9/rtnetlink-add-ifla_group-to-ifla_policy.patch
@@ -0,0 +1,42 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Serhey Popovych <serhe.popovych@gmail.com>
+Date: Tue, 20 Jun 2017 14:35:23 +0300
+Subject: rtnetlink: add IFLA_GROUP to ifla_policy
+
+From: Serhey Popovych <serhe.popovych@gmail.com>
+
+
+[ Upstream commit db833d40ad3263b2ee3b59a1ba168bb3cfed8137 ]
+
+Network interface group support was added a while ago; however,
+there has been no IFLA_GROUP attribute description in the policy
+or in the netlink message size calculations until now.
+
+Add IFLA_GROUP attribute to the policy.
+
+Fixes: cbda10fa97d7 ("net_device: add support for network device groups")
+Signed-off-by: Serhey Popovych <serhe.popovych@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/rtnetlink.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/net/core/rtnetlink.c
++++ b/net/core/rtnetlink.c
+@@ -937,6 +937,7 @@ static noinline size_t if_nlmsg_size(con
+              + nla_total_size(1) /* IFLA_LINKMODE */
+              + nla_total_size(4) /* IFLA_CARRIER_CHANGES */
+              + nla_total_size(4) /* IFLA_LINK_NETNSID */
++             + nla_total_size(4) /* IFLA_GROUP */
+              + nla_total_size(ext_filter_mask
+                               & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */
+              + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */
+@@ -1465,6 +1466,7 @@ static const struct nla_policy ifla_poli
+       [IFLA_LINK_NETNSID]     = { .type = NLA_S32 },
+       [IFLA_PROTO_DOWN]       = { .type = NLA_U8 },
+       [IFLA_XDP]              = { .type = NLA_NESTED },
++      [IFLA_GROUP]            = { .type = NLA_U32 },
+ };
+ 
+ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
diff --git a/queue-4.9/sctp-disable-bh-in-sctp_for_each_endpoint.patch b/queue-4.9/sctp-disable-bh-in-sctp_for_each_endpoint.patch

new file mode 100644 (file)

index 0000000..e4c5bf5
--- /dev/null
+++ b/queue-4.9/sctp-disable-bh-in-sctp_for_each_endpoint.patch
@@ -0,0 +1,49 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Xin Long <lucien.xin@gmail.com>
+Date: Sat, 10 Jun 2017 14:48:14 +0800
+Subject: sctp: disable BH in sctp_for_each_endpoint
+
+From: Xin Long <lucien.xin@gmail.com>
+
+
+[ Upstream commit 581409dacc9176b0de1f6c4ca8d66e13aa8e1b29 ]
+
+Currently sctp holds the read_lock while iterating sctp_ep_hashtable
+without disabling BH. If the CPU switches to another thread A at this
+moment, thread A may try to take the write_lock with BH disabled.
+
+As BH is disabled, the CPU cannot switch back to the thread holding the
+read_lock, while thread A keeps waiting for that lock to be released.
+A deadlock would be triggered by this.
+
+This patch is to fix this dead lock by calling read_lock_bh instead to
+disable BH when holding the read_lock in sctp_for_each_endpoint.
+
+Fixes: 626d16f50f39 ("sctp: export some apis or variables for sctp_diag and reuse some for proc")
+Reported-by: Xiumei Mu <xmu@redhat.com>
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sctp/socket.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/sctp/socket.c
++++ b/net/sctp/socket.c
+@@ -4460,13 +4460,13 @@ int sctp_for_each_endpoint(int (*cb)(str
+ 
+       for (head = sctp_ep_hashtable; hash < sctp_ep_hashsize;
+            hash++, head++) {
+-              read_lock(&head->lock);
++              read_lock_bh(&head->lock);
+               sctp_for_each_hentry(epb, &head->chain) {
+                       err = cb(sctp_ep(epb), p);
+                       if (err)
+                               break;
+               }
+-              read_unlock(&head->lock);
++              read_unlock_bh(&head->lock);
+       }
+ 
+       return err;
diff --git a/queue-4.9/sctp-return-next-obj-by-passing-pos-1-into-sctp_transport_get_idx.patch b/queue-4.9/sctp-return-next-obj-by-passing-pos-1-into-sctp_transport_get_idx.patch

new file mode 100644 (file)

index 0000000..acc67c1
--- /dev/null
+++ b/queue-4.9/sctp-return-next-obj-by-passing-pos-1-into-sctp_transport_get_idx.patch
@@ -0,0 +1,46 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Xin Long <lucien.xin@gmail.com>
+Date: Thu, 15 Jun 2017 17:49:08 +0800
+Subject: sctp: return next obj by passing pos + 1 into sctp_transport_get_idx
+
+From: Xin Long <lucien.xin@gmail.com>
+
+
+[ Upstream commit 988c7322116970696211e902b468aefec95b6ec4 ]
+
+In sctp_for_each_transport, pos is used to save how many objs it has
+dumped. Now it gets the last obj by sctp_transport_get_idx, then gets
+the next obj by sctp_transport_get_next.
+
+The issue is that if, in the meantime, some objs in the transport hashtable
+are removed and the number of objs drops below pos, sctp_transport_get_idx
+would return NULL and hti.walker.tbl is NULL as well. At this point it
+should stop hti instead of going on to get the next obj; otherwise it
+would cause a NULL pointer dereference in sctp_transport_get_next.
+
+This patch is to pass pos + 1 into sctp_transport_get_idx to get the
+next obj directly, even if pos > objs nums, it would return NULL and
+stop hti.
+
+Fixes: 626d16f50f39 ("sctp: export some apis or variables for sctp_diag and reuse some for proc")
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sctp/socket.c |    5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/net/sctp/socket.c
++++ b/net/sctp/socket.c
+@@ -4506,9 +4506,8 @@ int sctp_for_each_transport(int (*cb)(st
+       if (err)
+               return err;
+ 
+-      sctp_transport_get_idx(net, &hti, pos);
+-      obj = sctp_transport_get_next(net, &hti);
+-      for (; obj && !IS_ERR(obj); obj = sctp_transport_get_next(net, &hti)) {
++      obj = sctp_transport_get_idx(net, &hti, pos + 1);
++      for (; !IS_ERR_OR_NULL(obj); obj = sctp_transport_get_next(net, &hti)) {
+               struct sctp_transport *transport = obj;
+ 
+               if (!sctp_transport_hold(transport))
diff --git a/queue-4.9/series b/queue-4.9/series

index 035e36a79211e4adc997dea33cf6bf596c688f4b..36c697c78c0af1a13190b486b69cf2423649d30e 100644 (file)
--- a/queue-4.9/series
+++ b/queue-4.9/series
@@ -1 +1,25 @@
  ipv6-release-dst-on-error-in-ip6_dst_lookup_tail.patch
+net-don-t-call-strlen-on-non-terminated-string-in-dev_set_alias.patch
+decnet-dn_rtmsg-improve-input-length-sanitization-in-dnrmg_receive_user_skb.patch
+net-zero-ifla_vf_info-in-rtnl_fill_vfinfo.patch
+net-vrf-make-add_fib_rules-per-network-namespace-flag.patch
+af_unix-add-sockaddr-length-checks-before-accessing-sa_family-in-bind-and-connect-handlers.patch
+fix-an-intermittent-pr_emerg-warning-about-lo-becoming-free.patch
+sctp-disable-bh-in-sctp_for_each_endpoint.patch
+net-caif-fix-a-sleep-in-atomic-bug-in-cfpkt_create_pfx.patch
+net-tipc-fix-a-sleep-in-atomic-bug-in-tipc_msg_reverse.patch
+net-mlx5e-added-bw-check-for-dim-decision-mechanism.patch
+net-mlx5e-fix-wrong-indications-in-dim-due-to-counter-wraparound.patch
+proc-snmp6-use-correct-type-in-memset.patch
+igmp-acquire-pmc-lock-for-ip_mc_clear_src.patch
+igmp-add-a-missing-spin_lock_init.patch
+ipv6-fix-calling-in6_ifa_hold-incorrectly-for-dad-work.patch
+sctp-return-next-obj-by-passing-pos-1-into-sctp_transport_get_idx.patch
+net-mlx5e-avoid-doing-a-cleanup-call-if-the-profile-doesn-t-have-it.patch
+net-mlx5-wait-for-fw-readiness-before-initializing-command-interface.patch
+net-mlx5e-fix-timestamping-capabilities-reporting.patch
+decnet-always-not-take-dst-__refcnt-when-inserting-dst-into-hash-table.patch
+net-8021q-fix-one-possible-panic-caused-by-bug_on-in-free_netdev.patch
+sfc-provide-dummy-definitions-of-vswitch-functions.patch
+ipv6-do-not-leak-throw-route-references.patch
+rtnetlink-add-ifla_group-to-ifla_policy.patch
diff --git a/queue-4.9/sfc-provide-dummy-definitions-of-vswitch-functions.patch b/queue-4.9/sfc-provide-dummy-definitions-of-vswitch-functions.patch

new file mode 100644 (file)

index 0000000..b23b7c9
--- /dev/null
+++ b/queue-4.9/sfc-provide-dummy-definitions-of-vswitch-functions.patch
@@ -0,0 +1,50 @@
+From foo@baz Thu Jun 29 18:57:46 CEST 2017
+From: Bert Kenward <bkenward@solarflare.com>
+Date: Fri, 16 Jun 2017 09:45:08 +0100
+Subject: sfc: provide dummy definitions of vswitch functions
+
+From: Bert Kenward <bkenward@solarflare.com>
+
+
+efx_probe_all() calls efx->type->vswitching_probe during probe. For
+SFC4000 (Falcon) NICs this function is not defined, leading to a BUG
+with the top of the call stack similar to:
+  ? efx_pci_probe_main+0x29a/0x830
+  efx_pci_probe+0x7d3/0xe70
+
+vswitching_restore and vswitching_remove also need to be defined.
+
+Fixed in mainline by:
+commit 5a6681e22c14 ("sfc: separate out SFC4000 ("Falcon") support into new sfc-falcon driver")
+
+Fixes: 6d8aaaf6f798 ("sfc: create VEB vswitch and vport above default firmware setup")
+Signed-off-by: Bert Kenward <bkenward@solarflare.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/sfc/falcon.c |   10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+--- a/drivers/net/ethernet/sfc/falcon.c
++++ b/drivers/net/ethernet/sfc/falcon.c
+@@ -2801,6 +2801,11 @@ const struct efx_nic_type falcon_a1_nic_
+       .timer_period_max =  1 << FRF_AB_TC_TIMER_VAL_WIDTH,
+       .offload_features = NETIF_F_IP_CSUM,
+       .mcdi_max_ver = -1,
++#ifdef CONFIG_SFC_SRIOV
++      .vswitching_probe = efx_port_dummy_op_int,
++      .vswitching_restore = efx_port_dummy_op_int,
++      .vswitching_remove = efx_port_dummy_op_void,
++#endif
+ };
+ 
+ const struct efx_nic_type falcon_b0_nic_type = {
+@@ -2902,4 +2907,9 @@ const struct efx_nic_type falcon_b0_nic_
+       .offload_features = NETIF_F_IP_CSUM | NETIF_F_RXHASH | NETIF_F_NTUPLE,
+       .mcdi_max_ver = -1,
+       .max_rx_ip_filters = FR_BZ_RX_FILTER_TBL0_ROWS,
++#ifdef CONFIG_SFC_SRIOV
++      .vswitching_probe = efx_port_dummy_op_int,
++      .vswitching_restore = efx_port_dummy_op_int,
++      .vswitching_remove = efx_port_dummy_op_void,
++#endif
+ };
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Thu, 29 Jun 2017 16:59:01 +0000 (18:59 +0200)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Thu, 29 Jun 2017 16:59:01 +0000 (18:59 +0200)
queue-4.9/af_unix-add-sockaddr-length-checks-before-accessing-sa_family-in-bind-and-connect-handlers.patch	[new file with mode: 0644]	patch | blob
queue-4.9/decnet-always-not-take-dst-__refcnt-when-inserting-dst-into-hash-table.patch	[new file with mode: 0644]	patch | blob
queue-4.9/decnet-dn_rtmsg-improve-input-length-sanitization-in-dnrmg_receive_user_skb.patch	[new file with mode: 0644]	patch | blob
queue-4.9/fix-an-intermittent-pr_emerg-warning-about-lo-becoming-free.patch	[new file with mode: 0644]	patch | blob
queue-4.9/igmp-acquire-pmc-lock-for-ip_mc_clear_src.patch	[new file with mode: 0644]	patch | blob
queue-4.9/igmp-add-a-missing-spin_lock_init.patch	[new file with mode: 0644]	patch | blob
queue-4.9/ipv6-do-not-leak-throw-route-references.patch	[new file with mode: 0644]	patch | blob
queue-4.9/ipv6-fix-calling-in6_ifa_hold-incorrectly-for-dad-work.patch	[new file with mode: 0644]	patch | blob
queue-4.9/net-8021q-fix-one-possible-panic-caused-by-bug_on-in-free_netdev.patch	[new file with mode: 0644]	patch | blob
queue-4.9/net-caif-fix-a-sleep-in-atomic-bug-in-cfpkt_create_pfx.patch	[new file with mode: 0644]	patch | blob
queue-4.9/net-don-t-call-strlen-on-non-terminated-string-in-dev_set_alias.patch	[new file with mode: 0644]	patch | blob
queue-4.9/net-mlx5-wait-for-fw-readiness-before-initializing-command-interface.patch	[new file with mode: 0644]	patch | blob
queue-4.9/net-mlx5e-added-bw-check-for-dim-decision-mechanism.patch	[new file with mode: 0644]	patch | blob
queue-4.9/net-mlx5e-avoid-doing-a-cleanup-call-if-the-profile-doesn-t-have-it.patch	[new file with mode: 0644]	patch | blob
queue-4.9/net-mlx5e-fix-timestamping-capabilities-reporting.patch	[new file with mode: 0644]	patch | blob
queue-4.9/net-mlx5e-fix-wrong-indications-in-dim-due-to-counter-wraparound.patch	[new file with mode: 0644]	patch | blob
queue-4.9/net-tipc-fix-a-sleep-in-atomic-bug-in-tipc_msg_reverse.patch	[new file with mode: 0644]	patch | blob
queue-4.9/net-vrf-make-add_fib_rules-per-network-namespace-flag.patch	[new file with mode: 0644]	patch | blob
queue-4.9/net-zero-ifla_vf_info-in-rtnl_fill_vfinfo.patch	[new file with mode: 0644]	patch | blob
queue-4.9/proc-snmp6-use-correct-type-in-memset.patch	[new file with mode: 0644]	patch | blob
queue-4.9/rtnetlink-add-ifla_group-to-ifla_policy.patch	[new file with mode: 0644]	patch | blob
queue-4.9/sctp-disable-bh-in-sctp_for_each_endpoint.patch	[new file with mode: 0644]	patch | blob
queue-4.9/sctp-return-next-obj-by-passing-pos-1-into-sctp_transport_get_idx.patch	[new file with mode: 0644]	patch | blob
queue-4.9/series		patch | blob | blame | history
queue-4.9/sfc-provide-dummy-definitions-of-vswitch-functions.patch	[new file with mode: 0644]	patch | blob