git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.14-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 31 Dec 2017 10:13:42 +0000 (11:13 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Sun, 31 Dec 2017 10:13:42 +0000 (11:13 +0100)
added patches:
adding-missing-rcu_read_unlock-in-ipxip6_rcv.patch
bnxt_en-fix-sources-of-spurious-netpoll-warnings.patch
ip6_gre-fix-device-features-for-ioctl-setup.patch
ipv4-fib-fix-metrics-match-when-deleting-a-route.patch
ipv4-fix-use-after-free-when-flushing-fib-tables.patch
ipv4-igmp-guard-against-silly-mtu-values.patch
ipv6-honor-specified-parameters-in-fibmatch-lookup.patch
ipv6-mcast-better-catch-silly-mtu-values.patch
ipv6-set-all.accept_dad-to-0-by-default.patch
mlxsw-spectrum-disable-mac-learning-for-ovs-port.patch
net-bridge-fix-early-call-to-br_stp_change_bridge_id-and-plug-newlink-leaks.patch
net-dsa-bcm_sf2-clear-iddq_global_pwr-bit-for-phy.patch
net-fec-unmap-the-xmit-buffer-that-are-not-transferred-by-dma.patch
net-fix-double-free-and-memory-corruption-in-get_net_ns_by_id.patch
net-igmp-use-correct-source-address-on-igmpv3-reports.patch
net-ipv4-fix-for-a-race-condition-in-raw_sendmsg.patch
net-mlx5-fix-error-flow-in-create_qp-command.patch
net-mlx5-fix-rate-limit-packet-pacing-naming-and-struct.patch
net-mlx5-fpga-return-einval-if-size-is-zero.patch
net-mlx5e-add-refcount-to-vxlan-structure.patch
net-mlx5e-fix-features-check-of-ipv6-traffic.patch
net-mlx5e-fix-possible-deadlock-of-vxlan-lock.patch
net-mlx5e-prevent-possible-races-in-vxlan-control-flow.patch
net-mvmdio-disable-unprepare-clocks-in-eprobe_defer-case.patch
net-phy-marvell-limit-88m1101-autoneg-errata-to-88e1145-as-well.patch
net-phy-micrel-ksz9031-reconfigure-autoneg-after-phy-autoneg-workaround.patch
net-qmi_wwan-add-sierra-em7565-1199-9091.patch
net-reevalulate-autoflowlabel-setting-after-sysctl-setting.patch
net-sched-fix-static-key-imbalance-in-case-of-ingress-clsact_init-error.patch
netlink-add-netns-check-on-taps.patch
openvswitch-fix-pop_vlan-action-for-double-tagged-frames.patch
phylink-ensure-an-is-enabled.patch
phylink-ensure-the-phy-interface-mode-is-appropriately-set.patch
ptr_ring-add-barriers.patch
rds-check-cmsg_len-before-dereferencing-cmsg_data.patch
revert-mlx5-move-affinity-hints-assignments-to-generic-code.patch
s390-qeth-apply-takeover-changes-when-mode-is-toggled.patch
s390-qeth-don-t-apply-takeover-changes-to-rxip.patch
s390-qeth-fix-error-handling-in-checksum-cmd-callback.patch
s390-qeth-lock-ip-table-while-applying-takeover-changes.patch
s390-qeth-update-takeover-ips-after-configuration-change.patch
sctp-make-sure-stream-nums-can-match-optlen-in-sctp_setsockopt_reset_streams.patch
sctp-replace-use-of-sockets_allocated-with-specified-macro.patch
sfc-pass-valid-pointers-from-efx_enqueue_unwind.patch
skbuff-in-skb_copy_ubufs-unclone-before-releasing-zerocopy.patch
skbuff-orphan-frags-before-zerocopy-clone.patch
skbuff-skb_copy_ubufs-must-release-uarg-even-without-user-frags.patch
sock-free-skb-in-skb_complete_tx_timestamp-on-error.patch
tcp-fix-potential-underestimation-on-rcv_rtt.patch
tcp-invalidate-rate-samples-during-sack-reneging.patch
tcp-md5sig-use-skb-s-saddr-when-replying-to-an-incoming-segment.patch
tcp-refresh-tcp_mstamp-from-timers-callbacks.patch
tcp_bbr-record-full-bw-reached-decision-in-new-full_bw_reached-bit.patch
tcp_bbr-reset-full-pipe-detection-on-loss-recovery-undo.patch
tcp_bbr-reset-long-term-bandwidth-sampling-on-loss-recovery-undo.patch
tg3-fix-rx-hang-on-mtu-change-with-5717-5719.patch
tipc-fix-hanging-poll-for-stream-sockets.patch
vxlan-restore-dev-mtu-setting-based-on-lower-device.patch

59 files changed:
queue-4.14/adding-missing-rcu_read_unlock-in-ipxip6_rcv.patch [new file with mode: 0644]
queue-4.14/bnxt_en-fix-sources-of-spurious-netpoll-warnings.patch [new file with mode: 0644]
queue-4.14/ip6_gre-fix-device-features-for-ioctl-setup.patch [new file with mode: 0644]
queue-4.14/ipv4-fib-fix-metrics-match-when-deleting-a-route.patch [new file with mode: 0644]
queue-4.14/ipv4-fix-use-after-free-when-flushing-fib-tables.patch [new file with mode: 0644]
queue-4.14/ipv4-igmp-guard-against-silly-mtu-values.patch [new file with mode: 0644]
queue-4.14/ipv6-honor-specified-parameters-in-fibmatch-lookup.patch [new file with mode: 0644]
queue-4.14/ipv6-mcast-better-catch-silly-mtu-values.patch [new file with mode: 0644]
queue-4.14/ipv6-set-all.accept_dad-to-0-by-default.patch [new file with mode: 0644]
queue-4.14/mlxsw-spectrum-disable-mac-learning-for-ovs-port.patch [new file with mode: 0644]
queue-4.14/net-bridge-fix-early-call-to-br_stp_change_bridge_id-and-plug-newlink-leaks.patch [new file with mode: 0644]
queue-4.14/net-dsa-bcm_sf2-clear-iddq_global_pwr-bit-for-phy.patch [new file with mode: 0644]
queue-4.14/net-fec-unmap-the-xmit-buffer-that-are-not-transferred-by-dma.patch [new file with mode: 0644]
queue-4.14/net-fix-double-free-and-memory-corruption-in-get_net_ns_by_id.patch [new file with mode: 0644]
queue-4.14/net-igmp-use-correct-source-address-on-igmpv3-reports.patch [new file with mode: 0644]
queue-4.14/net-ipv4-fix-for-a-race-condition-in-raw_sendmsg.patch [new file with mode: 0644]
queue-4.14/net-mlx5-fix-error-flow-in-create_qp-command.patch [new file with mode: 0644]
queue-4.14/net-mlx5-fix-rate-limit-packet-pacing-naming-and-struct.patch [new file with mode: 0644]
queue-4.14/net-mlx5-fpga-return-einval-if-size-is-zero.patch [new file with mode: 0644]
queue-4.14/net-mlx5e-add-refcount-to-vxlan-structure.patch [new file with mode: 0644]
queue-4.14/net-mlx5e-fix-features-check-of-ipv6-traffic.patch [new file with mode: 0644]
queue-4.14/net-mlx5e-fix-possible-deadlock-of-vxlan-lock.patch [new file with mode: 0644]
queue-4.14/net-mlx5e-prevent-possible-races-in-vxlan-control-flow.patch [new file with mode: 0644]
queue-4.14/net-mvmdio-disable-unprepare-clocks-in-eprobe_defer-case.patch [new file with mode: 0644]
queue-4.14/net-phy-marvell-limit-88m1101-autoneg-errata-to-88e1145-as-well.patch [new file with mode: 0644]
queue-4.14/net-phy-micrel-ksz9031-reconfigure-autoneg-after-phy-autoneg-workaround.patch [new file with mode: 0644]
queue-4.14/net-qmi_wwan-add-sierra-em7565-1199-9091.patch [new file with mode: 0644]
queue-4.14/net-reevalulate-autoflowlabel-setting-after-sysctl-setting.patch [new file with mode: 0644]
queue-4.14/net-sched-fix-static-key-imbalance-in-case-of-ingress-clsact_init-error.patch [new file with mode: 0644]
queue-4.14/netlink-add-netns-check-on-taps.patch [new file with mode: 0644]
queue-4.14/openvswitch-fix-pop_vlan-action-for-double-tagged-frames.patch [new file with mode: 0644]
queue-4.14/phylink-ensure-an-is-enabled.patch [new file with mode: 0644]
queue-4.14/phylink-ensure-the-phy-interface-mode-is-appropriately-set.patch [new file with mode: 0644]
queue-4.14/ptr_ring-add-barriers.patch [new file with mode: 0644]
queue-4.14/rds-check-cmsg_len-before-dereferencing-cmsg_data.patch [new file with mode: 0644]
queue-4.14/revert-mlx5-move-affinity-hints-assignments-to-generic-code.patch [new file with mode: 0644]
queue-4.14/s390-qeth-apply-takeover-changes-when-mode-is-toggled.patch [new file with mode: 0644]
queue-4.14/s390-qeth-don-t-apply-takeover-changes-to-rxip.patch [new file with mode: 0644]
queue-4.14/s390-qeth-fix-error-handling-in-checksum-cmd-callback.patch [new file with mode: 0644]
queue-4.14/s390-qeth-lock-ip-table-while-applying-takeover-changes.patch [new file with mode: 0644]
queue-4.14/s390-qeth-update-takeover-ips-after-configuration-change.patch [new file with mode: 0644]
queue-4.14/sctp-make-sure-stream-nums-can-match-optlen-in-sctp_setsockopt_reset_streams.patch [new file with mode: 0644]
queue-4.14/sctp-replace-use-of-sockets_allocated-with-specified-macro.patch [new file with mode: 0644]
queue-4.14/series
queue-4.14/sfc-pass-valid-pointers-from-efx_enqueue_unwind.patch [new file with mode: 0644]
queue-4.14/skbuff-in-skb_copy_ubufs-unclone-before-releasing-zerocopy.patch [new file with mode: 0644]
queue-4.14/skbuff-orphan-frags-before-zerocopy-clone.patch [new file with mode: 0644]
queue-4.14/skbuff-skb_copy_ubufs-must-release-uarg-even-without-user-frags.patch [new file with mode: 0644]
queue-4.14/sock-free-skb-in-skb_complete_tx_timestamp-on-error.patch [new file with mode: 0644]
queue-4.14/tcp-fix-potential-underestimation-on-rcv_rtt.patch [new file with mode: 0644]
queue-4.14/tcp-invalidate-rate-samples-during-sack-reneging.patch [new file with mode: 0644]
queue-4.14/tcp-md5sig-use-skb-s-saddr-when-replying-to-an-incoming-segment.patch [new file with mode: 0644]
queue-4.14/tcp-refresh-tcp_mstamp-from-timers-callbacks.patch [new file with mode: 0644]
queue-4.14/tcp_bbr-record-full-bw-reached-decision-in-new-full_bw_reached-bit.patch [new file with mode: 0644]
queue-4.14/tcp_bbr-reset-full-pipe-detection-on-loss-recovery-undo.patch [new file with mode: 0644]
queue-4.14/tcp_bbr-reset-long-term-bandwidth-sampling-on-loss-recovery-undo.patch [new file with mode: 0644]
queue-4.14/tg3-fix-rx-hang-on-mtu-change-with-5717-5719.patch [new file with mode: 0644]
queue-4.14/tipc-fix-hanging-poll-for-stream-sockets.patch [new file with mode: 0644]
queue-4.14/vxlan-restore-dev-mtu-setting-based-on-lower-device.patch [new file with mode: 0644]

diff --git a/queue-4.14/adding-missing-rcu_read_unlock-in-ipxip6_rcv.patch b/queue-4.14/adding-missing-rcu_read_unlock-in-ipxip6_rcv.patch
new file mode 100644 (file)
index 0000000..c9aa5a4
--- /dev/null
@@ -0,0 +1,38 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: "Nikita V. Shirokov" <tehnerd@fb.com>
+Date: Wed, 6 Dec 2017 17:15:43 -0800
+Subject: adding missing rcu_read_unlock in ipxip6_rcv
+
+From: "Nikita V. Shirokov" <tehnerd@fb.com>
+
+
+[ Upstream commit 74c4b656c3d92ec4c824ea1a4afd726b7b6568c8 ]
+
+commit 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels")
+introduced a new exit point in ipxip6_rcv. However, rcu_read_unlock is
+missing there. This patch fixes that.
+
+v1->v2:
+ instead of doing rcu_read_unlock in place, we now jump to the "drop"
+ label (to prevent skb leakage)
+
+Fixes: 8d79266bc48c ("ip6_tunnel: add collect_md mode to IPv6 tunnels")
+Signed-off-by: Nikita V. Shirokov <tehnerd@fb.com>
+Acked-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/ip6_tunnel.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv6/ip6_tunnel.c
++++ b/net/ipv6/ip6_tunnel.c
+@@ -912,7 +912,7 @@ static int ipxip6_rcv(struct sk_buff *sk
+               if (t->parms.collect_md) {
+                       tun_dst = ipv6_tun_rx_dst(skb, 0, 0, 0);
+                       if (!tun_dst)
+-                              return 0;
++                              goto drop;
+               }
+               ret = __ip6_tnl_rcv(t, skb, tpi, tun_dst, dscp_ecn_decapsulate,
+                                   log_ecn_error);
diff --git a/queue-4.14/bnxt_en-fix-sources-of-spurious-netpoll-warnings.patch b/queue-4.14/bnxt_en-fix-sources-of-spurious-netpoll-warnings.patch
new file mode 100644 (file)
index 0000000..84302e2
--- /dev/null
@@ -0,0 +1,73 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Calvin Owens <calvinowens@fb.com>
+Date: Fri, 8 Dec 2017 09:05:26 -0800
+Subject: bnxt_en: Fix sources of spurious netpoll warnings
+
+From: Calvin Owens <calvinowens@fb.com>
+
+
+[ Upstream commit 2edbdb3159d6f6bd3a9b6e7f789f2b879699a519 ]
+
+After applying 2270bc5da3497945 ("bnxt_en: Fix netpoll handling") and
+903649e718f80da2 ("bnxt_en: Improve -ENOMEM logic in NAPI poll loop."),
+we still see the following WARN fire:
+
+  ------------[ cut here ]------------
+  WARNING: CPU: 0 PID: 1875170 at net/core/netpoll.c:165 netpoll_poll_dev+0x15a/0x160
+  bnxt_poll+0x0/0xd0 exceeded budget in poll
+  <snip>
+  Call Trace:
+   [<ffffffff814be5cd>] dump_stack+0x4d/0x70
+   [<ffffffff8107e013>] __warn+0xd3/0xf0
+   [<ffffffff8107e07f>] warn_slowpath_fmt+0x4f/0x60
+   [<ffffffff8179519a>] netpoll_poll_dev+0x15a/0x160
+   [<ffffffff81795f38>] netpoll_send_skb_on_dev+0x168/0x250
+   [<ffffffff817962fc>] netpoll_send_udp+0x2dc/0x440
+   [<ffffffff815fa9be>] write_ext_msg+0x20e/0x250
+   [<ffffffff810c8125>] call_console_drivers.constprop.23+0xa5/0x110
+   [<ffffffff810c9549>] console_unlock+0x339/0x5b0
+   [<ffffffff810c9a88>] vprintk_emit+0x2c8/0x450
+   [<ffffffff810c9d5f>] vprintk_default+0x1f/0x30
+   [<ffffffff81173df5>] printk+0x48/0x50
+   [<ffffffffa0197713>] edac_raw_mc_handle_error+0x563/0x5c0 [edac_core]
+   [<ffffffffa0197b9b>] edac_mc_handle_error+0x42b/0x6e0 [edac_core]
+   [<ffffffffa01c3a60>] sbridge_mce_output_error+0x410/0x10d0 [sb_edac]
+   [<ffffffffa01c47cc>] sbridge_check_error+0xac/0x130 [sb_edac]
+   [<ffffffffa0197f3c>] edac_mc_workq_function+0x3c/0x90 [edac_core]
+   [<ffffffff81095f8b>] process_one_work+0x19b/0x480
+   [<ffffffff810967ca>] worker_thread+0x6a/0x520
+   [<ffffffff8109c7c4>] kthread+0xe4/0x100
+   [<ffffffff81884c52>] ret_from_fork+0x22/0x40
+
+This happens because we increment rx_pkts on -ENOMEM and -EIO, resulting
+in rx_pkts > 0. Fix this by only bumping rx_pkts if we were actually
+given a non-zero budget.
+
+Signed-off-by: Calvin Owens <calvinowens@fb.com>
+Acked-by: Michael Chan <michael.chan@broadcom.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+@@ -1875,7 +1875,7 @@ static int bnxt_poll_work(struct bnxt *b
+                        * here forever if we consistently cannot allocate
+                        * buffers.
+                        */
+-                      else if (rc == -ENOMEM)
++                      else if (rc == -ENOMEM && budget)
+                               rx_pkts++;
+                       else if (rc == -EBUSY)  /* partial completion */
+                               break;
+@@ -1961,7 +1961,7 @@ static int bnxt_poll_nitroa0(struct napi
+                               cpu_to_le32(RX_CMPL_ERRORS_CRC_ERROR);
+                       rc = bnxt_rx_pkt(bp, bnapi, &raw_cons, &event);
+-                      if (likely(rc == -EIO))
++                      if (likely(rc == -EIO) && budget)
+                               rx_pkts++;
+                       else if (rc == -EBUSY)  /* partial completion */
+                               break;
diff --git a/queue-4.14/ip6_gre-fix-device-features-for-ioctl-setup.patch b/queue-4.14/ip6_gre-fix-device-features-for-ioctl-setup.patch
new file mode 100644 (file)
index 0000000..1f1ce36
--- /dev/null
@@ -0,0 +1,140 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Alexey Kodanev <alexey.kodanev@oracle.com>
+Date: Wed, 20 Dec 2017 19:36:03 +0300
+Subject: ip6_gre: fix device features for ioctl setup
+
+From: Alexey Kodanev <alexey.kodanev@oracle.com>
+
+
+[ Upstream commit e5a9336adb317db55eb3fe8200856096f3c71109 ]
+
+When ip6gre is created using ioctl, its features, such as
+scatter-gather, GSO and tx-checksumming will be turned off:
+
+  # ip -f inet6 tunnel add gre6 mode ip6gre remote fd00::1
+  # ethtool -k gre6 (truncated output)
+    tx-checksumming: off
+    scatter-gather: off
+    tcp-segmentation-offload: off
+    generic-segmentation-offload: off [requested on]
+
+But when netlink is used, they will be enabled:
+  # ip link add gre6 type ip6gre remote fd00::1
+  # ethtool -k gre6 (truncated output)
+    tx-checksumming: on
+    scatter-gather: on
+    tcp-segmentation-offload: on
+    generic-segmentation-offload: on
+
+This results in a loss of performance when gre6 is created via ioctl.
+The issue was found with LTP/gre tests.
+
+Fix it by moving the setup of device features to a separate function
+and invoke it with ndo_init callback because both netlink and ioctl
+will eventually call it via register_netdevice():
+
+   register_netdevice()
+       - ndo_init() callback -> ip6gre_tunnel_init() or ip6gre_tap_init()
+           - ip6gre_tunnel_init_common()
+                - ip6gre_tnl_init_features()
+
+The moved code also contains two minor style fixes:
+  * removed needless tab from GRE6_FEATURES on NETIF_F_HIGHDMA line.
+  * fixed the issue reported by checkpatch: "Unnecessary parentheses around
+    'nt->encap.type == TUNNEL_ENCAP_NONE'"
+
+Fixes: ac4eb009e477 ("ip6gre: Add support for basic offloads offloads excluding GSO")
+Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/ip6_gre.c |   57 +++++++++++++++++++++++++++++------------------------
+ 1 file changed, 32 insertions(+), 25 deletions(-)
+
+--- a/net/ipv6/ip6_gre.c
++++ b/net/ipv6/ip6_gre.c
+@@ -1020,6 +1020,36 @@ static void ip6gre_tunnel_setup(struct n
+       eth_random_addr(dev->perm_addr);
+ }
++#define GRE6_FEATURES (NETIF_F_SG |           \
++                     NETIF_F_FRAGLIST |       \
++                     NETIF_F_HIGHDMA |        \
++                     NETIF_F_HW_CSUM)
++
++static void ip6gre_tnl_init_features(struct net_device *dev)
++{
++      struct ip6_tnl *nt = netdev_priv(dev);
++
++      dev->features           |= GRE6_FEATURES;
++      dev->hw_features        |= GRE6_FEATURES;
++
++      if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
++              /* TCP offload with GRE SEQ is not supported, nor
++               * can we support 2 levels of outer headers requiring
++               * an update.
++               */
++              if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
++                  nt->encap.type == TUNNEL_ENCAP_NONE) {
++                      dev->features    |= NETIF_F_GSO_SOFTWARE;
++                      dev->hw_features |= NETIF_F_GSO_SOFTWARE;
++              }
++
++              /* Can use a lockless transmit, unless we generate
++               * output sequences
++               */
++              dev->features |= NETIF_F_LLTX;
++      }
++}
++
+ static int ip6gre_tunnel_init_common(struct net_device *dev)
+ {
+       struct ip6_tnl *tunnel;
+@@ -1054,6 +1084,8 @@ static int ip6gre_tunnel_init_common(str
+       if (!(tunnel->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+               dev->mtu -= 8;
++      ip6gre_tnl_init_features(dev);
++
+       return 0;
+ }
+@@ -1302,11 +1334,6 @@ static const struct net_device_ops ip6gr
+       .ndo_get_iflink = ip6_tnl_get_iflink,
+ };
+-#define GRE6_FEATURES (NETIF_F_SG |           \
+-                     NETIF_F_FRAGLIST |       \
+-                     NETIF_F_HIGHDMA |                \
+-                     NETIF_F_HW_CSUM)
+-
+ static void ip6gre_tap_setup(struct net_device *dev)
+ {
+@@ -1386,26 +1413,6 @@ static int ip6gre_newlink(struct net *sr
+       nt->net = dev_net(dev);
+       ip6gre_tnl_link_config(nt, !tb[IFLA_MTU]);
+-      dev->features           |= GRE6_FEATURES;
+-      dev->hw_features        |= GRE6_FEATURES;
+-
+-      if (!(nt->parms.o_flags & TUNNEL_SEQ)) {
+-              /* TCP offload with GRE SEQ is not supported, nor
+-               * can we support 2 levels of outer headers requiring
+-               * an update.
+-               */
+-              if (!(nt->parms.o_flags & TUNNEL_CSUM) ||
+-                  (nt->encap.type == TUNNEL_ENCAP_NONE)) {
+-                      dev->features    |= NETIF_F_GSO_SOFTWARE;
+-                      dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+-              }
+-
+-              /* Can use a lockless transmit, unless we generate
+-               * output sequences
+-               */
+-              dev->features |= NETIF_F_LLTX;
+-      }
+-
+       err = register_netdevice(dev);
+       if (err)
+               goto out;
diff --git a/queue-4.14/ipv4-fib-fix-metrics-match-when-deleting-a-route.patch b/queue-4.14/ipv4-fib-fix-metrics-match-when-deleting-a-route.patch
new file mode 100644 (file)
index 0000000..49bb75a
--- /dev/null
@@ -0,0 +1,60 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Phil Sutter <phil@nwl.cc>
+Date: Tue, 19 Dec 2017 15:17:13 +0100
+Subject: ipv4: fib: Fix metrics match when deleting a route
+
+From: Phil Sutter <phil@nwl.cc>
+
+
+[ Upstream commit d03a45572efa068fa64db211d6d45222660e76c5 ]
+
+The recently added fib_metrics_match() causes a regression for routes
+with both RTAX_FEATURES and RTAX_CC_ALGO if the latter has
+TCP_CONG_NEEDS_ECN flag set:
+
+| # ip link add d0 type dummy
+| # ip link set d0 up
+| # ip route add 172.29.29.0/24 dev d0 features ecn congctl dctcp
+| # ip route del 172.29.29.0/24 dev d0 features ecn congctl dctcp
+| RTNETLINK answers: No such process
+
+During route insertion, fib_convert_metrics() detects that the given CC
+algo requires ECN and hence sets DST_FEATURE_ECN_CA bit in
+RTAX_FEATURES.
+
+During route deletion though, fib_metrics_match() compares stored
+RTAX_FEATURES value with that from userspace (which obviously has no
+knowledge about DST_FEATURE_ECN_CA) and fails.
+
+Fixes: 5f9ae3d9e7e4a ("ipv4: do metrics match when looking up and deleting a route")
+Signed-off-by: Phil Sutter <phil@nwl.cc>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/fib_semantics.c |    8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/fib_semantics.c
++++ b/net/ipv4/fib_semantics.c
+@@ -706,7 +706,7 @@ bool fib_metrics_match(struct fib_config
+       nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+               int type = nla_type(nla);
+-              u32 val;
++              u32 fi_val, val;
+               if (!type)
+                       continue;
+@@ -723,7 +723,11 @@ bool fib_metrics_match(struct fib_config
+                       val = nla_get_u32(nla);
+               }
+-              if (fi->fib_metrics->metrics[type - 1] != val)
++              fi_val = fi->fib_metrics->metrics[type - 1];
++              if (type == RTAX_FEATURES)
++                      fi_val &= ~DST_FEATURE_ECN_CA;
++
++              if (fi_val != val)
+                       return false;
+       }
diff --git a/queue-4.14/ipv4-fix-use-after-free-when-flushing-fib-tables.patch b/queue-4.14/ipv4-fix-use-after-free-when-flushing-fib-tables.patch
new file mode 100644 (file)
index 0000000..a2b8b88
--- /dev/null
@@ -0,0 +1,59 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Ido Schimmel <idosch@mellanox.com>
+Date: Wed, 20 Dec 2017 19:34:19 +0200
+Subject: ipv4: Fix use-after-free when flushing FIB tables
+
+From: Ido Schimmel <idosch@mellanox.com>
+
+
+[ Upstream commit b4681c2829e24943aadd1a7bb3a30d41d0a20050 ]
+
+Since commit 0ddcf43d5d4a ("ipv4: FIB Local/MAIN table collapse") the
+local table uses the same trie allocated for the main table when custom
+rules are not in use.
+
+When a net namespace is dismantled, the main table is flushed and freed
+(via an RCU callback) before the local table. In case the callback is
+invoked before the local table is iterated, a use-after-free can occur.
+
+Fix this by iterating over the FIB tables in reverse order, so that the
+main table is always freed after the local table.
+
+v3: Reworded comment according to Alex's suggestion.
+v2: Add a comment to make the fix more explicit per Dave's and Alex's
+feedback.
+
+Fixes: 0ddcf43d5d4a ("ipv4: FIB Local/MAIN table collapse")
+Signed-off-by: Ido Schimmel <idosch@mellanox.com>
+Reported-by: Fengguang Wu <fengguang.wu@intel.com>
+Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/fib_frontend.c |    9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/fib_frontend.c
++++ b/net/ipv4/fib_frontend.c
+@@ -1274,14 +1274,19 @@ err_table_hash_alloc:
+ static void ip_fib_net_exit(struct net *net)
+ {
+-      unsigned int i;
++      int i;
+       rtnl_lock();
+ #ifdef CONFIG_IP_MULTIPLE_TABLES
+       RCU_INIT_POINTER(net->ipv4.fib_main, NULL);
+       RCU_INIT_POINTER(net->ipv4.fib_default, NULL);
+ #endif
+-      for (i = 0; i < FIB_TABLE_HASHSZ; i++) {
++      /* Destroy the tables in reverse order to guarantee that the
++       * local table, ID 255, is destroyed before the main table, ID
++       * 254. This is necessary as the local table may contain
++       * references to data contained in the main table.
++       */
++      for (i = FIB_TABLE_HASHSZ - 1; i >= 0; i--) {
+               struct hlist_head *head = &net->ipv4.fib_table_hash[i];
+               struct hlist_node *tmp;
+               struct fib_table *tb;
diff --git a/queue-4.14/ipv4-igmp-guard-against-silly-mtu-values.patch b/queue-4.14/ipv4-igmp-guard-against-silly-mtu-values.patch
new file mode 100644 (file)
index 0000000..4e5fc3f
--- /dev/null
@@ -0,0 +1,142 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 11 Dec 2017 07:17:39 -0800
+Subject: ipv4: igmp: guard against silly MTU values
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit b5476022bbada3764609368f03329ca287528dc8 ]
+
+IPv4 stack reacts to changes to small MTU, by disabling itself under
+RTNL.
+
+But there is a window where threads not using RTNL can see a wrong
+device mtu. This can lead to surprises, in igmp code where it is
+assumed the mtu is suitable.
+
+Fix this by reading device mtu once and checking IPv4 minimal MTU.
+
+This patch adds missing IPV4_MIN_MTU define, to not abuse
+ETH_MIN_MTU anymore.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/ip.h     |    1 +
+ net/ipv4/devinet.c   |    2 +-
+ net/ipv4/igmp.c      |   24 +++++++++++++++---------
+ net/ipv4/ip_tunnel.c |    4 ++--
+ 4 files changed, 19 insertions(+), 12 deletions(-)
+
+--- a/include/net/ip.h
++++ b/include/net/ip.h
+@@ -34,6 +34,7 @@
+ #include <net/flow_dissector.h>
+ #define IPV4_MAX_PMTU         65535U          /* RFC 2675, Section 5.1 */
++#define IPV4_MIN_MTU          68                      /* RFC 791 */
+ struct sock;
+--- a/net/ipv4/devinet.c
++++ b/net/ipv4/devinet.c
+@@ -1420,7 +1420,7 @@ skip:
+ static bool inetdev_valid_mtu(unsigned int mtu)
+ {
+-      return mtu >= 68;
++      return mtu >= IPV4_MIN_MTU;
+ }
+ static void inetdev_send_gratuitous_arp(struct net_device *dev,
+--- a/net/ipv4/igmp.c
++++ b/net/ipv4/igmp.c
+@@ -404,16 +404,17 @@ static int grec_size(struct ip_mc_list *
+ }
+ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ip_mc_list *pmc,
+-      int type, struct igmpv3_grec **ppgr)
++      int type, struct igmpv3_grec **ppgr, unsigned int mtu)
+ {
+       struct net_device *dev = pmc->interface->dev;
+       struct igmpv3_report *pih;
+       struct igmpv3_grec *pgr;
+-      if (!skb)
+-              skb = igmpv3_newpack(dev, dev->mtu);
+-      if (!skb)
+-              return NULL;
++      if (!skb) {
++              skb = igmpv3_newpack(dev, mtu);
++              if (!skb)
++                      return NULL;
++      }
+       pgr = skb_put(skb, sizeof(struct igmpv3_grec));
+       pgr->grec_type = type;
+       pgr->grec_auxwords = 0;
+@@ -436,12 +437,17 @@ static struct sk_buff *add_grec(struct s
+       struct igmpv3_grec *pgr = NULL;
+       struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
+       int scount, stotal, first, isquery, truncate;
++      unsigned int mtu;
+       if (pmc->multiaddr == IGMP_ALL_HOSTS)
+               return skb;
+       if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
+               return skb;
++      mtu = READ_ONCE(dev->mtu);
++      if (mtu < IPV4_MIN_MTU)
++              return skb;
++
+       isquery = type == IGMPV3_MODE_IS_INCLUDE ||
+                 type == IGMPV3_MODE_IS_EXCLUDE;
+       truncate = type == IGMPV3_MODE_IS_EXCLUDE ||
+@@ -462,7 +468,7 @@ static struct sk_buff *add_grec(struct s
+                   AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
+                       if (skb)
+                               igmpv3_sendpack(skb);
+-                      skb = igmpv3_newpack(dev, dev->mtu);
++                      skb = igmpv3_newpack(dev, mtu);
+               }
+       }
+       first = 1;
+@@ -498,12 +504,12 @@ static struct sk_buff *add_grec(struct s
+                               pgr->grec_nsrcs = htons(scount);
+                       if (skb)
+                               igmpv3_sendpack(skb);
+-                      skb = igmpv3_newpack(dev, dev->mtu);
++                      skb = igmpv3_newpack(dev, mtu);
+                       first = 1;
+                       scount = 0;
+               }
+               if (first) {
+-                      skb = add_grhead(skb, pmc, type, &pgr);
++                      skb = add_grhead(skb, pmc, type, &pgr, mtu);
+                       first = 0;
+               }
+               if (!skb)
+@@ -538,7 +544,7 @@ empty_source:
+                               igmpv3_sendpack(skb);
+                               skb = NULL; /* add_grhead will get a new one */
+                       }
+-                      skb = add_grhead(skb, pmc, type, &pgr);
++                      skb = add_grhead(skb, pmc, type, &pgr, mtu);
+               }
+       }
+       if (pgr)
+--- a/net/ipv4/ip_tunnel.c
++++ b/net/ipv4/ip_tunnel.c
+@@ -349,8 +349,8 @@ static int ip_tunnel_bind_dev(struct net
+       dev->needed_headroom = t_hlen + hlen;
+       mtu -= (dev->hard_header_len + t_hlen);
+-      if (mtu < 68)
+-              mtu = 68;
++      if (mtu < IPV4_MIN_MTU)
++              mtu = IPV4_MIN_MTU;
+       return mtu;
+ }
diff --git a/queue-4.14/ipv6-honor-specified-parameters-in-fibmatch-lookup.patch b/queue-4.14/ipv6-honor-specified-parameters-in-fibmatch-lookup.patch
new file mode 100644 (file)
index 0000000..e3ce1c2
--- /dev/null
@@ -0,0 +1,96 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Ido Schimmel <idosch@mellanox.com>
+Date: Wed, 20 Dec 2017 12:28:25 +0200
+Subject: ipv6: Honor specified parameters in fibmatch lookup
+
+From: Ido Schimmel <idosch@mellanox.com>
+
+
+[ Upstream commit 58acfd714e6b02e8617448b431c2b64a2f1f0792 ]
+
+Currently, parameters such as oif and source address are not taken into
+account during fibmatch lookup. Example (IPv4 for reference) before
+patch:
+
+$ ip -4 route show
+192.0.2.0/24 dev dummy0 proto kernel scope link src 192.0.2.1
+198.51.100.0/24 dev dummy1 proto kernel scope link src 198.51.100.1
+
+$ ip -6 route show
+2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium
+2001:db8:2::/64 dev dummy1 proto kernel metric 256 pref medium
+fe80::/64 dev dummy0 proto kernel metric 256 pref medium
+fe80::/64 dev dummy1 proto kernel metric 256 pref medium
+
+$ ip -4 route get fibmatch 192.0.2.2 oif dummy0
+192.0.2.0/24 dev dummy0 proto kernel scope link src 192.0.2.1
+$ ip -4 route get fibmatch 192.0.2.2 oif dummy1
+RTNETLINK answers: No route to host
+
+$ ip -6 route get fibmatch 2001:db8:1::2 oif dummy0
+2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium
+$ ip -6 route get fibmatch 2001:db8:1::2 oif dummy1
+2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium
+
+After:
+
+$ ip -6 route get fibmatch 2001:db8:1::2 oif dummy0
+2001:db8:1::/64 dev dummy0 proto kernel metric 256 pref medium
+$ ip -6 route get fibmatch 2001:db8:1::2 oif dummy1
+RTNETLINK answers: Network is unreachable
+
+The problem stems from the fact that the necessary route lookup flags
+are not set based on these parameters.
+
+Instead of duplicating the same logic for fibmatch, we can simply
+resolve the original route from its copy and dump it instead.
+
+Fixes: 18c3a61c4264 ("net: ipv6: RTM_GETROUTE: return matched fib result when requested")
+Signed-off-by: Ido Schimmel <idosch@mellanox.com>
+Acked-by: David Ahern <dsahern@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/route.c |   19 +++++++++++--------
+ 1 file changed, 11 insertions(+), 8 deletions(-)
+
+--- a/net/ipv6/route.c
++++ b/net/ipv6/route.c
+@@ -3700,19 +3700,13 @@ static int inet6_rtm_getroute(struct sk_
+               if (!ipv6_addr_any(&fl6.saddr))
+                       flags |= RT6_LOOKUP_F_HAS_SADDR;
+-              if (!fibmatch)
+-                      dst = ip6_route_input_lookup(net, dev, &fl6, flags);
+-              else
+-                      dst = ip6_route_lookup(net, &fl6, 0);
++              dst = ip6_route_input_lookup(net, dev, &fl6, flags);
+               rcu_read_unlock();
+       } else {
+               fl6.flowi6_oif = oif;
+-              if (!fibmatch)
+-                      dst = ip6_route_output(net, NULL, &fl6);
+-              else
+-                      dst = ip6_route_lookup(net, &fl6, 0);
++              dst = ip6_route_output(net, NULL, &fl6);
+       }
+@@ -3729,6 +3723,15 @@ static int inet6_rtm_getroute(struct sk_
+               goto errout;
+       }
++      if (fibmatch && rt->dst.from) {
++              struct rt6_info *ort = container_of(rt->dst.from,
++                                                  struct rt6_info, dst);
++
++              dst_hold(&ort->dst);
++              ip6_rt_put(rt);
++              rt = ort;
++      }
++
+       skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
+       if (!skb) {
+               ip6_rt_put(rt);
diff --git a/queue-4.14/ipv6-mcast-better-catch-silly-mtu-values.patch b/queue-4.14/ipv6-mcast-better-catch-silly-mtu-values.patch
new file mode 100644 (file)
index 0000000..7fe11a1
--- /dev/null
@@ -0,0 +1,149 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 11 Dec 2017 07:03:38 -0800
+Subject: ipv6: mcast: better catch silly mtu values
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit b9b312a7a451e9c098921856e7cfbc201120e1a7 ]
+
+syzkaller reported crashes in IPv6 stack [1]
+
+Xin Long found that lo MTU was set to silly values.
+
+IPv6 stack reacts to changes to small MTU, by disabling itself under
+RTNL.
+
+But there is a window where threads not using RTNL can see a wrong
+device mtu. This can lead to surprises, in mld code where it is assumed
+the mtu is suitable.
+
+Fix this by reading device mtu once and checking IPv6 minimal MTU.
+
+[1]
+ skbuff: skb_over_panic: text:0000000010b86b8d len:196 put:20
+ head:000000003b477e60 data:000000000e85441e tail:0xd4 end:0xc0 dev:lo
+ ------------[ cut here ]------------
+ kernel BUG at net/core/skbuff.c:104!
+ invalid opcode: 0000 [#1] SMP KASAN
+ Dumping ftrace buffer:
+    (ftrace buffer empty)
+ Modules linked in:
+ CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.15.0-rc2-mm1+ #39
+ Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
+ Google 01/01/2011
+ RIP: 0010:skb_panic+0x15c/0x1f0 net/core/skbuff.c:100
+ RSP: 0018:ffff8801db307508 EFLAGS: 00010286
+ RAX: 0000000000000082 RBX: ffff8801c517e840 RCX: 0000000000000000
+ RDX: 0000000000000082 RSI: 1ffff1003b660e61 RDI: ffffed003b660e95
+ RBP: ffff8801db307570 R08: 1ffff1003b660e23 R09: 0000000000000000
+ R10: 0000000000000000 R11: 0000000000000000 R12: ffffffff85bd4020
+ R13: ffffffff84754ed2 R14: 0000000000000014 R15: ffff8801c4e26540
+ FS:  0000000000000000(0000) GS:ffff8801db300000(0000) knlGS:0000000000000000
+ CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 0000000000463610 CR3: 00000001c6698000 CR4: 00000000001406e0
+ DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+ DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+ Call Trace:
+  <IRQ>
+  skb_over_panic net/core/skbuff.c:109 [inline]
+  skb_put+0x181/0x1c0 net/core/skbuff.c:1694
+  add_grhead.isra.24+0x42/0x3b0 net/ipv6/mcast.c:1695
+  add_grec+0xa55/0x1060 net/ipv6/mcast.c:1817
+  mld_send_cr net/ipv6/mcast.c:1903 [inline]
+  mld_ifc_timer_expire+0x4d2/0x770 net/ipv6/mcast.c:2448
+  call_timer_fn+0x23b/0x840 kernel/time/timer.c:1320
+  expire_timers kernel/time/timer.c:1357 [inline]
+  __run_timers+0x7e1/0xb60 kernel/time/timer.c:1660
+  run_timer_softirq+0x4c/0xb0 kernel/time/timer.c:1686
+  __do_softirq+0x29d/0xbb2 kernel/softirq.c:285
+  invoke_softirq kernel/softirq.c:365 [inline]
+  irq_exit+0x1d3/0x210 kernel/softirq.c:405
+  exiting_irq arch/x86/include/asm/apic.h:540 [inline]
+  smp_apic_timer_interrupt+0x16b/0x700 arch/x86/kernel/apic/apic.c:1052
+  apic_timer_interrupt+0xa9/0xb0 arch/x86/entry/entry_64.S:920
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Tested-by: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/mcast.c |   25 +++++++++++++++----------
+ 1 file changed, 15 insertions(+), 10 deletions(-)
+
+--- a/net/ipv6/mcast.c
++++ b/net/ipv6/mcast.c
+@@ -1682,16 +1682,16 @@ static int grec_size(struct ifmcaddr6 *p
+ }
+ static struct sk_buff *add_grhead(struct sk_buff *skb, struct ifmcaddr6 *pmc,
+-      int type, struct mld2_grec **ppgr)
++      int type, struct mld2_grec **ppgr, unsigned int mtu)
+ {
+-      struct net_device *dev = pmc->idev->dev;
+       struct mld2_report *pmr;
+       struct mld2_grec *pgr;
+-      if (!skb)
+-              skb = mld_newpack(pmc->idev, dev->mtu);
+-      if (!skb)
+-              return NULL;
++      if (!skb) {
++              skb = mld_newpack(pmc->idev, mtu);
++              if (!skb)
++                      return NULL;
++      }
+       pgr = skb_put(skb, sizeof(struct mld2_grec));
+       pgr->grec_type = type;
+       pgr->grec_auxwords = 0;
+@@ -1714,10 +1714,15 @@ static struct sk_buff *add_grec(struct s
+       struct mld2_grec *pgr = NULL;
+       struct ip6_sf_list *psf, *psf_next, *psf_prev, **psf_list;
+       int scount, stotal, first, isquery, truncate;
++      unsigned int mtu;
+       if (pmc->mca_flags & MAF_NOREPORT)
+               return skb;
++      mtu = READ_ONCE(dev->mtu);
++      if (mtu < IPV6_MIN_MTU)
++              return skb;
++
+       isquery = type == MLD2_MODE_IS_INCLUDE ||
+                 type == MLD2_MODE_IS_EXCLUDE;
+       truncate = type == MLD2_MODE_IS_EXCLUDE ||
+@@ -1738,7 +1743,7 @@ static struct sk_buff *add_grec(struct s
+                   AVAILABLE(skb) < grec_size(pmc, type, gdeleted, sdeleted)) {
+                       if (skb)
+                               mld_sendpack(skb);
+-                      skb = mld_newpack(idev, dev->mtu);
++                      skb = mld_newpack(idev, mtu);
+               }
+       }
+       first = 1;
+@@ -1774,12 +1779,12 @@ static struct sk_buff *add_grec(struct s
+                               pgr->grec_nsrcs = htons(scount);
+                       if (skb)
+                               mld_sendpack(skb);
+-                      skb = mld_newpack(idev, dev->mtu);
++                      skb = mld_newpack(idev, mtu);
+                       first = 1;
+                       scount = 0;
+               }
+               if (first) {
+-                      skb = add_grhead(skb, pmc, type, &pgr);
++                      skb = add_grhead(skb, pmc, type, &pgr, mtu);
+                       first = 0;
+               }
+               if (!skb)
+@@ -1814,7 +1819,7 @@ empty_source:
+                               mld_sendpack(skb);
+                               skb = NULL; /* add_grhead will get a new one */
+                       }
+-                      skb = add_grhead(skb, pmc, type, &pgr);
++                      skb = add_grhead(skb, pmc, type, &pgr, mtu);
+               }
+       }
+       if (pgr)
diff --git a/queue-4.14/ipv6-set-all.accept_dad-to-0-by-default.patch b/queue-4.14/ipv6-set-all.accept_dad-to-0-by-default.patch
new file mode 100644 (file)
index 0000000..a75bfb7
--- /dev/null
@@ -0,0 +1,67 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
+Date: Tue, 14 Nov 2017 14:21:32 +0100
+Subject: ipv6: set all.accept_dad to 0 by default
+
+From: Nicolas Dichtel <nicolas.dichtel@6wind.com>
+
+
+[ Upstream commit 094009531612246d9e13f9e0c3ae2205d7f63a0a ]
+
+With commits 35e015e1f577 and a2d3f3e33853, the global 'accept_dad' flag
+is also taken into account (default value is 1). If either global or
+per-interface flag is non-zero, DAD will be enabled on a given interface.
+
+This is not backward compatible: before those patches, the user could
+disable DAD just by setting the per-interface flag to 0. Now, the
+user instead needs to set both flags to 0 to actually disable DAD.
+
+Restore the previous behaviour by setting the default for the global
+'accept_dad' flag to 0. This way, DAD is still enabled by default,
+as per-interface flags are set to 1 on device creation, but setting
+them to 0 is enough to disable DAD on a given interface.
+
+- Before 35e015e1f57a7 and a2d3f3e33853:
+          global    per-interface    DAD enabled
+[default]   1             1              yes
+            X             0              no
+            X             1              yes
+
+- After 35e015e1f577 and a2d3f3e33853:
+          global    per-interface    DAD enabled
+[default]   1             1              yes
+            0             0              no
+            0             1              yes
+            1             0              yes
+
+- After this fix:
+          global    per-interface    DAD enabled
+            1             1              yes
+            0             0              no
+[default]   0             1              yes
+            1             0              yes
+
+Fixes: 35e015e1f577 ("ipv6: fix net.ipv6.conf.all interface DAD handlers")
+Fixes: a2d3f3e33853 ("ipv6: fix net.ipv6.conf.all.accept_dad behaviour for real")
+CC: Stefano Brivio <sbrivio@redhat.com>
+CC: Matteo Croce <mcroce@redhat.com>
+CC: Erik Kline <ek@google.com>
+Signed-off-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
+Acked-by: Stefano Brivio <sbrivio@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv6/addrconf.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/ipv6/addrconf.c
++++ b/net/ipv6/addrconf.c
+@@ -231,7 +231,7 @@ static struct ipv6_devconf ipv6_devconf
+       .proxy_ndp              = 0,
+       .accept_source_route    = 0,    /* we do not accept RH0 by default. */
+       .disable_ipv6           = 0,
+-      .accept_dad             = 1,
++      .accept_dad             = 0,
+       .suppress_frag_ndisc    = 1,
+       .accept_ra_mtu          = 1,
+       .stable_secret          = {
diff --git a/queue-4.14/mlxsw-spectrum-disable-mac-learning-for-ovs-port.patch b/queue-4.14/mlxsw-spectrum-disable-mac-learning-for-ovs-port.patch
new file mode 100644 (file)
index 0000000..56dc48f
--- /dev/null
@@ -0,0 +1,68 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Yuval Mintz <yuvalm@mellanox.com>
+Date: Fri, 15 Dec 2017 08:44:21 +0100
+Subject: mlxsw: spectrum: Disable MAC learning for ovs port
+
+From: Yuval Mintz <yuvalm@mellanox.com>
+
+
+[ Upstream commit fccff0862838908d21eaf956d57e09c6c189f7c5 ]
+
+Learning is currently enabled for ports which are OVS slaves -
+even though OVS doesn't need this indication.
+Since we're not associating a fid with the port, HW would continuously
+notify driver of learned [& aged] MACs which would be logged as errors.
+
+Fixes: 2b94e58df58c ("mlxsw: spectrum: Allow ports to work under OVS master")
+Signed-off-by: Yuval Mintz <yuvalm@mellanox.com>
+Reviewed-by: Ido Schimmel <idosch@mellanox.com>
+Signed-off-by: Jiri Pirko <jiri@mellanox.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlxsw/spectrum.c |   18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
++++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c
+@@ -4164,6 +4164,7 @@ static int mlxsw_sp_port_stp_set(struct
+ static int mlxsw_sp_port_ovs_join(struct mlxsw_sp_port *mlxsw_sp_port)
+ {
++      u16 vid = 1;
+       int err;
+       err = mlxsw_sp_port_vp_mode_set(mlxsw_sp_port, true);
+@@ -4176,8 +4177,19 @@ static int mlxsw_sp_port_ovs_join(struct
+                                    true, false);
+       if (err)
+               goto err_port_vlan_set;
++
++      for (; vid <= VLAN_N_VID - 1; vid++) {
++              err = mlxsw_sp_port_vid_learning_set(mlxsw_sp_port,
++                                                   vid, false);
++              if (err)
++                      goto err_vid_learning_set;
++      }
++
+       return 0;
++err_vid_learning_set:
++      for (vid--; vid >= 1; vid--)
++              mlxsw_sp_port_vid_learning_set(mlxsw_sp_port, vid, true);
+ err_port_vlan_set:
+       mlxsw_sp_port_stp_set(mlxsw_sp_port, false);
+ err_port_stp_set:
+@@ -4187,6 +4199,12 @@ err_port_stp_set:
+ static void mlxsw_sp_port_ovs_leave(struct mlxsw_sp_port *mlxsw_sp_port)
+ {
++      u16 vid;
++
++      for (vid = VLAN_N_VID - 1; vid >= 1; vid--)
++              mlxsw_sp_port_vid_learning_set(mlxsw_sp_port,
++                                             vid, true);
++
+       mlxsw_sp_port_vlan_set(mlxsw_sp_port, 2, VLAN_N_VID - 1,
+                              false, false);
+       mlxsw_sp_port_stp_set(mlxsw_sp_port, false);
diff --git a/queue-4.14/net-bridge-fix-early-call-to-br_stp_change_bridge_id-and-plug-newlink-leaks.patch b/queue-4.14/net-bridge-fix-early-call-to-br_stp_change_bridge_id-and-plug-newlink-leaks.patch
new file mode 100644 (file)
index 0000000..2c13326
--- /dev/null
@@ -0,0 +1,94 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
+Date: Mon, 18 Dec 2017 17:35:09 +0200
+Subject: net: bridge: fix early call to br_stp_change_bridge_id and plug newlink leaks
+
+From: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
+
+
+[ Upstream commit 84aeb437ab98a2bce3d4b2111c79723aedfceb33 ]
+
+The early call to br_stp_change_bridge_id in bridge's newlink can cause
+a memory leak if an error occurs during the newlink because the fdb
+entries are not cleaned up if a different lladdr was specified, also
+another minor issue is that it generates fdb notifications with
+ifindex = 0. Another unrelated memory leak is the bridge sysfs entries
+which get added on NETDEV_REGISTER event, but are not cleaned up in the
+newlink error path. To remove this special case the call to
+br_stp_change_bridge_id is done after netdev register and we clean up the
+bridge on changelink error via br_dev_delete to plug all leaks.
+
+This patch makes netlink bridge destruction on newlink error the same as
+dellink and ioctl del which is necessary since at that point we have a
+fully initialized bridge device.
+
+To reproduce the issue:
+$ ip l add br0 address 00:11:22:33:44:55 type bridge group_fwd_mask 1
+RTNETLINK answers: Invalid argument
+
+$ rmmod bridge
+[ 1822.142525] =============================================================================
+[ 1822.143640] BUG bridge_fdb_cache (Tainted: G           O    ): Objects remaining in bridge_fdb_cache on __kmem_cache_shutdown()
+[ 1822.144821] -----------------------------------------------------------------------------
+
+[ 1822.145990] Disabling lock debugging due to kernel taint
+[ 1822.146732] INFO: Slab 0x0000000092a844b2 objects=32 used=2 fp=0x00000000fef011b0 flags=0x1ffff8000000100
+[ 1822.147700] CPU: 2 PID: 13584 Comm: rmmod Tainted: G    B      O     4.15.0-rc2+ #87
+[ 1822.148578] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.7.5-20140531_083030-gandalf 04/01/2014
+[ 1822.150008] Call Trace:
+[ 1822.150510]  dump_stack+0x78/0xa9
+[ 1822.151156]  slab_err+0xb1/0xd3
+[ 1822.151834]  ? __kmalloc+0x1bb/0x1ce
+[ 1822.152546]  __kmem_cache_shutdown+0x151/0x28b
+[ 1822.153395]  shutdown_cache+0x13/0x144
+[ 1822.154126]  kmem_cache_destroy+0x1c0/0x1fb
+[ 1822.154669]  SyS_delete_module+0x194/0x244
+[ 1822.155199]  ? trace_hardirqs_on_thunk+0x1a/0x1c
+[ 1822.155773]  entry_SYSCALL_64_fastpath+0x23/0x9a
+[ 1822.156343] RIP: 0033:0x7f929bd38b17
+[ 1822.156859] RSP: 002b:00007ffd160e9a98 EFLAGS: 00000202 ORIG_RAX: 00000000000000b0
+[ 1822.157728] RAX: ffffffffffffffda RBX: 00005578316ba090 RCX: 00007f929bd38b17
+[ 1822.158422] RDX: 00007f929bd9ec60 RSI: 0000000000000800 RDI: 00005578316ba0f0
+[ 1822.159114] RBP: 0000000000000003 R08: 00007f929bff5f20 R09: 00007ffd160e8a11
+[ 1822.159808] R10: 00007ffd160e9860 R11: 0000000000000202 R12: 00007ffd160e8a80
+[ 1822.160513] R13: 0000000000000000 R14: 0000000000000000 R15: 00005578316ba090
+[ 1822.161278] INFO: Object 0x000000007645de29 @offset=0
+[ 1822.161666] INFO: Object 0x00000000d5df2ab5 @offset=128
+
+Fixes: 30313a3d5794 ("bridge: Handle IFLA_ADDRESS correctly when creating bridge device")
+Fixes: 5b8d5429daa0 ("bridge: netlink: register netdevice before executing changelink")
+Signed-off-by: Nikolay Aleksandrov <nikolay@cumulusnetworks.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/bridge/br_netlink.c |   11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+--- a/net/bridge/br_netlink.c
++++ b/net/bridge/br_netlink.c
+@@ -1223,19 +1223,20 @@ static int br_dev_newlink(struct net *sr
+       struct net_bridge *br = netdev_priv(dev);
+       int err;
++      err = register_netdevice(dev);
++      if (err)
++              return err;
++
+       if (tb[IFLA_ADDRESS]) {
+               spin_lock_bh(&br->lock);
+               br_stp_change_bridge_id(br, nla_data(tb[IFLA_ADDRESS]));
+               spin_unlock_bh(&br->lock);
+       }
+-      err = register_netdevice(dev);
+-      if (err)
+-              return err;
+-
+       err = br_changelink(dev, tb, data, extack);
+       if (err)
+-              unregister_netdevice(dev);
++              br_dev_delete(dev, NULL);
++
+       return err;
+ }
diff --git a/queue-4.14/net-dsa-bcm_sf2-clear-iddq_global_pwr-bit-for-phy.patch b/queue-4.14/net-dsa-bcm_sf2-clear-iddq_global_pwr-bit-for-phy.patch
new file mode 100644 (file)
index 0000000..ec80528
--- /dev/null
@@ -0,0 +1,33 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Florian Fainelli <f.fainelli@gmail.com>
+Date: Tue, 21 Nov 2017 17:37:46 -0800
+Subject: net: dsa: bcm_sf2: Clear IDDQ_GLOBAL_PWR bit for PHY
+
+From: Florian Fainelli <f.fainelli@gmail.com>
+
+
+[ Upstream commit 4b52d010113e11006a389f2a8315167ede9e0b10 ]
+
+The PHY on BCM7278 has an additional bit that needs to be cleared:
+IDDQ_GLOBAL_PWR. Without doing this, the PHY remains stuck in reset when
+coming out of suspend/resume cycles.
+
+Fixes: 0fe9933804eb ("net: dsa: bcm_sf2: Add support for BCM7278 integrated switch")
+Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/dsa/bcm_sf2.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/dsa/bcm_sf2.c
++++ b/drivers/net/dsa/bcm_sf2.c
+@@ -167,7 +167,7 @@ static void bcm_sf2_gphy_enable_set(stru
+       reg = reg_readl(priv, REG_SPHY_CNTRL);
+       if (enable) {
+               reg |= PHY_RESET;
+-              reg &= ~(EXT_PWR_DOWN | IDDQ_BIAS | CK25_DIS);
++              reg &= ~(EXT_PWR_DOWN | IDDQ_BIAS | IDDQ_GLOBAL_PWR | CK25_DIS);
+               reg_writel(priv, reg, REG_SPHY_CNTRL);
+               udelay(21);
+               reg = reg_readl(priv, REG_SPHY_CNTRL);
diff --git a/queue-4.14/net-fec-unmap-the-xmit-buffer-that-are-not-transferred-by-dma.patch b/queue-4.14/net-fec-unmap-the-xmit-buffer-that-are-not-transferred-by-dma.patch
new file mode 100644 (file)
index 0000000..e48efe6
--- /dev/null
@@ -0,0 +1,45 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Fugang Duan <fugang.duan@nxp.com>
+Date: Fri, 22 Dec 2017 17:12:09 +0800
+Subject: net: fec: unmap the xmit buffer that are not transferred by DMA
+
+From: Fugang Duan <fugang.duan@nxp.com>
+
+
+[ Upstream commit 178e5f57a8d8f8fc5799a624b96fc31ef9a29ffa ]
+
+The enet IP only supports 32-bit DMA addresses, so it uses a swiotlb buffer
+for DMA mapping when the xmit buffer's DMA address is above 4G on i.MX
+platforms. After a suspend/resume stress test, it prints out:
+
+log:
+[12826.352864] fec 5b040000.ethernet: swiotlb buffer is full (sz: 191 bytes)
+[12826.359676] DMA: Out of SW-IOMMU space for 191 bytes at device 5b040000.ethernet
+[12826.367110] fec 5b040000.ethernet eth0: Tx DMA memory map failed
+
+The issue is that xmit buffers that are ready and DMA mapped, but not yet
+copied into the FIFO by DMA, are not unmapped when the MAC restarts.
+So check for such DMA-mapped buffers and unmap them.
+
+Signed-off-by: Fugang Duan <fugang.duan@nxp.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/freescale/fec_main.c |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/drivers/net/ethernet/freescale/fec_main.c
++++ b/drivers/net/ethernet/freescale/fec_main.c
+@@ -818,6 +818,12 @@ static void fec_enet_bd_init(struct net_
+               for (i = 0; i < txq->bd.ring_size; i++) {
+                       /* Initialize the BD for every fragment in the page. */
+                       bdp->cbd_sc = cpu_to_fec16(0);
++                      if (bdp->cbd_bufaddr &&
++                          !IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr)))
++                              dma_unmap_single(&fep->pdev->dev,
++                                               fec32_to_cpu(bdp->cbd_bufaddr),
++                                               fec16_to_cpu(bdp->cbd_datlen),
++                                               DMA_TO_DEVICE);
+                       if (txq->tx_skbuff[i]) {
+                               dev_kfree_skb_any(txq->tx_skbuff[i]);
+                               txq->tx_skbuff[i] = NULL;
diff --git a/queue-4.14/net-fix-double-free-and-memory-corruption-in-get_net_ns_by_id.patch b/queue-4.14/net-fix-double-free-and-memory-corruption-in-get_net_ns_by_id.patch
new file mode 100644 (file)
index 0000000..e967ea2
--- /dev/null
@@ -0,0 +1,100 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+Date: Tue, 19 Dec 2017 11:27:56 -0600
+Subject: net: Fix double free and memory corruption in get_net_ns_by_id()
+
+From: "Eric W. Biederman" <ebiederm@xmission.com>
+
+
+[ Upstream commit 21b5944350052d2583e82dd59b19a9ba94a007f0 ]
+
+(I can trivially verify that that idr_remove in cleanup_net happens
+ after the network namespace count has dropped to zero --EWB)
+
+Function get_net_ns_by_id() does not check for net::count
+after it has found a peer in netns_ids idr.
+
+It may dereference a peer, after its count has already been
+finally decremented. This leads to double free and memory
+corruption:
+
+put_net(peer)                                   rtnl_lock()
+atomic_dec_and_test(&peer->count) [count=0]     ...
+__put_net(peer)                                 get_net_ns_by_id(net, id)
+  spin_lock(&cleanup_list_lock)
+  list_add(&net->cleanup_list, &cleanup_list)
+  spin_unlock(&cleanup_list_lock)
+queue_work()                                      peer = idr_find(&net->netns_ids, id)
+  |                                               get_net(peer) [count=1]
+  |                                               ...
+  |                                               (use after final put)
+  v                                               ...
+  cleanup_net()                                   ...
+    spin_lock(&cleanup_list_lock)                 ...
+    list_replace_init(&cleanup_list, ..)          ...
+    spin_unlock(&cleanup_list_lock)               ...
+    ...                                           ...
+    ...                                           put_net(peer)
+    ...                                             atomic_dec_and_test(&peer->count) [count=0]
+    ...                                               spin_lock(&cleanup_list_lock)
+    ...                                               list_add(&net->cleanup_list, &cleanup_list)
+    ...                                               spin_unlock(&cleanup_list_lock)
+    ...                                             queue_work()
+    ...                                           rtnl_unlock()
+    rtnl_lock()                                   ...
+    for_each_net(tmp) {                           ...
+      id = __peernet2id(tmp, peer)                ...
+      spin_lock_irq(&tmp->nsid_lock)              ...
+      idr_remove(&tmp->netns_ids, id)             ...
+      ...                                         ...
+      net_drop_ns()                               ...
+       net_free(peer)                            ...
+    }                                             ...
+  |
+  v
+  cleanup_net()
+    ...
+    (Second free of peer)
+
+Also, put_net() on the right cpu may reorder with the left cpu's
+list_replace_init(&cleanup_list, ..), and then cleanup_list
+will be corrupted.
+
+Since cleanup_net() is executed in worker thread, while
+put_net(peer) can happen everywhere, there should be
+enough time for concurrent get_net_ns_by_id() to pick
+the peer up, and the race does not seem to be unlikely.
+The patch fixes the problem in the standard way.
+
+(Also, there is a possible problem in peernet2id_alloc(), which requires a
+check for net::count under nsid_lock and maybe_get_net(peer), but
+in the current stable kernel it's used under rtnl_lock() and it has to be
+safe. Openvswitch has begun to use peernet2id_alloc(), and possibly it should
+be fixed too. This is not in the stable kernel yet, so I'll send
+a separate message to netdev@ later).
+
+Cc: Nicolas Dichtel <nicolas.dichtel@6wind.com>
+Signed-off-by: Kirill Tkhai <ktkhai@virtuozzo.com>
+Fixes: 0c7aecd4bde4 "netns: add rtnl cmd to add and get peer netns ids"
+Reviewed-by: Andrey Ryabinin <aryabinin@virtuozzo.com>
+Reviewed-by: "Eric W. Biederman" <ebiederm@xmission.com>
+Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Nicolas Dichtel <nicolas.dichtel@6wind.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/net_namespace.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/core/net_namespace.c
++++ b/net/core/net_namespace.c
+@@ -266,7 +266,7 @@ struct net *get_net_ns_by_id(struct net
+       spin_lock_bh(&net->nsid_lock);
+       peer = idr_find(&net->netns_ids, id);
+       if (peer)
+-              get_net(peer);
++              peer = maybe_get_net(peer);
+       spin_unlock_bh(&net->nsid_lock);
+       rcu_read_unlock();
diff --git a/queue-4.14/net-igmp-use-correct-source-address-on-igmpv3-reports.patch b/queue-4.14/net-igmp-use-correct-source-address-on-igmpv3-reports.patch
new file mode 100644 (file)
index 0000000..5a416cb
--- /dev/null
@@ -0,0 +1,88 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Kevin Cernekee <cernekee@chromium.org>
+Date: Mon, 11 Dec 2017 11:13:45 -0800
+Subject: net: igmp: Use correct source address on IGMPv3 reports
+
+From: Kevin Cernekee <cernekee@chromium.org>
+
+
+[ Upstream commit a46182b00290839fa3fa159d54fd3237bd8669f0 ]
+
+Closing a multicast socket after the final IPv4 address is deleted
+from an interface can generate a membership report that uses the
+source IP from a different interface.  The following test script, run
+from an isolated netns, reproduces the issue:
+
+    #!/bin/bash
+
+    ip link add dummy0 type dummy
+    ip link add dummy1 type dummy
+    ip link set dummy0 up
+    ip link set dummy1 up
+    ip addr add 10.1.1.1/24 dev dummy0
+    ip addr add 192.168.99.99/24 dev dummy1
+
+    tcpdump -U -i dummy0 &
+    socat EXEC:"sleep 2" \
+        UDP4-DATAGRAM:239.101.1.68:8889,ip-add-membership=239.0.1.68:10.1.1.1 &
+
+    sleep 1
+    ip addr del 10.1.1.1/24 dev dummy0
+    sleep 5
+    kill %tcpdump
+
+RFC 3376 specifies that the report must be sent with a valid IP source
+address from the destination subnet, or from address 0.0.0.0.  Add an
+extra check to make sure this is the case.
+
+Signed-off-by: Kevin Cernekee <cernekee@chromium.org>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/igmp.c |   20 +++++++++++++++++++-
+ 1 file changed, 19 insertions(+), 1 deletion(-)
+
+--- a/net/ipv4/igmp.c
++++ b/net/ipv4/igmp.c
+@@ -89,6 +89,7 @@
+ #include <linux/rtnetlink.h>
+ #include <linux/times.h>
+ #include <linux/pkt_sched.h>
++#include <linux/byteorder/generic.h>
+ #include <net/net_namespace.h>
+ #include <net/arp.h>
+@@ -321,6 +322,23 @@ igmp_scount(struct ip_mc_list *pmc, int
+       return scount;
+ }
++/* source address selection per RFC 3376 section 4.2.13 */
++static __be32 igmpv3_get_srcaddr(struct net_device *dev,
++                               const struct flowi4 *fl4)
++{
++      struct in_device *in_dev = __in_dev_get_rcu(dev);
++
++      if (!in_dev)
++              return htonl(INADDR_ANY);
++
++      for_ifa(in_dev) {
++              if (inet_ifa_match(fl4->saddr, ifa))
++                      return fl4->saddr;
++      } endfor_ifa(in_dev);
++
++      return htonl(INADDR_ANY);
++}
++
+ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
+ {
+       struct sk_buff *skb;
+@@ -368,7 +386,7 @@ static struct sk_buff *igmpv3_newpack(st
+       pip->frag_off = htons(IP_DF);
+       pip->ttl      = 1;
+       pip->daddr    = fl4.daddr;
+-      pip->saddr    = fl4.saddr;
++      pip->saddr    = igmpv3_get_srcaddr(dev, &fl4);
+       pip->protocol = IPPROTO_IGMP;
+       pip->tot_len  = 0;      /* filled in later */
+       ip_select_ident(net, skb, NULL);
diff --git a/queue-4.14/net-ipv4-fix-for-a-race-condition-in-raw_sendmsg.patch b/queue-4.14/net-ipv4-fix-for-a-race-condition-in-raw_sendmsg.patch
new file mode 100644 (file)
index 0000000..4d7bd27
--- /dev/null
@@ -0,0 +1,75 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Mohamed Ghannam <simo.ghannam@gmail.com>
+Date: Sun, 10 Dec 2017 03:50:58 +0000
+Subject: net: ipv4: fix for a race condition in raw_sendmsg
+
+From: Mohamed Ghannam <simo.ghannam@gmail.com>
+
+
+[ Upstream commit 8f659a03a0ba9289b9aeb9b4470e6fb263d6f483 ]
+
+inet->hdrincl is racy, and could lead to uninitialized stack pointer
+usage, so its value should be read only once.
+
+Fixes: c008ba5bdc9f ("ipv4: Avoid reading user iov twice after raw_probe_proto_opt")
+Signed-off-by: Mohamed Ghannam <simo.ghannam@gmail.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/raw.c |   15 ++++++++++-----
+ 1 file changed, 10 insertions(+), 5 deletions(-)
+
+--- a/net/ipv4/raw.c
++++ b/net/ipv4/raw.c
+@@ -513,11 +513,16 @@ static int raw_sendmsg(struct sock *sk,
+       int err;
+       struct ip_options_data opt_copy;
+       struct raw_frag_vec rfv;
++      int hdrincl;
+       err = -EMSGSIZE;
+       if (len > 0xFFFF)
+               goto out;
++      /* hdrincl should be READ_ONCE(inet->hdrincl)
++       * but READ_ONCE() doesn't work with bit fields
++       */
++      hdrincl = inet->hdrincl;
+       /*
+        *      Check the flags.
+        */
+@@ -593,7 +598,7 @@ static int raw_sendmsg(struct sock *sk,
+               /* Linux does not mangle headers on raw sockets,
+                * so that IP options + IP_HDRINCL is non-sense.
+                */
+-              if (inet->hdrincl)
++              if (hdrincl)
+                       goto done;
+               if (ipc.opt->opt.srr) {
+                       if (!daddr)
+@@ -615,12 +620,12 @@ static int raw_sendmsg(struct sock *sk,
+       flowi4_init_output(&fl4, ipc.oif, sk->sk_mark, tos,
+                          RT_SCOPE_UNIVERSE,
+-                         inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
++                         hdrincl ? IPPROTO_RAW : sk->sk_protocol,
+                          inet_sk_flowi_flags(sk) |
+-                          (inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
++                          (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
+                          daddr, saddr, 0, 0, sk->sk_uid);
+-      if (!inet->hdrincl) {
++      if (!hdrincl) {
+               rfv.msg = msg;
+               rfv.hlen = 0;
+@@ -645,7 +650,7 @@ static int raw_sendmsg(struct sock *sk,
+               goto do_confirm;
+ back_from_confirm:
+-      if (inet->hdrincl)
++      if (hdrincl)
+               err = raw_send_hdrinc(sk, &fl4, msg, len,
+                                     &rt, msg->msg_flags, &ipc.sockc);
diff --git a/queue-4.14/net-mlx5-fix-error-flow-in-create_qp-command.patch b/queue-4.14/net-mlx5-fix-error-flow-in-create_qp-command.patch
new file mode 100644 (file)
index 0000000..5c24524
--- /dev/null
@@ -0,0 +1,35 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Moni Shoua <monis@mellanox.com>
+Date: Mon, 4 Dec 2017 08:59:25 +0200
+Subject: net/mlx5: Fix error flow in CREATE_QP command
+
+From: Moni Shoua <monis@mellanox.com>
+
+
+[ Upstream commit dbff26e44dc3ec4de6578733b054a0114652a764 ]
+
+In error flow, when DESTROY_QP command should be executed, the wrong
+mailbox was set with data, not the one that is written to hardware.
+Fix that.
+
+Fixes: 09a7d9eca1a6 '{net,IB}/mlx5: QP/XRCD commands via mlx5 ifc'
+Signed-off-by: Moni Shoua <monis@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/qp.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c
+@@ -213,8 +213,8 @@ int mlx5_core_create_qp(struct mlx5_core
+ err_cmd:
+       memset(din, 0, sizeof(din));
+       memset(dout, 0, sizeof(dout));
+-      MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
+-      MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
++      MLX5_SET(destroy_qp_in, din, opcode, MLX5_CMD_OP_DESTROY_QP);
++      MLX5_SET(destroy_qp_in, din, qpn, qp->qpn);
+       mlx5_cmd_exec(dev, din, sizeof(din), dout, sizeof(dout));
+       return err;
+ }
diff --git a/queue-4.14/net-mlx5-fix-rate-limit-packet-pacing-naming-and-struct.patch b/queue-4.14/net-mlx5-fix-rate-limit-packet-pacing-naming-and-struct.patch
new file mode 100644 (file)
index 0000000..31a9e52
--- /dev/null
@@ -0,0 +1,141 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Eran Ben Elisha <eranbe@mellanox.com>
+Date: Mon, 13 Nov 2017 10:11:27 +0200
+Subject: net/mlx5: Fix rate limit packet pacing naming and struct
+
+From: Eran Ben Elisha <eranbe@mellanox.com>
+
+
+[ Upstream commit 37e92a9d4fe38dc3e7308913575983a6a088c8d4 ]
+
+In mlx5_ifc, struct size was not complete, and thus driver was sending
+garbage after the last defined field. Fixed it by adding reserved field
+to complete the struct size.
+
+In addition, rename all set_rate_limit to set_pp_rate_limit to be
+compliant with the Firmware <-> Driver definition.
+
+Fixes: 7486216b3a0b ("{net,IB}/mlx5: mlx5_ifc updates")
+Fixes: 1466cc5b23d1 ("net/mlx5: Rate limit tables support")
+Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/cmd.c |    4 ++--
+ drivers/net/ethernet/mellanox/mlx5/core/rl.c  |   22 +++++++++++-----------
+ include/linux/mlx5/mlx5_ifc.h                 |    8 +++++---
+ 3 files changed, 18 insertions(+), 16 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+@@ -362,7 +362,7 @@ static int mlx5_internal_err_ret_value(s
+       case MLX5_CMD_OP_QUERY_VPORT_COUNTER:
+       case MLX5_CMD_OP_ALLOC_Q_COUNTER:
+       case MLX5_CMD_OP_QUERY_Q_COUNTER:
+-      case MLX5_CMD_OP_SET_RATE_LIMIT:
++      case MLX5_CMD_OP_SET_PP_RATE_LIMIT:
+       case MLX5_CMD_OP_QUERY_RATE_LIMIT:
+       case MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT:
+       case MLX5_CMD_OP_QUERY_SCHEDULING_ELEMENT:
+@@ -505,7 +505,7 @@ const char *mlx5_command_str(int command
+       MLX5_COMMAND_STR_CASE(ALLOC_Q_COUNTER);
+       MLX5_COMMAND_STR_CASE(DEALLOC_Q_COUNTER);
+       MLX5_COMMAND_STR_CASE(QUERY_Q_COUNTER);
+-      MLX5_COMMAND_STR_CASE(SET_RATE_LIMIT);
++      MLX5_COMMAND_STR_CASE(SET_PP_RATE_LIMIT);
+       MLX5_COMMAND_STR_CASE(QUERY_RATE_LIMIT);
+       MLX5_COMMAND_STR_CASE(CREATE_SCHEDULING_ELEMENT);
+       MLX5_COMMAND_STR_CASE(DESTROY_SCHEDULING_ELEMENT);
+--- a/drivers/net/ethernet/mellanox/mlx5/core/rl.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/rl.c
+@@ -125,16 +125,16 @@ static struct mlx5_rl_entry *find_rl_ent
+       return ret_entry;
+ }
+-static int mlx5_set_rate_limit_cmd(struct mlx5_core_dev *dev,
++static int mlx5_set_pp_rate_limit_cmd(struct mlx5_core_dev *dev,
+                                  u32 rate, u16 index)
+ {
+-      u32 in[MLX5_ST_SZ_DW(set_rate_limit_in)]   = {0};
+-      u32 out[MLX5_ST_SZ_DW(set_rate_limit_out)] = {0};
++      u32 in[MLX5_ST_SZ_DW(set_pp_rate_limit_in)]   = {0};
++      u32 out[MLX5_ST_SZ_DW(set_pp_rate_limit_out)] = {0};
+-      MLX5_SET(set_rate_limit_in, in, opcode,
+-               MLX5_CMD_OP_SET_RATE_LIMIT);
+-      MLX5_SET(set_rate_limit_in, in, rate_limit_index, index);
+-      MLX5_SET(set_rate_limit_in, in, rate_limit, rate);
++      MLX5_SET(set_pp_rate_limit_in, in, opcode,
++               MLX5_CMD_OP_SET_PP_RATE_LIMIT);
++      MLX5_SET(set_pp_rate_limit_in, in, rate_limit_index, index);
++      MLX5_SET(set_pp_rate_limit_in, in, rate_limit, rate);
+       return mlx5_cmd_exec(dev, in, sizeof(in), out, sizeof(out));
+ }
+@@ -173,7 +173,7 @@ int mlx5_rl_add_rate(struct mlx5_core_de
+               entry->refcount++;
+       } else {
+               /* new rate limit */
+-              err = mlx5_set_rate_limit_cmd(dev, rate, entry->index);
++              err = mlx5_set_pp_rate_limit_cmd(dev, rate, entry->index);
+               if (err) {
+                       mlx5_core_err(dev, "Failed configuring rate: %u (%d)\n",
+                                     rate, err);
+@@ -209,7 +209,7 @@ void mlx5_rl_remove_rate(struct mlx5_cor
+       entry->refcount--;
+       if (!entry->refcount) {
+               /* need to remove rate */
+-              mlx5_set_rate_limit_cmd(dev, 0, entry->index);
++              mlx5_set_pp_rate_limit_cmd(dev, 0, entry->index);
+               entry->rate = 0;
+       }
+@@ -262,8 +262,8 @@ void mlx5_cleanup_rl_table(struct mlx5_c
+       /* Clear all configured rates */
+       for (i = 0; i < table->max_size; i++)
+               if (table->rl_entry[i].rate)
+-                      mlx5_set_rate_limit_cmd(dev, 0,
+-                                              table->rl_entry[i].index);
++                      mlx5_set_pp_rate_limit_cmd(dev, 0,
++                                                 table->rl_entry[i].index);
+       kfree(dev->priv.rl_table.rl_entry);
+ }
+--- a/include/linux/mlx5/mlx5_ifc.h
++++ b/include/linux/mlx5/mlx5_ifc.h
+@@ -147,7 +147,7 @@ enum {
+       MLX5_CMD_OP_ALLOC_Q_COUNTER               = 0x771,
+       MLX5_CMD_OP_DEALLOC_Q_COUNTER             = 0x772,
+       MLX5_CMD_OP_QUERY_Q_COUNTER               = 0x773,
+-      MLX5_CMD_OP_SET_RATE_LIMIT                = 0x780,
++      MLX5_CMD_OP_SET_PP_RATE_LIMIT             = 0x780,
+       MLX5_CMD_OP_QUERY_RATE_LIMIT              = 0x781,
+       MLX5_CMD_OP_CREATE_SCHEDULING_ELEMENT      = 0x782,
+       MLX5_CMD_OP_DESTROY_SCHEDULING_ELEMENT     = 0x783,
+@@ -7233,7 +7233,7 @@ struct mlx5_ifc_add_vxlan_udp_dport_in_b
+       u8         vxlan_udp_port[0x10];
+ };
+-struct mlx5_ifc_set_rate_limit_out_bits {
++struct mlx5_ifc_set_pp_rate_limit_out_bits {
+       u8         status[0x8];
+       u8         reserved_at_8[0x18];
+@@ -7242,7 +7242,7 @@ struct mlx5_ifc_set_rate_limit_out_bits
+       u8         reserved_at_40[0x40];
+ };
+-struct mlx5_ifc_set_rate_limit_in_bits {
++struct mlx5_ifc_set_pp_rate_limit_in_bits {
+       u8         opcode[0x10];
+       u8         reserved_at_10[0x10];
+@@ -7255,6 +7255,8 @@ struct mlx5_ifc_set_rate_limit_in_bits {
+       u8         reserved_at_60[0x20];
+       u8         rate_limit[0x20];
++
++      u8         reserved_at_a0[0x160];
+ };
+ struct mlx5_ifc_access_register_out_bits {
diff --git a/queue-4.14/net-mlx5-fpga-return-einval-if-size-is-zero.patch b/queue-4.14/net-mlx5-fpga-return-einval-if-size-is-zero.patch
new file mode 100644 (file)
index 0000000..c7df59d
--- /dev/null
@@ -0,0 +1,53 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Kamal Heib <kamalh@mellanox.com>
+Date: Sun, 29 Oct 2017 04:03:37 +0200
+Subject: net/mlx5: FPGA, return -EINVAL if size is zero
+
+From: Kamal Heib <kamalh@mellanox.com>
+
+
+[ Upstream commit bae115a2bb479142605726e6aa130f43f50e801a ]
+
+Currently, if a size of zero is passed to
+mlx5_fpga_mem_{read|write}_i2c()
+the "err" return value will not be initialized, which triggers gcc
+warnings:
+
+[..]/mlx5/core/fpga/sdk.c:87 mlx5_fpga_mem_read_i2c() error:
+uninitialized symbol 'err'.
+[..]/mlx5/core/fpga/sdk.c:115 mlx5_fpga_mem_write_i2c() error:
+uninitialized symbol 'err'.
+
+fix that.
+
+Fixes: a9956d35d199 ('net/mlx5: FPGA, Add SBU infrastructure')
+Signed-off-by: Kamal Heib <kamalh@mellanox.com>
+Reviewed-by: Yevgeny Kliteynik <kliteyn@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/fpga/sdk.c
+@@ -66,6 +66,9 @@ static int mlx5_fpga_mem_read_i2c(struct
+       u8 actual_size;
+       int err;
++      if (!size)
++              return -EINVAL;
++
+       if (!fdev->mdev)
+               return -ENOTCONN;
+@@ -95,6 +98,9 @@ static int mlx5_fpga_mem_write_i2c(struc
+       u8 actual_size;
+       int err;
++      if (!size)
++              return -EINVAL;
++
+       if (!fdev->mdev)
+               return -ENOTCONN;
diff --git a/queue-4.14/net-mlx5e-add-refcount-to-vxlan-structure.patch b/queue-4.14/net-mlx5e-add-refcount-to-vxlan-structure.patch
new file mode 100644 (file)
index 0000000..2fd4c61
--- /dev/null
@@ -0,0 +1,132 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Gal Pressman <galp@mellanox.com>
+Date: Sun, 3 Dec 2017 13:58:50 +0200
+Subject: net/mlx5e: Add refcount to VXLAN structure
+
+From: Gal Pressman <galp@mellanox.com>
+
+
+[ Upstream commit 23f4cc2cd9ed92570647220aca60d0197d8c1fa9 ]
+
+A refcount mechanism must be implemented in order to prevent unwanted
+scenarios such as:
+- Open an IPv4 VXLAN interface
+- Open an IPv6 VXLAN interface (different socket)
+- Remove one of the interfaces
+
+With current implementation, the UDP port will be removed from our VXLAN
+database and turn off the offloads for the other interface, which is
+still active.
+The reference count mechanism will only allow UDP port removals once all
+consumers are gone.
+
+Fixes: b3f63c3d5e2c ("net/mlx5e: Add netdev support for VXLAN tunneling")
+Signed-off-by: Gal Pressman <galp@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/vxlan.c |   50 ++++++++++++------------
+ drivers/net/ethernet/mellanox/mlx5/core/vxlan.h |    1 
+ 2 files changed, 28 insertions(+), 23 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+@@ -88,8 +88,11 @@ static void mlx5e_vxlan_add_port(struct
+       struct mlx5e_vxlan *vxlan;
+       int err;
+-      if (mlx5e_vxlan_lookup_port(priv, port))
++      vxlan = mlx5e_vxlan_lookup_port(priv, port);
++      if (vxlan) {
++              atomic_inc(&vxlan->refcount);
+               goto free_work;
++      }
+       if (mlx5e_vxlan_core_add_port_cmd(priv->mdev, port))
+               goto free_work;
+@@ -99,6 +102,7 @@ static void mlx5e_vxlan_add_port(struct
+               goto err_delete_port;
+       vxlan->udp_port = port;
++      atomic_set(&vxlan->refcount, 1);
+       spin_lock_bh(&vxlan_db->lock);
+       err = radix_tree_insert(&vxlan_db->tree, vxlan->udp_port, vxlan);
+@@ -116,32 +120,33 @@ free_work:
+       kfree(vxlan_work);
+ }
+-static void __mlx5e_vxlan_core_del_port(struct mlx5e_priv *priv, u16 port)
++static void mlx5e_vxlan_del_port(struct work_struct *work)
+ {
++      struct mlx5e_vxlan_work *vxlan_work =
++              container_of(work, struct mlx5e_vxlan_work, work);
++      struct mlx5e_priv *priv         = vxlan_work->priv;
+       struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
++      u16 port = vxlan_work->port;
+       struct mlx5e_vxlan *vxlan;
++      bool remove = false;
+       spin_lock_bh(&vxlan_db->lock);
+-      vxlan = radix_tree_delete(&vxlan_db->tree, port);
+-      spin_unlock_bh(&vxlan_db->lock);
+-
++      vxlan = radix_tree_lookup(&vxlan_db->tree, port);
+       if (!vxlan)
+-              return;
+-
+-      mlx5e_vxlan_core_del_port_cmd(priv->mdev, vxlan->udp_port);
+-
+-      kfree(vxlan);
+-}
++              goto out_unlock;
+-static void mlx5e_vxlan_del_port(struct work_struct *work)
+-{
+-      struct mlx5e_vxlan_work *vxlan_work =
+-              container_of(work, struct mlx5e_vxlan_work, work);
+-      struct mlx5e_priv *priv = vxlan_work->priv;
+-      u16 port = vxlan_work->port;
++      if (atomic_dec_and_test(&vxlan->refcount)) {
++              radix_tree_delete(&vxlan_db->tree, port);
++              remove = true;
++      }
+-      __mlx5e_vxlan_core_del_port(priv, port);
++out_unlock:
++      spin_unlock_bh(&vxlan_db->lock);
++      if (remove) {
++              mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
++              kfree(vxlan);
++      }
+       kfree(vxlan_work);
+ }
+@@ -171,12 +176,11 @@ void mlx5e_vxlan_cleanup(struct mlx5e_pr
+       struct mlx5e_vxlan *vxlan;
+       unsigned int port = 0;
+-      spin_lock_bh(&vxlan_db->lock);
++      /* Lockless since we are the only radix-tree consumers, wq is disabled */
+       while (radix_tree_gang_lookup(&vxlan_db->tree, (void **)&vxlan, port, 1)) {
+               port = vxlan->udp_port;
+-              spin_unlock_bh(&vxlan_db->lock);
+-              __mlx5e_vxlan_core_del_port(priv, (u16)port);
+-              spin_lock_bh(&vxlan_db->lock);
++              radix_tree_delete(&vxlan_db->tree, port);
++              mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
++              kfree(vxlan);
+       }
+-      spin_unlock_bh(&vxlan_db->lock);
+ }
+--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
++++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.h
+@@ -36,6 +36,7 @@
+ #include "en.h"
+ struct mlx5e_vxlan {
++      atomic_t refcount;
+       u16 udp_port;
+ };
diff --git a/queue-4.14/net-mlx5e-fix-features-check-of-ipv6-traffic.patch b/queue-4.14/net-mlx5e-fix-features-check-of-ipv6-traffic.patch
new file mode 100644 (file)
index 0000000..d78fc7a
--- /dev/null
@@ -0,0 +1,50 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Gal Pressman <galp@mellanox.com>
+Date: Tue, 21 Nov 2017 17:49:36 +0200
+Subject: net/mlx5e: Fix features check of IPv6 traffic
+
+From: Gal Pressman <galp@mellanox.com>
+
+
+[ Upstream commit 2989ad1ec03021ee6d2193c35414f1d970a243de ]
+
+The assumption that the next header field contains the transport
+protocol is wrong for IPv6 packets with extension headers.
+Instead, we should look at the inner-most next header field in the buffer.
+This will fix TSO offload for tunnels over IPv6 with extension headers.
+
+Performance testing: 19.25x improvement, cool!
+Measuring bandwidth of 16 threads TCP traffic over IPv6 GRE tap.
+CPU: Intel(R) Xeon(R) CPU E5-2660 v2 @ 2.20GHz
+NIC: Mellanox Technologies MT28800 Family [ConnectX-5 Ex]
+TSO: Enabled
+Before: 4,926.24  Mbps
+Now   : 94,827.91 Mbps
+
+Fixes: b3f63c3d5e2c ("net/mlx5e: Add netdev support for VXLAN tunneling")
+Signed-off-by: Gal Pressman <galp@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_main.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+@@ -3554,6 +3554,7 @@ static netdev_features_t mlx5e_tunnel_fe
+                                                    struct sk_buff *skb,
+                                                    netdev_features_t features)
+ {
++      unsigned int offset = 0;
+       struct udphdr *udph;
+       u8 proto;
+       u16 port;
+@@ -3563,7 +3564,7 @@ static netdev_features_t mlx5e_tunnel_fe
+               proto = ip_hdr(skb)->protocol;
+               break;
+       case htons(ETH_P_IPV6):
+-              proto = ipv6_hdr(skb)->nexthdr;
++              proto = ipv6_find_hdr(skb, &offset, -1, NULL, NULL);
+               break;
+       default:
+               goto out;
diff --git a/queue-4.14/net-mlx5e-fix-possible-deadlock-of-vxlan-lock.patch b/queue-4.14/net-mlx5e-fix-possible-deadlock-of-vxlan-lock.patch
new file mode 100644 (file)
index 0000000..31a501d
--- /dev/null
@@ -0,0 +1,107 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Gal Pressman <galp@mellanox.com>
+Date: Thu, 23 Nov 2017 13:52:28 +0200
+Subject: net/mlx5e: Fix possible deadlock of VXLAN lock
+
+From: Gal Pressman <galp@mellanox.com>
+
+
+[ Upstream commit 6323514116404cc651df1b7fffa1311ddf8ce647 ]
+
+mlx5e_vxlan_lookup_port is called both from mlx5e_add_vxlan_port (user
+context) and mlx5e_features_check (softirq), but the lock acquired does
+not disable bottom half and might result in deadlock. Fix it by simply
+replacing spin_lock() with spin_lock_bh().
+While at it, replace all unnecessary spin_lock_irq() with spin_lock_bh().
+
+lockdep's WARNING: inconsistent lock state
+[  654.028136] inconsistent {SOFTIRQ-ON-W} -> {IN-SOFTIRQ-W} usage.
+[  654.028229] swapper/5/0 [HC0[0]:SC1[9]:HE1:SE0] takes:
+[  654.028321]  (&(&vxlan_db->lock)->rlock){+.?.}, at: [<ffffffffa06e7f0e>] mlx5e_vxlan_lookup_port+0x1e/0x50 [mlx5_core]
+[  654.028528] {SOFTIRQ-ON-W} state was registered at:
+[  654.028607]   _raw_spin_lock+0x3c/0x70
+[  654.028689]   mlx5e_vxlan_lookup_port+0x1e/0x50 [mlx5_core]
+[  654.028794]   mlx5e_vxlan_add_port+0x2e/0x120 [mlx5_core]
+[  654.028878]   process_one_work+0x1e9/0x640
+[  654.028942]   worker_thread+0x4a/0x3f0
+[  654.029002]   kthread+0x141/0x180
+[  654.029056]   ret_from_fork+0x24/0x30
+[  654.029114] irq event stamp: 579088
+[  654.029174] hardirqs last  enabled at (579088): [<ffffffff818f475a>] ip6_finish_output2+0x49a/0x8c0
+[  654.029309] hardirqs last disabled at (579087): [<ffffffff818f470e>] ip6_finish_output2+0x44e/0x8c0
+[  654.029446] softirqs last  enabled at (579030): [<ffffffff810b3b3d>] irq_enter+0x6d/0x80
+[  654.029567] softirqs last disabled at (579031): [<ffffffff810b3c05>] irq_exit+0xb5/0xc0
+[  654.029684] other info that might help us debug this:
+[  654.029781]  Possible unsafe locking scenario:
+
+[  654.029868]        CPU0
+[  654.029908]        ----
+[  654.029947]   lock(&(&vxlan_db->lock)->rlock);
+[  654.030045]   <Interrupt>
+[  654.030090]     lock(&(&vxlan_db->lock)->rlock);
+[  654.030162]
+ *** DEADLOCK ***
+
+Fixes: b3f63c3d5e2c ("net/mlx5e: Add netdev support for VXLAN tunneling")
+Signed-off-by: Gal Pressman <galp@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/vxlan.c |   20 ++++++++++----------
+ 1 file changed, 10 insertions(+), 10 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+@@ -71,9 +71,9 @@ struct mlx5e_vxlan *mlx5e_vxlan_lookup_p
+       struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
+       struct mlx5e_vxlan *vxlan;
+-      spin_lock(&vxlan_db->lock);
++      spin_lock_bh(&vxlan_db->lock);
+       vxlan = radix_tree_lookup(&vxlan_db->tree, port);
+-      spin_unlock(&vxlan_db->lock);
++      spin_unlock_bh(&vxlan_db->lock);
+       return vxlan;
+ }
+@@ -100,9 +100,9 @@ static void mlx5e_vxlan_add_port(struct
+       vxlan->udp_port = port;
+-      spin_lock_irq(&vxlan_db->lock);
++      spin_lock_bh(&vxlan_db->lock);
+       err = radix_tree_insert(&vxlan_db->tree, vxlan->udp_port, vxlan);
+-      spin_unlock_irq(&vxlan_db->lock);
++      spin_unlock_bh(&vxlan_db->lock);
+       if (err)
+               goto err_free;
+@@ -121,9 +121,9 @@ static void __mlx5e_vxlan_core_del_port(
+       struct mlx5e_vxlan_db *vxlan_db = &priv->vxlan;
+       struct mlx5e_vxlan *vxlan;
+-      spin_lock_irq(&vxlan_db->lock);
++      spin_lock_bh(&vxlan_db->lock);
+       vxlan = radix_tree_delete(&vxlan_db->tree, port);
+-      spin_unlock_irq(&vxlan_db->lock);
++      spin_unlock_bh(&vxlan_db->lock);
+       if (!vxlan)
+               return;
+@@ -171,12 +171,12 @@ void mlx5e_vxlan_cleanup(struct mlx5e_pr
+       struct mlx5e_vxlan *vxlan;
+       unsigned int port = 0;
+-      spin_lock_irq(&vxlan_db->lock);
++      spin_lock_bh(&vxlan_db->lock);
+       while (radix_tree_gang_lookup(&vxlan_db->tree, (void **)&vxlan, port, 1)) {
+               port = vxlan->udp_port;
+-              spin_unlock_irq(&vxlan_db->lock);
++              spin_unlock_bh(&vxlan_db->lock);
+               __mlx5e_vxlan_core_del_port(priv, (u16)port);
+-              spin_lock_irq(&vxlan_db->lock);
++              spin_lock_bh(&vxlan_db->lock);
+       }
+-      spin_unlock_irq(&vxlan_db->lock);
++      spin_unlock_bh(&vxlan_db->lock);
+ }
diff --git a/queue-4.14/net-mlx5e-prevent-possible-races-in-vxlan-control-flow.patch b/queue-4.14/net-mlx5e-prevent-possible-races-in-vxlan-control-flow.patch
new file mode 100644 (file)
index 0000000..8652bc4
--- /dev/null
@@ -0,0 +1,60 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Gal Pressman <galp@mellanox.com>
+Date: Mon, 4 Dec 2017 09:57:43 +0200
+Subject: net/mlx5e: Prevent possible races in VXLAN control flow
+
+From: Gal Pressman <galp@mellanox.com>
+
+
+[ Upstream commit 0c1cc8b2215f5122ca614b5adca60346018758c3 ]
+
+When calling add/remove VXLAN port, a lock must be held in order to
+prevent race scenarios when more than one add/remove happens at the
+same time.
+Fix by holding our state_lock (mutex) as done by all other parts of the
+driver.
+Note that the spinlock protecting the radix-tree is still needed in
+order to synchronize radix-tree access from softirq context.
+
+Fixes: b3f63c3d5e2c ("net/mlx5e: Add netdev support for VXLAN tunneling")
+Signed-off-by: Gal Pressman <galp@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/vxlan.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/vxlan.c
+@@ -88,6 +88,7 @@ static void mlx5e_vxlan_add_port(struct
+       struct mlx5e_vxlan *vxlan;
+       int err;
++      mutex_lock(&priv->state_lock);
+       vxlan = mlx5e_vxlan_lookup_port(priv, port);
+       if (vxlan) {
+               atomic_inc(&vxlan->refcount);
+@@ -117,6 +118,7 @@ err_free:
+ err_delete_port:
+       mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
+ free_work:
++      mutex_unlock(&priv->state_lock);
+       kfree(vxlan_work);
+ }
+@@ -130,6 +132,7 @@ static void mlx5e_vxlan_del_port(struct
+       struct mlx5e_vxlan *vxlan;
+       bool remove = false;
++      mutex_lock(&priv->state_lock);
+       spin_lock_bh(&vxlan_db->lock);
+       vxlan = radix_tree_lookup(&vxlan_db->tree, port);
+       if (!vxlan)
+@@ -147,6 +150,7 @@ out_unlock:
+               mlx5e_vxlan_core_del_port_cmd(priv->mdev, port);
+               kfree(vxlan);
+       }
++      mutex_unlock(&priv->state_lock);
+       kfree(vxlan_work);
+ }
diff --git a/queue-4.14/net-mvmdio-disable-unprepare-clocks-in-eprobe_defer-case.patch b/queue-4.14/net-mvmdio-disable-unprepare-clocks-in-eprobe_defer-case.patch
new file mode 100644 (file)
index 0000000..efb8aaf
--- /dev/null
@@ -0,0 +1,36 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Tobias Jordan <Tobias.Jordan@elektrobit.com>
+Date: Wed, 6 Dec 2017 15:23:23 +0100
+Subject: net: mvmdio: disable/unprepare clocks in EPROBE_DEFER case
+
+From: Tobias Jordan <Tobias.Jordan@elektrobit.com>
+
+
+[ Upstream commit 589bf32f09852041fbd3b7ce1a9e703f95c230ba ]
+
+add appropriate calls to clk_disable_unprepare() by jumping to out_mdio
+in case orion_mdio_probe() returns -EPROBE_DEFER.
+
+Found by Linux Driver Verification project (linuxtesting.org).
+
+Fixes: 3d604da1e954 ("net: mvmdio: get and enable optional clock")
+Signed-off-by: Tobias Jordan <Tobias.Jordan@elektrobit.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/marvell/mvmdio.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/marvell/mvmdio.c
++++ b/drivers/net/ethernet/marvell/mvmdio.c
+@@ -344,7 +344,8 @@ static int orion_mdio_probe(struct platf
+                       dev->regs + MVMDIO_ERR_INT_MASK);
+       } else if (dev->err_interrupt == -EPROBE_DEFER) {
+-              return -EPROBE_DEFER;
++              ret = -EPROBE_DEFER;
++              goto out_mdio;
+       }
+       if (pdev->dev.of_node)
diff --git a/queue-4.14/net-phy-marvell-limit-88m1101-autoneg-errata-to-88e1145-as-well.patch b/queue-4.14/net-phy-marvell-limit-88m1101-autoneg-errata-to-88e1145-as-well.patch
new file mode 100644 (file)
index 0000000..52aefc8
--- /dev/null
@@ -0,0 +1,31 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Zhao Qiang <qiang.zhao@nxp.com>
+Date: Mon, 18 Dec 2017 10:26:43 +0800
+Subject: net: phy: marvell: Limit 88m1101 autoneg errata to 88E1145 as well.
+
+From: Zhao Qiang <qiang.zhao@nxp.com>
+
+
+[ Upstream commit c505873eaece2b4aefd07d339dc7e1400e0235ac ]
+
+88E1145 also needs this autoneg errata.
+
+Fixes: f2899788353c ("net: phy: marvell: Limit errata to 88m1101")
+Signed-off-by: Zhao Qiang <qiang.zhao@nxp.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/phy/marvell.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/phy/marvell.c
++++ b/drivers/net/phy/marvell.c
+@@ -2069,7 +2069,7 @@ static struct phy_driver marvell_drivers
+               .flags = PHY_HAS_INTERRUPT,
+               .probe = marvell_probe,
+               .config_init = &m88e1145_config_init,
+-              .config_aneg = &marvell_config_aneg,
++              .config_aneg = &m88e1101_config_aneg,
+               .read_status = &genphy_read_status,
+               .ack_interrupt = &marvell_ack_interrupt,
+               .config_intr = &marvell_config_intr,
diff --git a/queue-4.14/net-phy-micrel-ksz9031-reconfigure-autoneg-after-phy-autoneg-workaround.patch b/queue-4.14/net-phy-micrel-ksz9031-reconfigure-autoneg-after-phy-autoneg-workaround.patch
new file mode 100644 (file)
index 0000000..70d5018
--- /dev/null
@@ -0,0 +1,38 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Grygorii Strashko <grygorii.strashko@ti.com>
+Date: Wed, 20 Dec 2017 18:45:10 -0600
+Subject: net: phy: micrel: ksz9031: reconfigure autoneg after phy autoneg workaround
+
+From: Grygorii Strashko <grygorii.strashko@ti.com>
+
+
+[ Upstream commit c1a8d0a3accf64a014d605e6806ce05d1c17adf1 ]
+
+Under some circumstances driver will perform PHY reset in
+ksz9031_read_status() to fix autoneg failure case (idle error count =
+0xFF). When this happens ksz9031 will not detect link status change any
+more when connecting to Netgear 1G switch (link can be recovered sometimes by
+restarting netdevice "ifconfig down up"). Reproduced with TI am572x board
+equipped with ksz9031 PHY while connecting to Netgear 1G switch.
+
+Fix the issue by reconfiguring autonegotiation after PHY reset in
+ksz9031_read_status().
+
+Fixes: d2fd719bcb0e ("net/phy: micrel: Add workaround for bad autoneg")
+Signed-off-by: Grygorii Strashko <grygorii.strashko@ti.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/phy/micrel.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/net/phy/micrel.c
++++ b/drivers/net/phy/micrel.c
+@@ -622,6 +622,7 @@ static int ksz9031_read_status(struct ph
+               phydev->link = 0;
+               if (phydev->drv->config_intr && phy_interrupt_is_valid(phydev))
+                       phydev->drv->config_intr(phydev);
++              return genphy_config_aneg(phydev);
+       }
+       return 0;
diff --git a/queue-4.14/net-qmi_wwan-add-sierra-em7565-1199-9091.patch b/queue-4.14/net-qmi_wwan-add-sierra-em7565-1199-9091.patch
new file mode 100644 (file)
index 0000000..e8f8e58
--- /dev/null
@@ -0,0 +1,32 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Sebastian Sjoholm <ssjoholm@mac.com>
+Date: Mon, 11 Dec 2017 21:51:14 +0100
+Subject: net: qmi_wwan: add Sierra EM7565 1199:9091
+
+From: Sebastian Sjoholm <ssjoholm@mac.com>
+
+
+[ Upstream commit aceef61ee56898cfa7b6960fb60b9326c3860441 ]
+
+Sierra Wireless EM7565 is a Qualcomm MDM9x50 based M.2 modem.
+The USB id is added to qmi_wwan.c to allow QMI communication
+with the EM7565.
+
+Signed-off-by: Sebastian Sjoholm <ssjoholm@mac.com>
+Acked-by: Bjørn Mork <bjorn@mork.no>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/usb/qmi_wwan.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/net/usb/qmi_wwan.c
++++ b/drivers/net/usb/qmi_wwan.c
+@@ -1204,6 +1204,7 @@ static const struct usb_device_id produc
+       {QMI_FIXED_INTF(0x1199, 0x9079, 10)},   /* Sierra Wireless EM74xx */
+       {QMI_FIXED_INTF(0x1199, 0x907b, 8)},    /* Sierra Wireless EM74xx */
+       {QMI_FIXED_INTF(0x1199, 0x907b, 10)},   /* Sierra Wireless EM74xx */
++      {QMI_FIXED_INTF(0x1199, 0x9091, 8)},    /* Sierra Wireless EM7565 */
+       {QMI_FIXED_INTF(0x1bbb, 0x011e, 4)},    /* Telekom Speedstick LTE II (Alcatel One Touch L100V LTE) */
+       {QMI_FIXED_INTF(0x1bbb, 0x0203, 2)},    /* Alcatel L800MA */
+       {QMI_FIXED_INTF(0x2357, 0x0201, 4)},    /* TP-LINK HSUPA Modem MA180 */
diff --git a/queue-4.14/net-reevalulate-autoflowlabel-setting-after-sysctl-setting.patch b/queue-4.14/net-reevalulate-autoflowlabel-setting-after-sysctl-setting.patch
new file mode 100644 (file)
index 0000000..19df827
--- /dev/null
@@ -0,0 +1,118 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Shaohua Li <shli@fb.com>
+Date: Wed, 20 Dec 2017 12:10:21 -0800
+Subject: net: reevalulate autoflowlabel setting after sysctl setting
+
+From: Shaohua Li <shli@fb.com>
+
+
+[ Upstream commit 513674b5a2c9c7a67501506419da5c3c77ac6f08 ]
+
+sysctl.ip6.auto_flowlabels is default 1. In our hosts, we set it to 2.
+If sockopt doesn't set autoflowlabel, outgoing packets from the hosts are
+supposed to not include flowlabel. This is true for normal packet, but
+not for reset packet.
+
+The reason is ipv6_pinfo.autoflowlabel is set in sock creation. Later if
+we change sysctl.ip6.auto_flowlabels, the ipv6_pinfo.autoflowlabel isn't
+changed, so the sock will keep the old behavior in terms of auto
+flowlabel. Reset packet is suffering from this problem, because reset
+packet is sent from a special control socket, which is created at boot
+time. Since sysctl.ipv6.auto_flowlabels is 1 by default, the control
+socket will always have its ipv6_pinfo.autoflowlabel set, even after
+user set sysctl.ipv6.auto_flowlabels to 1, so reset packet will always
+have flowlabel. Normal sock created before sysctl setting suffers from
+the same issue. We can't even turn off autoflowlabel unless we kill all
+socks in the hosts.
+
+To fix this, if IPV6_AUTOFLOWLABEL sockopt is used, we use the
+autoflowlabel setting from user, otherwise we always call
+ip6_default_np_autolabel() which has the new settings of sysctl.
+
+Note, this changes behavior a little bit. Before commit 42240901f7c4
+(ipv6: Implement different admin modes for automatic flow labels), the
+autoflowlabel behavior of a sock isn't sticky, eg, if sysctl changes,
+existing connection will change autoflowlabel behavior. After that
+commit, autoflowlabel behavior is sticky in the whole life of the sock.
+With this patch, the behavior isn't sticky again.
+
+Cc: Martin KaFai Lau <kafai@fb.com>
+Cc: Eric Dumazet <eric.dumazet@gmail.com>
+Cc: Tom Herbert <tom@quantonium.net>
+Signed-off-by: Shaohua Li <shli@fb.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/ipv6.h     |    3 ++-
+ net/ipv6/af_inet6.c      |    1 -
+ net/ipv6/ip6_output.c    |   12 ++++++++++--
+ net/ipv6/ipv6_sockglue.c |    1 +
+ 4 files changed, 13 insertions(+), 4 deletions(-)
+
+--- a/include/linux/ipv6.h
++++ b/include/linux/ipv6.h
+@@ -272,7 +272,8 @@ struct ipv6_pinfo {
+                                                * 100: prefer care-of address
+                                                */
+                               dontfrag:1,
+-                              autoflowlabel:1;
++                              autoflowlabel:1,
++                              autoflowlabel_set:1;
+       __u8                    min_hopcount;
+       __u8                    tclass;
+       __be32                  rcv_flowinfo;
+--- a/net/ipv6/af_inet6.c
++++ b/net/ipv6/af_inet6.c
+@@ -210,7 +210,6 @@ lookup_protocol:
+       np->mcast_hops  = IPV6_DEFAULT_MCASTHOPS;
+       np->mc_loop     = 1;
+       np->pmtudisc    = IPV6_PMTUDISC_WANT;
+-      np->autoflowlabel = ip6_default_np_autolabel(net);
+       np->repflow     = net->ipv6.sysctl.flowlabel_reflect;
+       sk->sk_ipv6only = net->ipv6.sysctl.bindv6only;
+--- a/net/ipv6/ip6_output.c
++++ b/net/ipv6/ip6_output.c
+@@ -166,6 +166,14 @@ int ip6_output(struct net *net, struct s
+                           !(IP6CB(skb)->flags & IP6SKB_REROUTED));
+ }
++static bool ip6_autoflowlabel(struct net *net, const struct ipv6_pinfo *np)
++{
++      if (!np->autoflowlabel_set)
++              return ip6_default_np_autolabel(net);
++      else
++              return np->autoflowlabel;
++}
++
+ /*
+  * xmit an sk_buff (used by TCP, SCTP and DCCP)
+  * Note : socket lock is not held for SYNACK packets, but might be modified
+@@ -230,7 +238,7 @@ int ip6_xmit(const struct sock *sk, stru
+               hlimit = ip6_dst_hoplimit(dst);
+       ip6_flow_hdr(hdr, tclass, ip6_make_flowlabel(net, skb, fl6->flowlabel,
+-                                                   np->autoflowlabel, fl6));
++                              ip6_autoflowlabel(net, np), fl6));
+       hdr->payload_len = htons(seg_len);
+       hdr->nexthdr = proto;
+@@ -1626,7 +1634,7 @@ struct sk_buff *__ip6_make_skb(struct so
+       ip6_flow_hdr(hdr, v6_cork->tclass,
+                    ip6_make_flowlabel(net, skb, fl6->flowlabel,
+-                                      np->autoflowlabel, fl6));
++                                      ip6_autoflowlabel(net, np), fl6));
+       hdr->hop_limit = v6_cork->hop_limit;
+       hdr->nexthdr = proto;
+       hdr->saddr = fl6->saddr;
+--- a/net/ipv6/ipv6_sockglue.c
++++ b/net/ipv6/ipv6_sockglue.c
+@@ -878,6 +878,7 @@ pref_skip_coa:
+               break;
+       case IPV6_AUTOFLOWLABEL:
+               np->autoflowlabel = valbool;
++              np->autoflowlabel_set = 1;
+               retv = 0;
+               break;
+       case IPV6_RECVFRAGSIZE:
diff --git a/queue-4.14/net-sched-fix-static-key-imbalance-in-case-of-ingress-clsact_init-error.patch b/queue-4.14/net-sched-fix-static-key-imbalance-in-case-of-ingress-clsact_init-error.patch
new file mode 100644 (file)
index 0000000..62f7305
--- /dev/null
@@ -0,0 +1,59 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Jiri Pirko <jiri@mellanox.com>
+Date: Fri, 15 Dec 2017 12:40:13 +0100
+Subject: net: sched: fix static key imbalance in case of ingress/clsact_init error
+
+From: Jiri Pirko <jiri@mellanox.com>
+
+
+[ Upstream commit b59e6979a86384e68b0ab6ffeab11f0034fba82d ]
+
+Move static key increments to the beginning of the init function
+so they pair 1:1 with decrements in ingress/clsact_destroy,
+which is called in case ingress/clsact_init fails.
+
+Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure")
+Signed-off-by: Jiri Pirko <jiri@mellanox.com>
+Acked-by: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sched/sch_ingress.c |    9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+--- a/net/sched/sch_ingress.c
++++ b/net/sched/sch_ingress.c
+@@ -59,11 +59,12 @@ static int ingress_init(struct Qdisc *sc
+       struct net_device *dev = qdisc_dev(sch);
+       int err;
++      net_inc_ingress_queue();
++
+       err = tcf_block_get(&q->block, &dev->ingress_cl_list);
+       if (err)
+               return err;
+-      net_inc_ingress_queue();
+       sch->flags |= TCQ_F_CPUSTATS;
+       return 0;
+@@ -153,6 +154,9 @@ static int clsact_init(struct Qdisc *sch
+       struct net_device *dev = qdisc_dev(sch);
+       int err;
++      net_inc_ingress_queue();
++      net_inc_egress_queue();
++
+       err = tcf_block_get(&q->ingress_block, &dev->ingress_cl_list);
+       if (err)
+               return err;
+@@ -161,9 +165,6 @@ static int clsact_init(struct Qdisc *sch
+       if (err)
+               return err;
+-      net_inc_ingress_queue();
+-      net_inc_egress_queue();
+-
+       sch->flags |= TCQ_F_CPUSTATS;
+       return 0;
diff --git a/queue-4.14/netlink-add-netns-check-on-taps.patch b/queue-4.14/netlink-add-netns-check-on-taps.patch
new file mode 100644 (file)
index 0000000..0a66595
--- /dev/null
@@ -0,0 +1,44 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Kevin Cernekee <cernekee@chromium.org>
+Date: Wed, 6 Dec 2017 12:12:27 -0800
+Subject: netlink: Add netns check on taps
+
+From: Kevin Cernekee <cernekee@chromium.org>
+
+
+[ Upstream commit 93c647643b48f0131f02e45da3bd367d80443291 ]
+
+Currently, a nlmon link inside a child namespace can observe systemwide
+netlink activity.  Filter the traffic so that nlmon can only sniff
+netlink messages from its own netns.
+
+Test case:
+
+    vpnns -- bash -c "ip link add nlmon0 type nlmon; \
+                      ip link set nlmon0 up; \
+                      tcpdump -i nlmon0 -q -w /tmp/nlmon.pcap -U" &
+    sudo ip xfrm state add src 10.1.1.1 dst 10.1.1.2 proto esp \
+        spi 0x1 mode transport \
+        auth sha1 0x6162633132330000000000000000000000000000 \
+        enc aes 0x00000000000000000000000000000000
+    grep --binary abc123 /tmp/nlmon.pcap
+
+Signed-off-by: Kevin Cernekee <cernekee@chromium.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/netlink/af_netlink.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/net/netlink/af_netlink.c
++++ b/net/netlink/af_netlink.c
+@@ -254,6 +254,9 @@ static int __netlink_deliver_tap_skb(str
+       struct sock *sk = skb->sk;
+       int ret = -ENOMEM;
++      if (!net_eq(dev_net(dev), sock_net(sk)))
++              return 0;
++
+       dev_hold(dev);
+       if (is_vmalloc_addr(skb->head))
diff --git a/queue-4.14/openvswitch-fix-pop_vlan-action-for-double-tagged-frames.patch b/queue-4.14/openvswitch-fix-pop_vlan-action-for-double-tagged-frames.patch
new file mode 100644 (file)
index 0000000..e1cbe88
--- /dev/null
@@ -0,0 +1,60 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Eric Garver <e@erig.me>
+Date: Wed, 20 Dec 2017 15:09:22 -0500
+Subject: openvswitch: Fix pop_vlan action for double tagged frames
+
+From: Eric Garver <e@erig.me>
+
+
+[ Upstream commit c48e74736fccf25fb32bb015426359e1c2016e3b ]
+
+skb_vlan_pop() expects skb->protocol to be a valid TPID for double
+tagged frames. So set skb->protocol to the TPID and let skb_vlan_pop()
+shift the true ethertype into position for us.
+
+Fixes: 5108bbaddc37 ("openvswitch: add processing of L3 packets")
+Signed-off-by: Eric Garver <e@erig.me>
+Reviewed-by: Jiri Benc <jbenc@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/openvswitch/flow.c |   15 ++++++++++++---
+ 1 file changed, 12 insertions(+), 3 deletions(-)
+
+--- a/net/openvswitch/flow.c
++++ b/net/openvswitch/flow.c
+@@ -532,6 +532,7 @@ static int key_extract(struct sk_buff *s
+                       return -EINVAL;
+               skb_reset_network_header(skb);
++              key->eth.type = skb->protocol;
+       } else {
+               eth = eth_hdr(skb);
+               ether_addr_copy(key->eth.src, eth->h_source);
+@@ -545,15 +546,23 @@ static int key_extract(struct sk_buff *s
+               if (unlikely(parse_vlan(skb, key)))
+                       return -ENOMEM;
+-              skb->protocol = parse_ethertype(skb);
+-              if (unlikely(skb->protocol == htons(0)))
++              key->eth.type = parse_ethertype(skb);
++              if (unlikely(key->eth.type == htons(0)))
+                       return -ENOMEM;
++              /* Multiple tagged packets need to retain TPID to satisfy
++               * skb_vlan_pop(), which will later shift the ethertype into
++               * skb->protocol.
++               */
++              if (key->eth.cvlan.tci & htons(VLAN_TAG_PRESENT))
++                      skb->protocol = key->eth.cvlan.tpid;
++              else
++                      skb->protocol = key->eth.type;
++
+               skb_reset_network_header(skb);
+               __skb_push(skb, skb->data - skb_mac_header(skb));
+       }
+       skb_reset_mac_len(skb);
+-      key->eth.type = skb->protocol;
+       /* Network layer. */
+       if (key->eth.type == htons(ETH_P_IP)) {
diff --git a/queue-4.14/phylink-ensure-an-is-enabled.patch b/queue-4.14/phylink-ensure-an-is-enabled.patch
new file mode 100644 (file)
index 0000000..1741031
--- /dev/null
@@ -0,0 +1,33 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Russell King <rmk+kernel@armlinux.org.uk>
+Date: Wed, 20 Dec 2017 23:21:34 +0000
+Subject: phylink: ensure AN is enabled
+
+From: Russell King <rmk+kernel@armlinux.org.uk>
+
+
+[ Upstream commit 74ee0e8c1bf9925c59cc8f1c65c29adf6e4cf603 ]
+
+Ensure that we mark AN as enabled at boot time, rather than leaving
+it disabled.  This is noticeable if your SFP module is fiber, and
+it supports faster speeds than 1G with 2.5G support in place.
+
+Fixes: 9525ae83959b ("phylink: add phylink infrastructure")
+Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
+Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/phy/phylink.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/net/phy/phylink.c
++++ b/drivers/net/phy/phylink.c
+@@ -525,6 +525,7 @@ struct phylink *phylink_create(struct ne
+       pl->link_config.pause = MLO_PAUSE_AN;
+       pl->link_config.speed = SPEED_UNKNOWN;
+       pl->link_config.duplex = DUPLEX_UNKNOWN;
++      pl->link_config.an_enabled = true;
+       pl->ops = ops;
+       __set_bit(PHYLINK_DISABLE_STOPPED, &pl->phylink_disable_state);
diff --git a/queue-4.14/phylink-ensure-the-phy-interface-mode-is-appropriately-set.patch b/queue-4.14/phylink-ensure-the-phy-interface-mode-is-appropriately-set.patch
new file mode 100644 (file)
index 0000000..a6ba6bb
--- /dev/null
@@ -0,0 +1,33 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Russell King <rmk+kernel@armlinux.org.uk>
+Date: Wed, 20 Dec 2017 23:21:28 +0000
+Subject: phylink: ensure the PHY interface mode is appropriately set
+
+From: Russell King <rmk+kernel@armlinux.org.uk>
+
+
+[ Upstream commit 182088aa3c6c7f7c20a2c1dcc9ded4a3fc631f38 ]
+
+When setting the ethtool settings, ensure that the validated PHY
+interface mode is propagated to the current link settings, so that
+2500BaseX can be selected.
+
+Fixes: 9525ae83959b ("phylink: add phylink infrastructure")
+Signed-off-by: Russell King <rmk+kernel@armlinux.org.uk>
+Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/phy/phylink.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/net/phy/phylink.c
++++ b/drivers/net/phy/phylink.c
+@@ -948,6 +948,7 @@ int phylink_ethtool_ksettings_set(struct
+       mutex_lock(&pl->state_mutex);
+       /* Configure the MAC to match the new settings */
+       linkmode_copy(pl->link_config.advertising, our_kset.link_modes.advertising);
++      pl->link_config.interface = config.interface;
+       pl->link_config.speed = our_kset.base.speed;
+       pl->link_config.duplex = our_kset.base.duplex;
+       pl->link_config.an_enabled = our_kset.base.autoneg != AUTONEG_DISABLE;
diff --git a/queue-4.14/ptr_ring-add-barriers.patch b/queue-4.14/ptr_ring-add-barriers.patch
new file mode 100644 (file)
index 0000000..397b4ee
--- /dev/null
@@ -0,0 +1,64 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: "Michael S. Tsirkin" <mst@redhat.com>
+Date: Tue, 5 Dec 2017 21:29:37 +0200
+Subject: ptr_ring: add barriers
+
+From: "Michael S. Tsirkin" <mst@redhat.com>
+
+
+[ Upstream commit a8ceb5dbfde1092b466936bca0ff3be127ecf38e ]
+
+Users of ptr_ring expect that it's safe to give the
+data structure a pointer and have it be available
+to consumers, but that actually requires an smp_wmb
+or a stronger barrier.
+
+In absence of such barriers and on architectures that reorder writes,
+consumer might read an uninitialized value from an skb pointer stored
+in the skb array.  This was observed causing crashes.
+
+To fix, add memory barriers.  The barrier we use is a wmb, the
+assumption being that producers do not need to read the value so we do
+not need to order these reads.
+
+Reported-by: George Cherian <george.cherian@cavium.com>
+Suggested-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: Michael S. Tsirkin <mst@redhat.com>
+Acked-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/ptr_ring.h |    9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/include/linux/ptr_ring.h
++++ b/include/linux/ptr_ring.h
+@@ -101,12 +101,18 @@ static inline bool ptr_ring_full_bh(stru
+ /* Note: callers invoking this in a loop must use a compiler barrier,
+  * for example cpu_relax(). Callers must hold producer_lock.
++ * Callers are responsible for making sure pointer that is being queued
++ * points to a valid data.
+  */
+ static inline int __ptr_ring_produce(struct ptr_ring *r, void *ptr)
+ {
+       if (unlikely(!r->size) || r->queue[r->producer])
+               return -ENOSPC;
++      /* Make sure the pointer we are storing points to a valid data. */
++      /* Pairs with smp_read_barrier_depends in __ptr_ring_consume. */
++      smp_wmb();
++
+       r->queue[r->producer++] = ptr;
+       if (unlikely(r->producer >= r->size))
+               r->producer = 0;
+@@ -275,6 +281,9 @@ static inline void *__ptr_ring_consume(s
+       if (ptr)
+               __ptr_ring_discard_one(r);
++      /* Make sure anyone accessing data through the pointer is up to date. */
++      /* Pairs with smp_wmb in __ptr_ring_produce. */
++      smp_read_barrier_depends();
+       return ptr;
+ }
diff --git a/queue-4.14/rds-check-cmsg_len-before-dereferencing-cmsg_data.patch b/queue-4.14/rds-check-cmsg_len-before-dereferencing-cmsg_data.patch
new file mode 100644 (file)
index 0000000..5e1518c
--- /dev/null
@@ -0,0 +1,72 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Avinash Repaka <avinash.repaka@oracle.com>
+Date: Thu, 21 Dec 2017 20:17:04 -0800
+Subject: RDS: Check cmsg_len before dereferencing CMSG_DATA
+
+From: Avinash Repaka <avinash.repaka@oracle.com>
+
+
+[ Upstream commit 14e138a86f6347c6199f610576d2e11c03bec5f0 ]
+
+RDS currently doesn't check if the length of the control message is
+large enough to hold the required data, before dereferencing the control
+message data. This results in following crash:
+
+BUG: KASAN: stack-out-of-bounds in rds_rdma_bytes net/rds/send.c:1013
+[inline]
+BUG: KASAN: stack-out-of-bounds in rds_sendmsg+0x1f02/0x1f90
+net/rds/send.c:1066
+Read of size 8 at addr ffff8801c928fb70 by task syzkaller455006/3157
+
+CPU: 0 PID: 3157 Comm: syzkaller455006 Not tainted 4.15.0-rc3+ #161
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
+Google 01/01/2011
+Call Trace:
+ __dump_stack lib/dump_stack.c:17 [inline]
+ dump_stack+0x194/0x257 lib/dump_stack.c:53
+ print_address_description+0x73/0x250 mm/kasan/report.c:252
+ kasan_report_error mm/kasan/report.c:351 [inline]
+ kasan_report+0x25b/0x340 mm/kasan/report.c:409
+ __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430
+ rds_rdma_bytes net/rds/send.c:1013 [inline]
+ rds_sendmsg+0x1f02/0x1f90 net/rds/send.c:1066
+ sock_sendmsg_nosec net/socket.c:628 [inline]
+ sock_sendmsg+0xca/0x110 net/socket.c:638
+ ___sys_sendmsg+0x320/0x8b0 net/socket.c:2018
+ __sys_sendmmsg+0x1ee/0x620 net/socket.c:2108
+ SYSC_sendmmsg net/socket.c:2139 [inline]
+ SyS_sendmmsg+0x35/0x60 net/socket.c:2134
+ entry_SYSCALL_64_fastpath+0x1f/0x96
+RIP: 0033:0x43fe49
+RSP: 002b:00007fffbe244ad8 EFLAGS: 00000217 ORIG_RAX: 0000000000000133
+RAX: ffffffffffffffda RBX: 00000000004002c8 RCX: 000000000043fe49
+RDX: 0000000000000001 RSI: 000000002020c000 RDI: 0000000000000003
+RBP: 00000000006ca018 R08: 0000000000000000 R09: 0000000000000000
+R10: 0000000000000000 R11: 0000000000000217 R12: 00000000004017b0
+R13: 0000000000401840 R14: 0000000000000000 R15: 0000000000000000
+
+To fix this, we verify that the cmsg_len is large enough to hold the
+data to be read, before proceeding further.
+
+Reported-by: syzbot <syzkaller-bugs@googlegroups.com>
+Signed-off-by: Avinash Repaka <avinash.repaka@oracle.com>
+Acked-by: Santosh Shilimkar <santosh.shilimkar@oracle.com>
+Reviewed-by: Yuval Shaia <yuval.shaia@oracle.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/rds/send.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/net/rds/send.c
++++ b/net/rds/send.c
+@@ -1009,6 +1009,9 @@ static int rds_rdma_bytes(struct msghdr
+                       continue;
+               if (cmsg->cmsg_type == RDS_CMSG_RDMA_ARGS) {
++                      if (cmsg->cmsg_len <
++                          CMSG_LEN(sizeof(struct rds_rdma_args)))
++                              return -EINVAL;
+                       args = CMSG_DATA(cmsg);
+                       *rdma_bytes += args->remote_vec.bytes;
+               }
diff --git a/queue-4.14/revert-mlx5-move-affinity-hints-assignments-to-generic-code.patch b/queue-4.14/revert-mlx5-move-affinity-hints-assignments-to-generic-code.patch
new file mode 100644 (file)
index 0000000..52f0b03
--- /dev/null
@@ -0,0 +1,338 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Saeed Mahameed <saeedm@mellanox.com>
+Date: Fri, 10 Nov 2017 15:59:52 +0900
+Subject: Revert "mlx5: move affinity hints assignments to generic code"
+
+From: Saeed Mahameed <saeedm@mellanox.com>
+
+
+[ Upstream commit 231243c82793428467524227ae02ca451e6a98e7 ]
+
+Before the offending commit, mlx5 core did the IRQ affinity itself,
+and it seems that the new generic code has some drawbacks, and one
+of them is the lack of user ability to modify irq affinity after
+the initial affinity values got assigned.
+
+The issue is still being discussed and a solution in the new generic code
+is required, until then we need to revert this patch.
+
+This fixes the following issue:
+echo <new affinity> > /proc/irq/<x>/smp_affinity
+fails with  -EIO
+
+This reverts commit a435393acafbf0ecff4deb3e3cb554b34f0d0664.
+Note: kept mlx5_get_vector_affinity in include/linux/mlx5/driver.h since
+it is used in mlx5_ib driver.
+
+Fixes: a435393acafb ("mlx5: move affinity hints assignments to generic code")
+Cc: Sagi Grimberg <sagi@grimberg.me>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Jes Sorensen <jsorensen@fb.com>
+Reported-by: Jes Sorensen <jsorensen@fb.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en.h      |    1 
+ drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   45 ++++++-------
+ drivers/net/ethernet/mellanox/mlx5/core/main.c    |   75 ++++++++++++++++++++--
+ include/linux/mlx5/driver.h                       |    1 
+ 4 files changed, 93 insertions(+), 29 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
+@@ -590,6 +590,7 @@ struct mlx5e_channel {
+       struct mlx5_core_dev      *mdev;
+       struct mlx5e_tstamp       *tstamp;
+       int                        ix;
++      int                        cpu;
+ };
+ struct mlx5e_channels {
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+@@ -71,11 +71,6 @@ struct mlx5e_channel_param {
+       struct mlx5e_cq_param      icosq_cq;
+ };
+-static int mlx5e_get_node(struct mlx5e_priv *priv, int ix)
+-{
+-      return pci_irq_get_node(priv->mdev->pdev, MLX5_EQ_VEC_COMP_BASE + ix);
+-}
+-
+ static bool mlx5e_check_fragmented_striding_rq_cap(struct mlx5_core_dev *mdev)
+ {
+       return MLX5_CAP_GEN(mdev, striding_rq) &&
+@@ -452,17 +447,16 @@ static int mlx5e_rq_alloc_mpwqe_info(str
+       int wq_sz = mlx5_wq_ll_get_size(&rq->wq);
+       int mtt_sz = mlx5e_get_wqe_mtt_sz();
+       int mtt_alloc = mtt_sz + MLX5_UMR_ALIGN - 1;
+-      int node = mlx5e_get_node(c->priv, c->ix);
+       int i;
+       rq->mpwqe.info = kzalloc_node(wq_sz * sizeof(*rq->mpwqe.info),
+-                                      GFP_KERNEL, node);
++                                    GFP_KERNEL, cpu_to_node(c->cpu));
+       if (!rq->mpwqe.info)
+               goto err_out;
+       /* We allocate more than mtt_sz as we will align the pointer */
+-      rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz,
+-                                      GFP_KERNEL, node);
++      rq->mpwqe.mtt_no_align = kzalloc_node(mtt_alloc * wq_sz, GFP_KERNEL,
++                                      cpu_to_node(c->cpu));
+       if (unlikely(!rq->mpwqe.mtt_no_align))
+               goto err_free_wqe_info;
+@@ -570,7 +564,7 @@ static int mlx5e_alloc_rq(struct mlx5e_c
+       int err;
+       int i;
+-      rqp->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
++      rqp->wq.db_numa_node = cpu_to_node(c->cpu);
+       err = mlx5_wq_ll_create(mdev, &rqp->wq, rqc_wq, &rq->wq,
+                               &rq->wq_ctrl);
+@@ -636,8 +630,7 @@ static int mlx5e_alloc_rq(struct mlx5e_c
+       default: /* MLX5_WQ_TYPE_LINKED_LIST */
+               rq->wqe.frag_info =
+                       kzalloc_node(wq_sz * sizeof(*rq->wqe.frag_info),
+-                                   GFP_KERNEL,
+-                                   mlx5e_get_node(c->priv, c->ix));
++                                   GFP_KERNEL, cpu_to_node(c->cpu));
+               if (!rq->wqe.frag_info) {
+                       err = -ENOMEM;
+                       goto err_rq_wq_destroy;
+@@ -1007,13 +1000,13 @@ static int mlx5e_alloc_xdpsq(struct mlx5
+       sq->uar_map   = mdev->mlx5e_res.bfreg.map;
+       sq->min_inline_mode = params->tx_min_inline_mode;
+-      param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
++      param->wq.db_numa_node = cpu_to_node(c->cpu);
+       err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
+       if (err)
+               return err;
+       sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
+-      err = mlx5e_alloc_xdpsq_db(sq, mlx5e_get_node(c->priv, c->ix));
++      err = mlx5e_alloc_xdpsq_db(sq, cpu_to_node(c->cpu));
+       if (err)
+               goto err_sq_wq_destroy;
+@@ -1060,13 +1053,13 @@ static int mlx5e_alloc_icosq(struct mlx5
+       sq->channel   = c;
+       sq->uar_map   = mdev->mlx5e_res.bfreg.map;
+-      param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
++      param->wq.db_numa_node = cpu_to_node(c->cpu);
+       err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
+       if (err)
+               return err;
+       sq->wq.db = &sq->wq.db[MLX5_SND_DBR];
+-      err = mlx5e_alloc_icosq_db(sq, mlx5e_get_node(c->priv, c->ix));
++      err = mlx5e_alloc_icosq_db(sq, cpu_to_node(c->cpu));
+       if (err)
+               goto err_sq_wq_destroy;
+@@ -1132,13 +1125,13 @@ static int mlx5e_alloc_txqsq(struct mlx5
+       if (MLX5_IPSEC_DEV(c->priv->mdev))
+               set_bit(MLX5E_SQ_STATE_IPSEC, &sq->state);
+-      param->wq.db_numa_node = mlx5e_get_node(c->priv, c->ix);
++      param->wq.db_numa_node = cpu_to_node(c->cpu);
+       err = mlx5_wq_cyc_create(mdev, &param->wq, sqc_wq, &sq->wq, &sq->wq_ctrl);
+       if (err)
+               return err;
+       sq->wq.db    = &sq->wq.db[MLX5_SND_DBR];
+-      err = mlx5e_alloc_txqsq_db(sq, mlx5e_get_node(c->priv, c->ix));
++      err = mlx5e_alloc_txqsq_db(sq, cpu_to_node(c->cpu));
+       if (err)
+               goto err_sq_wq_destroy;
+@@ -1510,8 +1503,8 @@ static int mlx5e_alloc_cq(struct mlx5e_c
+       struct mlx5_core_dev *mdev = c->priv->mdev;
+       int err;
+-      param->wq.buf_numa_node = mlx5e_get_node(c->priv, c->ix);
+-      param->wq.db_numa_node  = mlx5e_get_node(c->priv, c->ix);
++      param->wq.buf_numa_node = cpu_to_node(c->cpu);
++      param->wq.db_numa_node  = cpu_to_node(c->cpu);
+       param->eq_ix   = c->ix;
+       err = mlx5e_alloc_cq_common(mdev, param, cq);
+@@ -1610,6 +1603,11 @@ static void mlx5e_close_cq(struct mlx5e_
+       mlx5e_free_cq(cq);
+ }
++static int mlx5e_get_cpu(struct mlx5e_priv *priv, int ix)
++{
++      return cpumask_first(priv->mdev->priv.irq_info[ix].mask);
++}
++
+ static int mlx5e_open_tx_cqs(struct mlx5e_channel *c,
+                            struct mlx5e_params *params,
+                            struct mlx5e_channel_param *cparam)
+@@ -1758,12 +1756,13 @@ static int mlx5e_open_channel(struct mlx
+ {
+       struct mlx5e_cq_moder icocq_moder = {0, 0};
+       struct net_device *netdev = priv->netdev;
++      int cpu = mlx5e_get_cpu(priv, ix);
+       struct mlx5e_channel *c;
+       unsigned int irq;
+       int err;
+       int eqn;
+-      c = kzalloc_node(sizeof(*c), GFP_KERNEL, mlx5e_get_node(priv, ix));
++      c = kzalloc_node(sizeof(*c), GFP_KERNEL, cpu_to_node(cpu));
+       if (!c)
+               return -ENOMEM;
+@@ -1771,6 +1770,7 @@ static int mlx5e_open_channel(struct mlx
+       c->mdev     = priv->mdev;
+       c->tstamp   = &priv->tstamp;
+       c->ix       = ix;
++      c->cpu      = cpu;
+       c->pdev     = &priv->mdev->pdev->dev;
+       c->netdev   = priv->netdev;
+       c->mkey_be  = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key);
+@@ -1859,8 +1859,7 @@ static void mlx5e_activate_channel(struc
+       for (tc = 0; tc < c->num_tc; tc++)
+               mlx5e_activate_txqsq(&c->sq[tc]);
+       mlx5e_activate_rq(&c->rq);
+-      netif_set_xps_queue(c->netdev,
+-              mlx5_get_vector_affinity(c->priv->mdev, c->ix), c->ix);
++      netif_set_xps_queue(c->netdev, get_cpu_mask(c->cpu), c->ix);
+ }
+ static void mlx5e_deactivate_channel(struct mlx5e_channel *c)
+--- a/drivers/net/ethernet/mellanox/mlx5/core/main.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c
+@@ -316,9 +316,6 @@ static int mlx5_alloc_irq_vectors(struct
+ {
+       struct mlx5_priv *priv = &dev->priv;
+       struct mlx5_eq_table *table = &priv->eq_table;
+-      struct irq_affinity irqdesc = {
+-              .pre_vectors = MLX5_EQ_VEC_COMP_BASE,
+-      };
+       int num_eqs = 1 << MLX5_CAP_GEN(dev, log_max_eq);
+       int nvec;
+@@ -332,10 +329,9 @@ static int mlx5_alloc_irq_vectors(struct
+       if (!priv->irq_info)
+               goto err_free_msix;
+-      nvec = pci_alloc_irq_vectors_affinity(dev->pdev,
++      nvec = pci_alloc_irq_vectors(dev->pdev,
+                       MLX5_EQ_VEC_COMP_BASE + 1, nvec,
+-                      PCI_IRQ_MSIX | PCI_IRQ_AFFINITY,
+-                      &irqdesc);
++                      PCI_IRQ_MSIX);
+       if (nvec < 0)
+               return nvec;
+@@ -621,6 +617,63 @@ u64 mlx5_read_internal_timer(struct mlx5
+       return (u64)timer_l | (u64)timer_h1 << 32;
+ }
++static int mlx5_irq_set_affinity_hint(struct mlx5_core_dev *mdev, int i)
++{
++      struct mlx5_priv *priv  = &mdev->priv;
++      int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
++
++      if (!zalloc_cpumask_var(&priv->irq_info[i].mask, GFP_KERNEL)) {
++              mlx5_core_warn(mdev, "zalloc_cpumask_var failed");
++              return -ENOMEM;
++      }
++
++      cpumask_set_cpu(cpumask_local_spread(i, priv->numa_node),
++                      priv->irq_info[i].mask);
++
++      if (IS_ENABLED(CONFIG_SMP) &&
++          irq_set_affinity_hint(irq, priv->irq_info[i].mask))
++              mlx5_core_warn(mdev, "irq_set_affinity_hint failed, irq 0x%.4x", irq);
++
++      return 0;
++}
++
++static void mlx5_irq_clear_affinity_hint(struct mlx5_core_dev *mdev, int i)
++{
++      struct mlx5_priv *priv  = &mdev->priv;
++      int irq = pci_irq_vector(mdev->pdev, MLX5_EQ_VEC_COMP_BASE + i);
++
++      irq_set_affinity_hint(irq, NULL);
++      free_cpumask_var(priv->irq_info[i].mask);
++}
++
++static int mlx5_irq_set_affinity_hints(struct mlx5_core_dev *mdev)
++{
++      int err;
++      int i;
++
++      for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++) {
++              err = mlx5_irq_set_affinity_hint(mdev, i);
++              if (err)
++                      goto err_out;
++      }
++
++      return 0;
++
++err_out:
++      for (i--; i >= 0; i--)
++              mlx5_irq_clear_affinity_hint(mdev, i);
++
++      return err;
++}
++
++static void mlx5_irq_clear_affinity_hints(struct mlx5_core_dev *mdev)
++{
++      int i;
++
++      for (i = 0; i < mdev->priv.eq_table.num_comp_vectors; i++)
++              mlx5_irq_clear_affinity_hint(mdev, i);
++}
++
+ int mlx5_vector2eqn(struct mlx5_core_dev *dev, int vector, int *eqn,
+                   unsigned int *irqn)
+ {
+@@ -1093,6 +1146,12 @@ static int mlx5_load_one(struct mlx5_cor
+               goto err_stop_eqs;
+       }
++      err = mlx5_irq_set_affinity_hints(dev);
++      if (err) {
++              dev_err(&pdev->dev, "Failed to alloc affinity hint cpumask\n");
++              goto err_affinity_hints;
++      }
++
+       err = mlx5_init_fs(dev);
+       if (err) {
+               dev_err(&pdev->dev, "Failed to init flow steering\n");
+@@ -1150,6 +1209,9 @@ err_sriov:
+       mlx5_cleanup_fs(dev);
+ err_fs:
++      mlx5_irq_clear_affinity_hints(dev);
++
++err_affinity_hints:
+       free_comp_eqs(dev);
+ err_stop_eqs:
+@@ -1218,6 +1280,7 @@ static int mlx5_unload_one(struct mlx5_c
+       mlx5_sriov_detach(dev);
+       mlx5_cleanup_fs(dev);
++      mlx5_irq_clear_affinity_hints(dev);
+       free_comp_eqs(dev);
+       mlx5_stop_eqs(dev);
+       mlx5_put_uars_page(dev, priv->uar);
+--- a/include/linux/mlx5/driver.h
++++ b/include/linux/mlx5/driver.h
+@@ -546,6 +546,7 @@ struct mlx5_core_sriov {
+ };
+ struct mlx5_irq_info {
++      cpumask_var_t mask;
+       char name[MLX5_MAX_IRQ_NAME];
+ };
diff --git a/queue-4.14/s390-qeth-apply-takeover-changes-when-mode-is-toggled.patch b/queue-4.14/s390-qeth-apply-takeover-changes-when-mode-is-toggled.patch
new file mode 100644 (file)
index 0000000..8999504
--- /dev/null
@@ -0,0 +1,98 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Date: Wed, 13 Dec 2017 18:56:29 +0100
+Subject: s390/qeth: apply takeover changes when mode is toggled
+
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+
+
+[ Upstream commit 7fbd9493f0eeae8cef58300505a9ef5c8fce6313 ]
+
+Just as for an explicit enable/disable, toggling the takeover mode also
+requires that the IP addresses get updated. Otherwise all IPs that were
+added to the table before the mode-toggle, get registered with the old
+settings.
+
+Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/net/qeth_core.h      |    2 +-
+ drivers/s390/net/qeth_core_main.c |    2 +-
+ drivers/s390/net/qeth_l3_sys.c    |   35 +++++++++++++++++------------------
+ 3 files changed, 19 insertions(+), 20 deletions(-)
+
+--- a/drivers/s390/net/qeth_core.h
++++ b/drivers/s390/net/qeth_core.h
+@@ -564,7 +564,7 @@ enum qeth_cq {
+ };
+ struct qeth_ipato {
+-      int enabled;
++      bool enabled;
+       int invert4;
+       int invert6;
+       struct list_head entries;
+--- a/drivers/s390/net/qeth_core_main.c
++++ b/drivers/s390/net/qeth_core_main.c
+@@ -1479,7 +1479,7 @@ static int qeth_setup_card(struct qeth_c
+       qeth_set_intial_options(card);
+       /* IP address takeover */
+       INIT_LIST_HEAD(&card->ipato.entries);
+-      card->ipato.enabled = 0;
++      card->ipato.enabled = false;
+       card->ipato.invert4 = 0;
+       card->ipato.invert6 = 0;
+       /* init QDIO stuff */
+--- a/drivers/s390/net/qeth_l3_sys.c
++++ b/drivers/s390/net/qeth_l3_sys.c
+@@ -372,6 +372,7 @@ static ssize_t qeth_l3_dev_ipato_enable_
+       struct qeth_card *card = dev_get_drvdata(dev);
+       struct qeth_ipaddr *addr;
+       int i, rc = 0;
++      bool enable;
+       if (!card)
+               return -EINVAL;
+@@ -384,25 +385,23 @@ static ssize_t qeth_l3_dev_ipato_enable_
+       }
+       if (sysfs_streq(buf, "toggle")) {
+-              card->ipato.enabled = (card->ipato.enabled)? 0 : 1;
+-      } else if (sysfs_streq(buf, "1")) {
+-              card->ipato.enabled = 1;
+-              hash_for_each(card->ip_htable, i, addr, hnode) {
+-                              if ((addr->type == QETH_IP_TYPE_NORMAL) &&
+-                              qeth_l3_is_addr_covered_by_ipato(card, addr))
+-                                      addr->set_flags |=
+-                                      QETH_IPA_SETIP_TAKEOVER_FLAG;
+-                      }
+-      } else if (sysfs_streq(buf, "0")) {
+-              card->ipato.enabled = 0;
+-              hash_for_each(card->ip_htable, i, addr, hnode) {
+-                      if (addr->set_flags &
+-                      QETH_IPA_SETIP_TAKEOVER_FLAG)
+-                              addr->set_flags &=
+-                              ~QETH_IPA_SETIP_TAKEOVER_FLAG;
+-                      }
+-      } else
++              enable = !card->ipato.enabled;
++      } else if (kstrtobool(buf, &enable)) {
+               rc = -EINVAL;
++              goto out;
++      }
++
++      if (card->ipato.enabled == enable)
++              goto out;
++      card->ipato.enabled = enable;
++
++      hash_for_each(card->ip_htable, i, addr, hnode) {
++              if (!enable)
++                      addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG;
++              else if (addr->type == QETH_IP_TYPE_NORMAL &&
++                       qeth_l3_is_addr_covered_by_ipato(card, addr))
++                      addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
++      }
+ out:
+       mutex_unlock(&card->conf_mutex);
+       return rc ? rc : count;
diff --git a/queue-4.14/s390-qeth-don-t-apply-takeover-changes-to-rxip.patch b/queue-4.14/s390-qeth-don-t-apply-takeover-changes-to-rxip.patch
new file mode 100644 (file)
index 0000000..22f59b4
--- /dev/null
@@ -0,0 +1,61 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Date: Wed, 13 Dec 2017 18:56:30 +0100
+Subject: s390/qeth: don't apply takeover changes to RXIP
+
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+
+
+[ Upstream commit b22d73d6689fd902a66c08ebe71ab2f3b351e22f ]
+
+When takeover is switched off, current code clears the 'TAKEOVER' flag on
+all IPs. But the flag is also used for RXIP addresses, and those should
+not be affected by the takeover mode.
+Fix the behaviour by consistently applying takeover logic to NORMAL
+addresses only.
+
+Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/net/qeth_l3_main.c |    5 +++--
+ drivers/s390/net/qeth_l3_sys.c  |    5 +++--
+ 2 files changed, 6 insertions(+), 4 deletions(-)
+
+--- a/drivers/s390/net/qeth_l3_main.c
++++ b/drivers/s390/net/qeth_l3_main.c
+@@ -173,6 +173,8 @@ int qeth_l3_is_addr_covered_by_ipato(str
+       if (!card->ipato.enabled)
+               return 0;
++      if (addr->type != QETH_IP_TYPE_NORMAL)
++              return 0;
+       qeth_l3_convert_addr_to_bits((u8 *) &addr->u, addr_bits,
+                                 (addr->proto == QETH_PROT_IPV4)? 4:16);
+@@ -289,8 +291,7 @@ int qeth_l3_add_ip(struct qeth_card *car
+               memcpy(addr, tmp_addr, sizeof(struct qeth_ipaddr));
+               addr->ref_counter = 1;
+-              if (addr->type == QETH_IP_TYPE_NORMAL  &&
+-                              qeth_l3_is_addr_covered_by_ipato(card, addr)) {
++              if (qeth_l3_is_addr_covered_by_ipato(card, addr)) {
+                       QETH_CARD_TEXT(card, 2, "tkovaddr");
+                       addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
+               }
+--- a/drivers/s390/net/qeth_l3_sys.c
++++ b/drivers/s390/net/qeth_l3_sys.c
+@@ -396,10 +396,11 @@ static ssize_t qeth_l3_dev_ipato_enable_
+       card->ipato.enabled = enable;
+       hash_for_each(card->ip_htable, i, addr, hnode) {
++              if (addr->type != QETH_IP_TYPE_NORMAL)
++                      continue;
+               if (!enable)
+                       addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG;
+-              else if (addr->type == QETH_IP_TYPE_NORMAL &&
+-                       qeth_l3_is_addr_covered_by_ipato(card, addr))
++              else if (qeth_l3_is_addr_covered_by_ipato(card, addr))
+                       addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
+       }
+ out:
diff --git a/queue-4.14/s390-qeth-fix-error-handling-in-checksum-cmd-callback.patch b/queue-4.14/s390-qeth-fix-error-handling-in-checksum-cmd-callback.patch
new file mode 100644 (file)
index 0000000..50db1d5
--- /dev/null
@@ -0,0 +1,46 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Date: Wed, 20 Dec 2017 18:07:18 +0100
+Subject: s390/qeth: fix error handling in checksum cmd callback
+
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+
+
+[ Upstream commit ad3cbf61332914711e5f506972b1dc9af8d62146 ]
+
+Make sure to check both return code fields before processing the
+response. Otherwise we risk operating on invalid data.
+
+Fixes: c9475369bd2b ("s390/qeth: rework RX/TX checksum offload")
+Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/net/qeth_core_main.c |    9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/drivers/s390/net/qeth_core_main.c
++++ b/drivers/s390/net/qeth_core_main.c
+@@ -5445,6 +5445,13 @@ out:
+ }
+ EXPORT_SYMBOL_GPL(qeth_poll);
++static int qeth_setassparms_inspect_rc(struct qeth_ipa_cmd *cmd)
++{
++      if (!cmd->hdr.return_code)
++              cmd->hdr.return_code = cmd->data.setassparms.hdr.return_code;
++      return cmd->hdr.return_code;
++}
++
+ int qeth_setassparms_cb(struct qeth_card *card,
+                       struct qeth_reply *reply, unsigned long data)
+ {
+@@ -6304,7 +6311,7 @@ static int qeth_ipa_checksum_run_cmd_cb(
+                               (struct qeth_checksum_cmd *)reply->param;
+       QETH_CARD_TEXT(card, 4, "chkdoccb");
+-      if (cmd->hdr.return_code)
++      if (qeth_setassparms_inspect_rc(cmd))
+               return 0;
+       memset(chksum_cb, 0, sizeof(*chksum_cb));
diff --git a/queue-4.14/s390-qeth-lock-ip-table-while-applying-takeover-changes.patch b/queue-4.14/s390-qeth-lock-ip-table-while-applying-takeover-changes.patch
new file mode 100644 (file)
index 0000000..b814a75
--- /dev/null
@@ -0,0 +1,39 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Date: Wed, 13 Dec 2017 18:56:31 +0100
+Subject: s390/qeth: lock IP table while applying takeover changes
+
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+
+
+[ Upstream commit 8a03a3692b100d84785ee7a834e9215e304c9e00 ]
+
+Modifying the flags of an IP addr object needs to be protected against
+e.g. concurrent removal of the same object from the IP table.
+
+Fixes: 5f78e29ceebf ("qeth: optimize IP handling in rx_mode callback")
+Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/net/qeth_l3_sys.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/s390/net/qeth_l3_sys.c
++++ b/drivers/s390/net/qeth_l3_sys.c
+@@ -395,6 +395,7 @@ static ssize_t qeth_l3_dev_ipato_enable_
+               goto out;
+       card->ipato.enabled = enable;
++      spin_lock_bh(&card->ip_lock);
+       hash_for_each(card->ip_htable, i, addr, hnode) {
+               if (addr->type != QETH_IP_TYPE_NORMAL)
+                       continue;
+@@ -403,6 +404,7 @@ static ssize_t qeth_l3_dev_ipato_enable_
+               else if (qeth_l3_is_addr_covered_by_ipato(card, addr))
+                       addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
+       }
++      spin_unlock_bh(&card->ip_lock);
+ out:
+       mutex_unlock(&card->conf_mutex);
+       return rc ? rc : count;
diff --git a/queue-4.14/s390-qeth-update-takeover-ips-after-configuration-change.patch b/queue-4.14/s390-qeth-update-takeover-ips-after-configuration-change.patch
new file mode 100644 (file)
index 0000000..29ee5b5
--- /dev/null
@@ -0,0 +1,240 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Date: Wed, 13 Dec 2017 18:56:32 +0100
+Subject: s390/qeth: update takeover IPs after configuration change
+
+From: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+
+
+[ Upstream commit 02f510f326501470348a5df341e8232c3497bbbb ]
+
+Any modification to the takeover IP-ranges requires that we re-evaluate
+which IP addresses are takeover-eligible. Otherwise we might do takeover
+for some addresses when we no longer should, or vice-versa.
+
+Signed-off-by: Julian Wiedmann <jwi@linux.vnet.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/s390/net/qeth_core.h      |    4 +-
+ drivers/s390/net/qeth_core_main.c |    4 +-
+ drivers/s390/net/qeth_l3.h        |    2 -
+ drivers/s390/net/qeth_l3_main.c   |   31 ++++++++++++++++--
+ drivers/s390/net/qeth_l3_sys.c    |   63 ++++++++++++++++++++------------------
+ 5 files changed, 67 insertions(+), 37 deletions(-)
+
+--- a/drivers/s390/net/qeth_core.h
++++ b/drivers/s390/net/qeth_core.h
+@@ -565,8 +565,8 @@ enum qeth_cq {
+ struct qeth_ipato {
+       bool enabled;
+-      int invert4;
+-      int invert6;
++      bool invert4;
++      bool invert6;
+       struct list_head entries;
+ };
+--- a/drivers/s390/net/qeth_core_main.c
++++ b/drivers/s390/net/qeth_core_main.c
+@@ -1480,8 +1480,8 @@ static int qeth_setup_card(struct qeth_c
+       /* IP address takeover */
+       INIT_LIST_HEAD(&card->ipato.entries);
+       card->ipato.enabled = false;
+-      card->ipato.invert4 = 0;
+-      card->ipato.invert6 = 0;
++      card->ipato.invert4 = false;
++      card->ipato.invert6 = false;
+       /* init QDIO stuff */
+       qeth_init_qdio_info(card);
+       INIT_DELAYED_WORK(&card->buffer_reclaim_work, qeth_buffer_reclaim_work);
+--- a/drivers/s390/net/qeth_l3.h
++++ b/drivers/s390/net/qeth_l3.h
+@@ -82,7 +82,7 @@ void qeth_l3_del_vipa(struct qeth_card *
+ int qeth_l3_add_rxip(struct qeth_card *, enum qeth_prot_versions, const u8 *);
+ void qeth_l3_del_rxip(struct qeth_card *card, enum qeth_prot_versions,
+                       const u8 *);
+-int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *, struct qeth_ipaddr *);
++void qeth_l3_update_ipato(struct qeth_card *card);
+ struct qeth_ipaddr *qeth_l3_get_addr_buffer(enum qeth_prot_versions);
+ int qeth_l3_add_ip(struct qeth_card *, struct qeth_ipaddr *);
+ int qeth_l3_delete_ip(struct qeth_card *, struct qeth_ipaddr *);
+--- a/drivers/s390/net/qeth_l3_main.c
++++ b/drivers/s390/net/qeth_l3_main.c
+@@ -163,8 +163,8 @@ static void qeth_l3_convert_addr_to_bits
+       }
+ }
+-int qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card,
+-                                              struct qeth_ipaddr *addr)
++static bool qeth_l3_is_addr_covered_by_ipato(struct qeth_card *card,
++                                           struct qeth_ipaddr *addr)
+ {
+       struct qeth_ipato_entry *ipatoe;
+       u8 addr_bits[128] = {0, };
+@@ -605,6 +605,27 @@ int qeth_l3_setrouting_v6(struct qeth_ca
+ /*
+  * IP address takeover related functions
+  */
++
++/**
++ * qeth_l3_update_ipato() - Update 'takeover' property, for all NORMAL IPs.
++ *
++ * Caller must hold ip_lock.
++ */
++void qeth_l3_update_ipato(struct qeth_card *card)
++{
++      struct qeth_ipaddr *addr;
++      unsigned int i;
++
++      hash_for_each(card->ip_htable, i, addr, hnode) {
++              if (addr->type != QETH_IP_TYPE_NORMAL)
++                      continue;
++              if (qeth_l3_is_addr_covered_by_ipato(card, addr))
++                      addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
++              else
++                      addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG;
++      }
++}
++
+ static void qeth_l3_clear_ipato_list(struct qeth_card *card)
+ {
+       struct qeth_ipato_entry *ipatoe, *tmp;
+@@ -616,6 +637,7 @@ static void qeth_l3_clear_ipato_list(str
+               kfree(ipatoe);
+       }
++      qeth_l3_update_ipato(card);
+       spin_unlock_bh(&card->ip_lock);
+ }
+@@ -640,8 +662,10 @@ int qeth_l3_add_ipato_entry(struct qeth_
+               }
+       }
+-      if (!rc)
++      if (!rc) {
+               list_add_tail(&new->entry, &card->ipato.entries);
++              qeth_l3_update_ipato(card);
++      }
+       spin_unlock_bh(&card->ip_lock);
+@@ -664,6 +688,7 @@ void qeth_l3_del_ipato_entry(struct qeth
+                           (proto == QETH_PROT_IPV4)? 4:16) &&
+                   (ipatoe->mask_bits == mask_bits)) {
+                       list_del(&ipatoe->entry);
++                      qeth_l3_update_ipato(card);
+                       kfree(ipatoe);
+               }
+       }
+--- a/drivers/s390/net/qeth_l3_sys.c
++++ b/drivers/s390/net/qeth_l3_sys.c
+@@ -370,9 +370,8 @@ static ssize_t qeth_l3_dev_ipato_enable_
+               struct device_attribute *attr, const char *buf, size_t count)
+ {
+       struct qeth_card *card = dev_get_drvdata(dev);
+-      struct qeth_ipaddr *addr;
+-      int i, rc = 0;
+       bool enable;
++      int rc = 0;
+       if (!card)
+               return -EINVAL;
+@@ -391,20 +390,12 @@ static ssize_t qeth_l3_dev_ipato_enable_
+               goto out;
+       }
+-      if (card->ipato.enabled == enable)
+-              goto out;
+-      card->ipato.enabled = enable;
+-
+-      spin_lock_bh(&card->ip_lock);
+-      hash_for_each(card->ip_htable, i, addr, hnode) {
+-              if (addr->type != QETH_IP_TYPE_NORMAL)
+-                      continue;
+-              if (!enable)
+-                      addr->set_flags &= ~QETH_IPA_SETIP_TAKEOVER_FLAG;
+-              else if (qeth_l3_is_addr_covered_by_ipato(card, addr))
+-                      addr->set_flags |= QETH_IPA_SETIP_TAKEOVER_FLAG;
++      if (card->ipato.enabled != enable) {
++              card->ipato.enabled = enable;
++              spin_lock_bh(&card->ip_lock);
++              qeth_l3_update_ipato(card);
++              spin_unlock_bh(&card->ip_lock);
+       }
+-      spin_unlock_bh(&card->ip_lock);
+ out:
+       mutex_unlock(&card->conf_mutex);
+       return rc ? rc : count;
+@@ -430,20 +421,27 @@ static ssize_t qeth_l3_dev_ipato_invert4
+                               const char *buf, size_t count)
+ {
+       struct qeth_card *card = dev_get_drvdata(dev);
++      bool invert;
+       int rc = 0;
+       if (!card)
+               return -EINVAL;
+       mutex_lock(&card->conf_mutex);
+-      if (sysfs_streq(buf, "toggle"))
+-              card->ipato.invert4 = (card->ipato.invert4)? 0 : 1;
+-      else if (sysfs_streq(buf, "1"))
+-              card->ipato.invert4 = 1;
+-      else if (sysfs_streq(buf, "0"))
+-              card->ipato.invert4 = 0;
+-      else
++      if (sysfs_streq(buf, "toggle")) {
++              invert = !card->ipato.invert4;
++      } else if (kstrtobool(buf, &invert)) {
+               rc = -EINVAL;
++              goto out;
++      }
++
++      if (card->ipato.invert4 != invert) {
++              card->ipato.invert4 = invert;
++              spin_lock_bh(&card->ip_lock);
++              qeth_l3_update_ipato(card);
++              spin_unlock_bh(&card->ip_lock);
++      }
++out:
+       mutex_unlock(&card->conf_mutex);
+       return rc ? rc : count;
+ }
+@@ -609,20 +607,27 @@ static ssize_t qeth_l3_dev_ipato_invert6
+               struct device_attribute *attr, const char *buf, size_t count)
+ {
+       struct qeth_card *card = dev_get_drvdata(dev);
++      bool invert;
+       int rc = 0;
+       if (!card)
+               return -EINVAL;
+       mutex_lock(&card->conf_mutex);
+-      if (sysfs_streq(buf, "toggle"))
+-              card->ipato.invert6 = (card->ipato.invert6)? 0 : 1;
+-      else if (sysfs_streq(buf, "1"))
+-              card->ipato.invert6 = 1;
+-      else if (sysfs_streq(buf, "0"))
+-              card->ipato.invert6 = 0;
+-      else
++      if (sysfs_streq(buf, "toggle")) {
++              invert = !card->ipato.invert6;
++      } else if (kstrtobool(buf, &invert)) {
+               rc = -EINVAL;
++              goto out;
++      }
++
++      if (card->ipato.invert6 != invert) {
++              card->ipato.invert6 = invert;
++              spin_lock_bh(&card->ip_lock);
++              qeth_l3_update_ipato(card);
++              spin_unlock_bh(&card->ip_lock);
++      }
++out:
+       mutex_unlock(&card->conf_mutex);
+       return rc ? rc : count;
+ }
diff --git a/queue-4.14/sctp-make-sure-stream-nums-can-match-optlen-in-sctp_setsockopt_reset_streams.patch b/queue-4.14/sctp-make-sure-stream-nums-can-match-optlen-in-sctp_setsockopt_reset_streams.patch
new file mode 100644 (file)
index 0000000..85c3cfa
--- /dev/null
@@ -0,0 +1,55 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Xin Long <lucien.xin@gmail.com>
+Date: Sun, 10 Dec 2017 15:40:51 +0800
+Subject: sctp: make sure stream nums can match optlen in sctp_setsockopt_reset_streams
+
+From: Xin Long <lucien.xin@gmail.com>
+
+
+[ Upstream commit 2342b8d95bcae5946e1b9b8d58645f37500ef2e7 ]
+
+Now in sctp_setsockopt_reset_streams, it only does the check
+optlen < sizeof(*params) for optlen. But it's not enough, as
+params->srs_number_streams should also match optlen.
+
+If the streams in params->srs_stream_list are less than stream
+nums in params->srs_number_streams, later when dereferencing
+the stream list, it could cause a slab-out-of-bounds crash, as
+reported by syzbot.
+
+This patch is to fix it by also checking the stream numbers in
+sctp_setsockopt_reset_streams to make sure at least it's not
+greater than the streams in the list.
+
+Fixes: 7f9d68ac944e ("sctp: implement sender-side procedures for SSN Reset Request Parameter")
+Reported-by: Dmitry Vyukov <dvyukov@google.com>
+Signed-off-by: Xin Long <lucien.xin@gmail.com>
+Acked-by: Marcelo Ricardo Leitner <marcelo.leitner@gmail.com>
+Acked-by: Neil Horman <nhorman@tuxdriver.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sctp/socket.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/net/sctp/socket.c
++++ b/net/sctp/socket.c
+@@ -3874,13 +3874,17 @@ static int sctp_setsockopt_reset_streams
+       struct sctp_association *asoc;
+       int retval = -EINVAL;
+-      if (optlen < sizeof(struct sctp_reset_streams))
++      if (optlen < sizeof(*params))
+               return -EINVAL;
+       params = memdup_user(optval, optlen);
+       if (IS_ERR(params))
+               return PTR_ERR(params);
++      if (params->srs_number_streams * sizeof(__u16) >
++          optlen - sizeof(*params))
++              goto out;
++
+       asoc = sctp_id2assoc(sk, params->srs_assoc_id);
+       if (!asoc)
+               goto out;
diff --git a/queue-4.14/sctp-replace-use-of-sockets_allocated-with-specified-macro.patch b/queue-4.14/sctp-replace-use-of-sockets_allocated-with-specified-macro.patch
new file mode 100644 (file)
index 0000000..ee003fd
--- /dev/null
@@ -0,0 +1,44 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
+Date: Fri, 22 Dec 2017 10:15:20 -0800
+Subject: sctp: Replace use of sockets_allocated with specified macro.
+
+From: Tonghao Zhang <xiangxia.m.yue@gmail.com>
+
+
+[ Upstream commit 8cb38a602478e9f806571f6920b0a3298aabf042 ]
+
+The patch(180d8cd942ce) replaces all uses of struct sock fields'
+memory_pressure, memory_allocated, sockets_allocated, and sysctl_mem
+to accessor macros. But the sockets_allocated field of sctp sock is
+not replaced at all. Then replace it now for unifying the code.
+
+Fixes: 180d8cd942ce ("foundations of per-cgroup memory pressure controlling.")
+Cc: Glauber Costa <glommer@parallels.com>
+Signed-off-by: Tonghao Zhang <zhangtonghao@didichuxing.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/sctp/socket.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/sctp/socket.c
++++ b/net/sctp/socket.c
+@@ -4413,7 +4413,7 @@ static int sctp_init_sock(struct sock *s
+       SCTP_DBG_OBJCNT_INC(sock);
+       local_bh_disable();
+-      percpu_counter_inc(&sctp_sockets_allocated);
++      sk_sockets_allocated_inc(sk);
+       sock_prot_inuse_add(net, sk->sk_prot, 1);
+       /* Nothing can fail after this block, otherwise
+@@ -4457,7 +4457,7 @@ static void sctp_destroy_sock(struct soc
+       }
+       sctp_endpoint_free(sp->ep);
+       local_bh_disable();
+-      percpu_counter_dec(&sctp_sockets_allocated);
++      sk_sockets_allocated_dec(sk);
+       sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
+       local_bh_enable();
+ }
index a830d40a1b7cc342a41edc817d57c3addf268ff5..70c106f12aca15a1e549d0992d4548718f61bbc4 100644 (file)
@@ -56,3 +56,61 @@ cpufreq-schedutil-use-idle_calls-counter-of-the-remote-cpu.patch
 block-fix-blk_rq_append_bio.patch
 block-don-t-let-passthrough-io-go-into-.make_request_fn.patch
 kbuild-add-fno-stack-check-to-kernel-build-options.patch
+ipv4-igmp-guard-against-silly-mtu-values.patch
+ipv6-mcast-better-catch-silly-mtu-values.patch
+net-fec-unmap-the-xmit-buffer-that-are-not-transferred-by-dma.patch
+net-igmp-use-correct-source-address-on-igmpv3-reports.patch
+netlink-add-netns-check-on-taps.patch
+net-qmi_wwan-add-sierra-em7565-1199-9091.patch
+net-reevalulate-autoflowlabel-setting-after-sysctl-setting.patch
+ptr_ring-add-barriers.patch
+rds-check-cmsg_len-before-dereferencing-cmsg_data.patch
+tcp_bbr-record-full-bw-reached-decision-in-new-full_bw_reached-bit.patch
+tcp-md5sig-use-skb-s-saddr-when-replying-to-an-incoming-segment.patch
+tg3-fix-rx-hang-on-mtu-change-with-5717-5719.patch
+tcp_bbr-reset-full-pipe-detection-on-loss-recovery-undo.patch
+tcp_bbr-reset-long-term-bandwidth-sampling-on-loss-recovery-undo.patch
+s390-qeth-apply-takeover-changes-when-mode-is-toggled.patch
+s390-qeth-don-t-apply-takeover-changes-to-rxip.patch
+s390-qeth-lock-ip-table-while-applying-takeover-changes.patch
+s390-qeth-update-takeover-ips-after-configuration-change.patch
+net-ipv4-fix-for-a-race-condition-in-raw_sendmsg.patch
+net-mvmdio-disable-unprepare-clocks-in-eprobe_defer-case.patch
+sctp-replace-use-of-sockets_allocated-with-specified-macro.patch
+adding-missing-rcu_read_unlock-in-ipxip6_rcv.patch
+ip6_gre-fix-device-features-for-ioctl-setup.patch
+ipv4-fix-use-after-free-when-flushing-fib-tables.patch
+net-bridge-fix-early-call-to-br_stp_change_bridge_id-and-plug-newlink-leaks.patch
+net-fix-double-free-and-memory-corruption-in-get_net_ns_by_id.patch
+net-phy-micrel-ksz9031-reconfigure-autoneg-after-phy-autoneg-workaround.patch
+sock-free-skb-in-skb_complete_tx_timestamp-on-error.patch
+tcp-invalidate-rate-samples-during-sack-reneging.patch
+net-mlx5-fix-rate-limit-packet-pacing-naming-and-struct.patch
+net-mlx5e-fix-possible-deadlock-of-vxlan-lock.patch
+net-mlx5e-fix-features-check-of-ipv6-traffic.patch
+net-mlx5e-add-refcount-to-vxlan-structure.patch
+net-mlx5e-prevent-possible-races-in-vxlan-control-flow.patch
+net-mlx5-fix-error-flow-in-create_qp-command.patch
+openvswitch-fix-pop_vlan-action-for-double-tagged-frames.patch
+sfc-pass-valid-pointers-from-efx_enqueue_unwind.patch
+net-dsa-bcm_sf2-clear-iddq_global_pwr-bit-for-phy.patch
+s390-qeth-fix-error-handling-in-checksum-cmd-callback.patch
+sctp-make-sure-stream-nums-can-match-optlen-in-sctp_setsockopt_reset_streams.patch
+tipc-fix-hanging-poll-for-stream-sockets.patch
+mlxsw-spectrum-disable-mac-learning-for-ovs-port.patch
+tcp-fix-potential-underestimation-on-rcv_rtt.patch
+net-phy-marvell-limit-88m1101-autoneg-errata-to-88e1145-as-well.patch
+ipv6-honor-specified-parameters-in-fibmatch-lookup.patch
+tcp-refresh-tcp_mstamp-from-timers-callbacks.patch
+net-mlx5-fpga-return-einval-if-size-is-zero.patch
+vxlan-restore-dev-mtu-setting-based-on-lower-device.patch
+net-sched-fix-static-key-imbalance-in-case-of-ingress-clsact_init-error.patch
+bnxt_en-fix-sources-of-spurious-netpoll-warnings.patch
+phylink-ensure-the-phy-interface-mode-is-appropriately-set.patch
+phylink-ensure-an-is-enabled.patch
+ipv4-fib-fix-metrics-match-when-deleting-a-route.patch
+ipv6-set-all.accept_dad-to-0-by-default.patch
+revert-mlx5-move-affinity-hints-assignments-to-generic-code.patch
+skbuff-orphan-frags-before-zerocopy-clone.patch
+skbuff-skb_copy_ubufs-must-release-uarg-even-without-user-frags.patch
+skbuff-in-skb_copy_ubufs-unclone-before-releasing-zerocopy.patch
diff --git a/queue-4.14/sfc-pass-valid-pointers-from-efx_enqueue_unwind.patch b/queue-4.14/sfc-pass-valid-pointers-from-efx_enqueue_unwind.patch
new file mode 100644 (file)
index 0000000..0738597
--- /dev/null
@@ -0,0 +1,55 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Bert Kenward <bkenward@solarflare.com>
+Date: Thu, 7 Dec 2017 17:18:58 +0000
+Subject: sfc: pass valid pointers from efx_enqueue_unwind
+
+From: Bert Kenward <bkenward@solarflare.com>
+
+
+[ Upstream commit d4a7a8893d4cdbc89d79ac4aa704bf8d4b67b368 ]
+
+The bytes_compl and pkts_compl pointers passed to efx_dequeue_buffers
+cannot be NULL. Add a paranoid warning to check this condition and fix
+the one case where they were NULL.
+
+efx_enqueue_unwind() is called very rarely, during error handling.
+Without this fix it would fail with a NULL pointer dereference in
+efx_dequeue_buffer, with efx_enqueue_skb in the call stack.
+
+Fixes: e9117e5099ea ("sfc: Firmware-Assisted TSO version 2")
+Reported-by: Jarod Wilson <jarod@redhat.com>
+Signed-off-by: Bert Kenward <bkenward@solarflare.com>
+Tested-by: Jarod Wilson <jarod@redhat.com>
+Acked-by: Jarod Wilson <jarod@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/sfc/tx.c |    5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/sfc/tx.c
++++ b/drivers/net/ethernet/sfc/tx.c
+@@ -77,6 +77,7 @@ static void efx_dequeue_buffer(struct ef
+       }
+       if (buffer->flags & EFX_TX_BUF_SKB) {
++              EFX_WARN_ON_PARANOID(!pkts_compl || !bytes_compl);
+               (*pkts_compl)++;
+               (*bytes_compl) += buffer->skb->len;
+               dev_consume_skb_any((struct sk_buff *)buffer->skb);
+@@ -426,12 +427,14 @@ static int efx_tx_map_data(struct efx_tx
+ static void efx_enqueue_unwind(struct efx_tx_queue *tx_queue)
+ {
+       struct efx_tx_buffer *buffer;
++      unsigned int bytes_compl = 0;
++      unsigned int pkts_compl = 0;
+       /* Work backwards until we hit the original insert pointer value */
+       while (tx_queue->insert_count != tx_queue->write_count) {
+               --tx_queue->insert_count;
+               buffer = __efx_tx_queue_get_insert_buffer(tx_queue);
+-              efx_dequeue_buffer(tx_queue, buffer, NULL, NULL);
++              efx_dequeue_buffer(tx_queue, buffer, &pkts_compl, &bytes_compl);
+       }
+ }
diff --git a/queue-4.14/skbuff-in-skb_copy_ubufs-unclone-before-releasing-zerocopy.patch b/queue-4.14/skbuff-in-skb_copy_ubufs-unclone-before-releasing-zerocopy.patch
new file mode 100644 (file)
index 0000000..5b42bee
--- /dev/null
@@ -0,0 +1,54 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Willem de Bruijn <willemb@google.com>
+Date: Thu, 28 Dec 2017 12:38:13 -0500
+Subject: skbuff: in skb_copy_ubufs unclone before releasing zerocopy
+
+From: Willem de Bruijn <willemb@google.com>
+
+
+skb_copy_ubufs must unclone before it is safe to modify its
+skb_shared_info with skb_zcopy_clear.
+
+Commit b90ddd568792 ("skbuff: skb_copy_ubufs must release uarg even
+without user frags") ensures that all skbs release their zerocopy
+state, even those without frags.
+
+But I forgot an edge case where such an skb arrives that is cloned.
+
+The stack does not build such packets. Vhost/tun skbs have their
+frags orphaned before cloning. TCP skbs only attach zerocopy state
+when a frag is added.
+
+But if TCP packets can be trimmed or linearized, this might occur.
+Tracing the code I found no instance so far (e.g., skb_linearize
+ends up calling skb_zcopy_clear if !skb->data_len).
+
+Still, it is non-obvious that no path exists. And it is fragile to
+rely on this.
+
+Fixes: b90ddd568792 ("skbuff: skb_copy_ubufs must release uarg even without user frags")
+Signed-off-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/skbuff.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -1181,12 +1181,12 @@ int skb_copy_ubufs(struct sk_buff *skb,
+       int i, new_frags;
+       u32 d_off;
+-      if (!num_frags)
+-              goto release;
+-
+       if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
+               return -EINVAL;
++      if (!num_frags)
++              goto release;
++
+       new_frags = (__skb_pagelen(skb) + PAGE_SIZE - 1) >> PAGE_SHIFT;
+       for (i = 0; i < new_frags; i++) {
+               page = alloc_page(gfp_mask);
diff --git a/queue-4.14/skbuff-orphan-frags-before-zerocopy-clone.patch b/queue-4.14/skbuff-orphan-frags-before-zerocopy-clone.patch
new file mode 100644 (file)
index 0000000..f2450d0
--- /dev/null
@@ -0,0 +1,54 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Willem de Bruijn <willemb@google.com>
+Date: Wed, 20 Dec 2017 17:37:49 -0500
+Subject: skbuff: orphan frags before zerocopy clone
+
+From: Willem de Bruijn <willemb@google.com>
+
+
+[ Upstream commit 268b790679422a89e9ab0685d9f291edae780c98 ]
+
+Call skb_zerocopy_clone after skb_orphan_frags, to avoid duplicate
+calls to skb_uarg(skb)->callback for the same data.
+
+skb_zerocopy_clone associates skb_shinfo(skb)->uarg from frag_skb
+with each segment. This is only safe for uargs that do refcounting,
+which is those that pass skb_orphan_frags without dropping their
+shared frags. For others, skb_orphan_frags drops the user frags and
+sets the uarg to NULL, after which skb_zerocopy_clone has no effect.
+
+Qemu hangs were reported due to duplicate vhost_net_zerocopy_callback
+calls for the same data causing the vhost_net_ubuf_ref->refcount to
+drop below zero.
+
+Link: http://lkml.kernel.org/r/<CAF=yD-LWyCD4Y0aJ9O0e_CHLR+3JOeKicRRTEVCPxgw4XOcqGQ@mail.gmail.com>
+Fixes: 1f8b977ab32d ("sock: enable MSG_ZEROCOPY")
+Reported-by: Andreas Hartmann <andihartmann@01019freenet.de>
+Reported-by: David Hill <dhill@redhat.com>
+Signed-off-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/skbuff.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -3657,8 +3657,6 @@ normal:
+               skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags &
+                                             SKBTX_SHARED_FRAG;
+-              if (skb_zerocopy_clone(nskb, head_skb, GFP_ATOMIC))
+-                      goto err;
+               while (pos < offset + len) {
+                       if (i >= nfrags) {
+@@ -3684,6 +3682,8 @@ normal:
+                       if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC)))
+                               goto err;
++                      if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC))
++                              goto err;
+                       *nskb_frag = *frag;
+                       __skb_frag_ref(nskb_frag);
diff --git a/queue-4.14/skbuff-skb_copy_ubufs-must-release-uarg-even-without-user-frags.patch b/queue-4.14/skbuff-skb_copy_ubufs-must-release-uarg-even-without-user-frags.patch
new file mode 100644 (file)
index 0000000..91db3fa
--- /dev/null
@@ -0,0 +1,44 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Willem de Bruijn <willemb@google.com>
+Date: Wed, 20 Dec 2017 17:37:50 -0500
+Subject: skbuff: skb_copy_ubufs must release uarg even without user frags
+
+From: Willem de Bruijn <willemb@google.com>
+
+
+[ Upstream commit b90ddd568792bcb0054eaf0f61785c8f80c3bd1c ]
+
+skb_copy_ubufs creates a private copy of frags[] to release its hold
+on user frags, then calls uarg->callback to notify the owner.
+
+Call uarg->callback even when no frags exist. This edge case can
+happen when zerocopy_sg_from_iter finds enough room in skb_headlen
+to copy all the data.
+
+Fixes: 3ece782693c4 ("sock: skb_copy_ubufs support for compound pages")
+Signed-off-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/skbuff.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -1182,7 +1182,7 @@ int skb_copy_ubufs(struct sk_buff *skb,
+       u32 d_off;
+       if (!num_frags)
+-              return 0;
++              goto release;
+       if (skb_shared(skb) || skb_unclone(skb, gfp_mask))
+               return -EINVAL;
+@@ -1242,6 +1242,7 @@ int skb_copy_ubufs(struct sk_buff *skb,
+       __skb_fill_page_desc(skb, new_frags - 1, head, 0, d_off);
+       skb_shinfo(skb)->nr_frags = new_frags;
++release:
+       skb_zcopy_clear(skb, false);
+       return 0;
+ }
diff --git a/queue-4.14/sock-free-skb-in-skb_complete_tx_timestamp-on-error.patch b/queue-4.14/sock-free-skb-in-skb_complete_tx_timestamp-on-error.patch
new file mode 100644 (file)
index 0000000..98e5773
--- /dev/null
@@ -0,0 +1,47 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Willem de Bruijn <willemb@google.com>
+Date: Wed, 13 Dec 2017 14:41:06 -0500
+Subject: sock: free skb in skb_complete_tx_timestamp on error
+
+From: Willem de Bruijn <willemb@google.com>
+
+
+[ Upstream commit 35b99dffc3f710cafceee6c8c6ac6a98eb2cb4bf ]
+
+skb_complete_tx_timestamp must ingest the skb it is passed. Call
+kfree_skb if the skb cannot be enqueued.
+
+Fixes: b245be1f4db1 ("net-timestamp: no-payload only sysctl")
+Fixes: 9ac25fc06375 ("net: fix socket refcounting in skb_complete_tx_timestamp()")
+Reported-by: Richard Cochran <richardcochran@gmail.com>
+Signed-off-by: Willem de Bruijn <willemb@google.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/skbuff.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -4296,7 +4296,7 @@ void skb_complete_tx_timestamp(struct sk
+       struct sock *sk = skb->sk;
+       if (!skb_may_tx_timestamp(sk, false))
+-              return;
++              goto err;
+       /* Take a reference to prevent skb_orphan() from freeing the socket,
+        * but only if the socket refcount is not zero.
+@@ -4305,7 +4305,11 @@ void skb_complete_tx_timestamp(struct sk
+               *skb_hwtstamps(skb) = *hwtstamps;
+               __skb_complete_tx_timestamp(skb, sk, SCM_TSTAMP_SND, false);
+               sock_put(sk);
++              return;
+       }
++
++err:
++      kfree_skb(skb);
+ }
+ EXPORT_SYMBOL_GPL(skb_complete_tx_timestamp);
diff --git a/queue-4.14/tcp-fix-potential-underestimation-on-rcv_rtt.patch b/queue-4.14/tcp-fix-potential-underestimation-on-rcv_rtt.patch
new file mode 100644 (file)
index 0000000..4a4e0a8
--- /dev/null
@@ -0,0 +1,59 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Wei Wang <weiwan@google.com>
+Date: Tue, 12 Dec 2017 16:28:58 -0800
+Subject: tcp: fix potential underestimation on rcv_rtt
+
+From: Wei Wang <weiwan@google.com>
+
+
+[ Upstream commit 9ee11bd03cb1a5c3ca33c2bb70e7ed325f68890f ]
+
+When ms timestamp is used, current logic uses 1us in
+tcp_rcv_rtt_update() when the real rcv_rtt is within 1 - 999us.
+This could cause rcv_rtt underestimation.
+Fix it by always using a min value of 1ms if ms timestamp is used.
+
+Fixes: 645f4c6f2ebd ("tcp: switch rcv_rtt_est and rcvq_space to high resolution timestamps")
+Signed-off-by: Wei Wang <weiwan@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_input.c |   10 ++++++----
+ 1 file changed, 6 insertions(+), 4 deletions(-)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -521,9 +521,6 @@ static void tcp_rcv_rtt_update(struct tc
+       u32 new_sample = tp->rcv_rtt_est.rtt_us;
+       long m = sample;
+-      if (m == 0)
+-              m = 1;
+-
+       if (new_sample != 0) {
+               /* If we sample in larger samples in the non-timestamp
+                * case, we could grossly overestimate the RTT especially
+@@ -560,6 +557,8 @@ static inline void tcp_rcv_rtt_measure(s
+       if (before(tp->rcv_nxt, tp->rcv_rtt_est.seq))
+               return;
+       delta_us = tcp_stamp_us_delta(tp->tcp_mstamp, tp->rcv_rtt_est.time);
++      if (!delta_us)
++              delta_us = 1;
+       tcp_rcv_rtt_update(tp, delta_us, 1);
+ new_measure:
+@@ -576,8 +575,11 @@ static inline void tcp_rcv_rtt_measure_t
+           (TCP_SKB_CB(skb)->end_seq -
+            TCP_SKB_CB(skb)->seq >= inet_csk(sk)->icsk_ack.rcv_mss)) {
+               u32 delta = tcp_time_stamp(tp) - tp->rx_opt.rcv_tsecr;
+-              u32 delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
++              u32 delta_us;
++              if (!delta)
++                      delta = 1;
++              delta_us = delta * (USEC_PER_SEC / TCP_TS_HZ);
+               tcp_rcv_rtt_update(tp, delta_us, 0);
+       }
+ }
diff --git a/queue-4.14/tcp-invalidate-rate-samples-during-sack-reneging.patch b/queue-4.14/tcp-invalidate-rate-samples-during-sack-reneging.patch
new file mode 100644 (file)
index 0000000..df41487
--- /dev/null
@@ -0,0 +1,153 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Yousuk Seung <ysseung@google.com>
+Date: Thu, 7 Dec 2017 13:41:34 -0800
+Subject: tcp: invalidate rate samples during SACK reneging
+
+From: Yousuk Seung <ysseung@google.com>
+
+
+[ Upstream commit d4761754b4fb2ef8d9a1e9d121c4bec84e1fe292 ]
+
+Mark tcp_sock during a SACK reneging event and invalidate rate samples
+while marked. Such rate samples may overestimate bw by including packets
+that were SACKed before reneging.
+
+< ack 6001 win 10000 sack 7001:38001
+< ack 7001 win 0 sack 8001:38001 // Reneg detected
+> seq 7001:8001 // RTO, SACK cleared.
+< ack 38001 win 10000
+
+In the above example, the rate sample taken after the last ack will count
+7001-38001 as delivered while the actual delivery rate likely could
+be much lower i.e. 7001-8001.
+
+This patch adds a new field tcp_sock.is_sack_reneg and marks it when we
+declare SACK reneging and enter TCP_CA_Loss, and unmarks it after
+the last rate sample was taken before moving back to TCP_CA_Open. This
+patch also invalidates rate samples taken while tcp_sock.is_sack_reneg
+is set.
+
+Fixes: b9f64820fb22 ("tcp: track data delivery rate for a TCP connection")
+Signed-off-by: Yousuk Seung <ysseung@google.com>
+Signed-off-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Acked-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Priyaranjan Jha <priyarjha@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/tcp.h  |    3 ++-
+ include/net/tcp.h    |    2 +-
+ net/ipv4/tcp.c       |    1 +
+ net/ipv4/tcp_input.c |   10 ++++++++--
+ net/ipv4/tcp_rate.c  |   10 +++++++---
+ 5 files changed, 19 insertions(+), 7 deletions(-)
+
+--- a/include/linux/tcp.h
++++ b/include/linux/tcp.h
+@@ -214,7 +214,8 @@ struct tcp_sock {
+       u8      chrono_type:2,  /* current chronograph type */
+               rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
+               fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
+-              unused:4;
++              is_sack_reneg:1,    /* in recovery from loss with SACK reneg? */
++              unused:3;
+       u8      nonagle     : 4,/* Disable Nagle algorithm?             */
+               thin_lto    : 1,/* Use linear timeouts for thin streams */
+               unused1     : 1,
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -1085,7 +1085,7 @@ void tcp_rate_skb_sent(struct sock *sk,
+ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb,
+                           struct rate_sample *rs);
+ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+-                struct rate_sample *rs);
++                bool is_sack_reneg, struct rate_sample *rs);
+ void tcp_rate_check_app_limited(struct sock *sk);
+ /* These functions determine how the current flow behaves in respect of SACK
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -2356,6 +2356,7 @@ int tcp_disconnect(struct sock *sk, int
+       tp->snd_cwnd_cnt = 0;
+       tp->window_clamp = 0;
+       tcp_set_ca_state(sk, TCP_CA_Open);
++      tp->is_sack_reneg = 0;
+       tcp_clear_retrans(tp);
+       inet_csk_delack_init(sk);
+       /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -1975,6 +1975,8 @@ void tcp_enter_loss(struct sock *sk)
+               NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSACKRENEGING);
+               tp->sacked_out = 0;
+               tp->fackets_out = 0;
++              /* Mark SACK reneging until we recover from this loss event. */
++              tp->is_sack_reneg = 1;
+       }
+       tcp_clear_all_retrans_hints(tp);
+@@ -2428,6 +2430,7 @@ static bool tcp_try_undo_recovery(struct
+               return true;
+       }
+       tcp_set_ca_state(sk, TCP_CA_Open);
++      tp->is_sack_reneg = 0;
+       return false;
+ }
+@@ -2459,8 +2462,10 @@ static bool tcp_try_undo_loss(struct soc
+                       NET_INC_STATS(sock_net(sk),
+                                       LINUX_MIB_TCPSPURIOUSRTOS);
+               inet_csk(sk)->icsk_retransmits = 0;
+-              if (frto_undo || tcp_is_sack(tp))
++              if (frto_undo || tcp_is_sack(tp)) {
+                       tcp_set_ca_state(sk, TCP_CA_Open);
++                      tp->is_sack_reneg = 0;
++              }
+               return true;
+       }
+       return false;
+@@ -3551,6 +3556,7 @@ static int tcp_ack(struct sock *sk, cons
+       struct tcp_sacktag_state sack_state;
+       struct rate_sample rs = { .prior_delivered = 0 };
+       u32 prior_snd_una = tp->snd_una;
++      bool is_sack_reneg = tp->is_sack_reneg;
+       u32 ack_seq = TCP_SKB_CB(skb)->seq;
+       u32 ack = TCP_SKB_CB(skb)->ack_seq;
+       bool is_dupack = false;
+@@ -3666,7 +3672,7 @@ static int tcp_ack(struct sock *sk, cons
+       delivered = tp->delivered - delivered;  /* freshly ACKed or SACKed */
+       lost = tp->lost - lost;                 /* freshly marked lost */
+-      tcp_rate_gen(sk, delivered, lost, sack_state.rate);
++      tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate);
+       tcp_cong_control(sk, ack, delivered, flag, sack_state.rate);
+       tcp_xmit_recovery(sk, rexmit);
+       return 1;
+--- a/net/ipv4/tcp_rate.c
++++ b/net/ipv4/tcp_rate.c
+@@ -106,7 +106,7 @@ void tcp_rate_skb_delivered(struct sock
+ /* Update the connection delivery information and generate a rate sample. */
+ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost,
+-                struct rate_sample *rs)
++                bool is_sack_reneg, struct rate_sample *rs)
+ {
+       struct tcp_sock *tp = tcp_sk(sk);
+       u32 snd_us, ack_us;
+@@ -124,8 +124,12 @@ void tcp_rate_gen(struct sock *sk, u32 d
+       rs->acked_sacked = delivered;   /* freshly ACKed or SACKed */
+       rs->losses = lost;              /* freshly marked lost */
+-      /* Return an invalid sample if no timing information is available. */
+-      if (!rs->prior_mstamp) {
++      /* Return an invalid sample if no timing information is available or
++       * in recovery from loss with SACK reneging. Rate samples taken during
++       * a SACK reneging event may overestimate bw by including packets that
++       * were SACKed before the reneg.
++       */
++      if (!rs->prior_mstamp || is_sack_reneg) {
+               rs->delivered = -1;
+               rs->interval_us = -1;
+               return;
diff --git a/queue-4.14/tcp-md5sig-use-skb-s-saddr-when-replying-to-an-incoming-segment.patch b/queue-4.14/tcp-md5sig-use-skb-s-saddr-when-replying-to-an-incoming-segment.patch
new file mode 100644 (file)
index 0000000..ce1b883
--- /dev/null
@@ -0,0 +1,55 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Christoph Paasch <cpaasch@apple.com>
+Date: Mon, 11 Dec 2017 00:05:46 -0800
+Subject: tcp md5sig: Use skb's saddr when replying to an incoming segment
+
+From: Christoph Paasch <cpaasch@apple.com>
+
+
+[ Upstream commit 30791ac41927ebd3e75486f9504b6d2280463bf0 ]
+
+The MD5-key that belongs to a connection is identified by the peer's
+IP-address. When we are in tcp_v4(6)_reqsk_send_ack(), we are replying
+to an incoming segment from tcp_check_req() that failed the seq-number
+checks.
+
+Thus, to find the correct key, we need to use the skb's saddr and not
+the daddr.
+
+This bug seems to have been there for quite a while, but probably went
+unnoticed because the consequences are not catastrophic. We will call
+tcp_v4_reqsk_send_ack only to send a challenge-ACK back to the peer,
+thus the connection doesn't really fail.
+
+Fixes: 9501f9722922 ("tcp md5sig: Let the caller pass appropriate key for tcp_v{4,6}_do_calc_md5_hash().")
+Signed-off-by: Christoph Paasch <cpaasch@apple.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_ipv4.c |    2 +-
+ net/ipv6/tcp_ipv6.c |    2 +-
+ 2 files changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -844,7 +844,7 @@ static void tcp_v4_reqsk_send_ack(const
+                       tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
+                       req->ts_recent,
+                       0,
+-                      tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->daddr,
++                      tcp_md5_do_lookup(sk, (union tcp_md5_addr *)&ip_hdr(skb)->saddr,
+                                         AF_INET),
+                       inet_rsk(req)->no_srccheck ? IP_REPLY_ARG_NOSRCCHECK : 0,
+                       ip_hdr(skb)->tos);
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -988,7 +988,7 @@ static void tcp_v6_reqsk_send_ack(const
+                       req->rsk_rcv_wnd >> inet_rsk(req)->rcv_wscale,
+                       tcp_time_stamp_raw() + tcp_rsk(req)->ts_off,
+                       req->ts_recent, sk->sk_bound_dev_if,
+-                      tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr),
++                      tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr),
+                       0, 0);
+ }
diff --git a/queue-4.14/tcp-refresh-tcp_mstamp-from-timers-callbacks.patch b/queue-4.14/tcp-refresh-tcp_mstamp-from-timers-callbacks.patch
new file mode 100644 (file)
index 0000000..4fec855
--- /dev/null
@@ -0,0 +1,49 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Eric Dumazet <edumazet@google.com>
+Date: Tue, 12 Dec 2017 18:22:52 -0800
+Subject: tcp: refresh tcp_mstamp from timers callbacks
+
+From: Eric Dumazet <edumazet@google.com>
+
+
+[ Upstream commit 4688eb7cf3ae2c2721d1dacff5c1384cba47d176 ]
+
+Only the retransmit timer currently refreshes tcp_mstamp.
+
+We should do the same for delayed acks and keepalives.
+
+Even if RFC 7323 does not request it, this is consistent with what Linux
+did in the past, when TS values were based on jiffies.
+
+Fixes: 385e20706fac ("tcp: use tp->tcp_mstamp in output path")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Soheil Hassas Yeganeh <soheil@google.com>
+Cc: Mike Maloney <maloney@google.com>
+Cc: Neal Cardwell <ncardwell@google.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Acked-by:  Mike Maloney <maloney@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_timer.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/net/ipv4/tcp_timer.c
++++ b/net/ipv4/tcp_timer.c
+@@ -264,6 +264,7 @@ void tcp_delack_timer_handler(struct soc
+                       icsk->icsk_ack.pingpong = 0;
+                       icsk->icsk_ack.ato      = TCP_ATO_MIN;
+               }
++              tcp_mstamp_refresh(tcp_sk(sk));
+               tcp_send_ack(sk);
+               __NET_INC_STATS(sock_net(sk), LINUX_MIB_DELAYEDACKS);
+       }
+@@ -627,6 +628,7 @@ static void tcp_keepalive_timer (unsigne
+               goto out;
+       }
++      tcp_mstamp_refresh(tp);
+       if (sk->sk_state == TCP_FIN_WAIT2 && sock_flag(sk, SOCK_DEAD)) {
+               if (tp->linger2 >= 0) {
+                       const int tmo = tcp_fin_time(sk) - TCP_TIMEWAIT_LEN;
diff --git a/queue-4.14/tcp_bbr-record-full-bw-reached-decision-in-new-full_bw_reached-bit.patch b/queue-4.14/tcp_bbr-record-full-bw-reached-decision-in-new-full_bw_reached-bit.patch
new file mode 100644 (file)
index 0000000..a481eed
--- /dev/null
@@ -0,0 +1,70 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Neal Cardwell <ncardwell@google.com>
+Date: Thu, 7 Dec 2017 12:43:30 -0500
+Subject: tcp_bbr: record "full bw reached" decision in new full_bw_reached bit
+
+From: Neal Cardwell <ncardwell@google.com>
+
+
+[ Upstream commit c589e69b508d29ed8e644dfecda453f71c02ec27 ]
+
+This commit records the "full bw reached" decision in a new
+full_bw_reached bit. This is a pure refactor that does not change the
+current behavior, but enables subsequent fixes and improvements.
+
+In particular, this enables simple and clean fixes because the full_bw
+and full_bw_cnt can be unconditionally zeroed without worrying about
+forgetting that we estimated we filled the pipe in Startup. And it
+enables future improvements because multiple code paths can be used
+for estimating that we filled the pipe in Startup; any new code paths
+only need to set this bit when they think the pipe is full.
+
+Note that this fix intentionally reduces the width of the full_bw_cnt
+counter, since we have never used the most significant bit.
+
+Signed-off-by: Neal Cardwell <ncardwell@google.com>
+Reviewed-by: Yuchung Cheng <ycheng@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_bbr.c |    7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/tcp_bbr.c
++++ b/net/ipv4/tcp_bbr.c
+@@ -110,7 +110,8 @@ struct bbr {
+       u32     lt_last_lost;        /* LT intvl start: tp->lost */
+       u32     pacing_gain:10, /* current gain for setting pacing rate */
+               cwnd_gain:10,   /* current gain for setting cwnd */
+-              full_bw_cnt:3,  /* number of rounds without large bw gains */
++              full_bw_reached:1,   /* reached full bw in Startup? */
++              full_bw_cnt:2,  /* number of rounds without large bw gains */
+               cycle_idx:3,    /* current index in pacing_gain cycle array */
+               has_seen_rtt:1, /* have we seen an RTT sample yet? */
+               unused_b:5;
+@@ -180,7 +181,7 @@ static bool bbr_full_bw_reached(const st
+ {
+       const struct bbr *bbr = inet_csk_ca(sk);
+-      return bbr->full_bw_cnt >= bbr_full_bw_cnt;
++      return bbr->full_bw_reached;
+ }
+ /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */
+@@ -717,6 +718,7 @@ static void bbr_check_full_bw_reached(st
+               return;
+       }
+       ++bbr->full_bw_cnt;
++      bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt;
+ }
+ /* If pipe is probably full, drain the queue and then enter steady-state. */
+@@ -850,6 +852,7 @@ static void bbr_init(struct sock *sk)
+       bbr->restore_cwnd = 0;
+       bbr->round_start = 0;
+       bbr->idle_restart = 0;
++      bbr->full_bw_reached = 0;
+       bbr->full_bw = 0;
+       bbr->full_bw_cnt = 0;
+       bbr->cycle_mstamp = 0;
diff --git a/queue-4.14/tcp_bbr-reset-full-pipe-detection-on-loss-recovery-undo.patch b/queue-4.14/tcp_bbr-reset-full-pipe-detection-on-loss-recovery-undo.patch
new file mode 100644 (file)
index 0000000..f5a4ba6
--- /dev/null
@@ -0,0 +1,44 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Neal Cardwell <ncardwell@google.com>
+Date: Thu, 7 Dec 2017 12:43:31 -0500
+Subject: tcp_bbr: reset full pipe detection on loss recovery undo
+
+From: Neal Cardwell <ncardwell@google.com>
+
+
+[ Upstream commit 2f6c498e4f15d27852c04ed46d804a39137ba364 ]
+
+Fix BBR so that upon notification of a loss recovery undo BBR resets
+the full pipe detection (STARTUP exit) state machine.
+
+Under high reordering, reordering events can be interpreted as loss.
+If the reordering and spurious loss estimates are high enough, this
+could previously cause BBR to spuriously estimate that the pipe is
+full.
+
+Since spurious loss recovery means that our overall sending will have
+slowed down spuriously, this commit gives a flow more time to probe
+robustly for bandwidth and decide the pipe is really full.
+
+Signed-off-by: Neal Cardwell <ncardwell@google.com>
+Reviewed-by: Yuchung Cheng <ycheng@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_bbr.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/net/ipv4/tcp_bbr.c
++++ b/net/ipv4/tcp_bbr.c
+@@ -874,6 +874,10 @@ static u32 bbr_sndbuf_expand(struct sock
+  */
+ static u32 bbr_undo_cwnd(struct sock *sk)
+ {
++      struct bbr *bbr = inet_csk_ca(sk);
++
++      bbr->full_bw = 0;   /* spurious slow-down; reset full pipe detection */
++      bbr->full_bw_cnt = 0;
+       return tcp_sk(sk)->snd_cwnd;
+ }
diff --git a/queue-4.14/tcp_bbr-reset-long-term-bandwidth-sampling-on-loss-recovery-undo.patch b/queue-4.14/tcp_bbr-reset-long-term-bandwidth-sampling-on-loss-recovery-undo.patch
new file mode 100644 (file)
index 0000000..fcfc98b
--- /dev/null
@@ -0,0 +1,39 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Neal Cardwell <ncardwell@google.com>
+Date: Thu, 7 Dec 2017 12:43:32 -0500
+Subject: tcp_bbr: reset long-term bandwidth sampling on loss recovery undo
+
+From: Neal Cardwell <ncardwell@google.com>
+
+
+[ Upstream commit 600647d467c6d04b3954b41a6ee1795b5ae00550 ]
+
+Fix BBR so that upon notification of a loss recovery undo BBR resets
+long-term bandwidth sampling.
+
+Under high reordering, reordering events can be interpreted as loss.
+If the reordering and spurious loss estimates are high enough, this
+can cause BBR to spuriously estimate that we are seeing loss rates
+high enough to trigger long-term bandwidth estimation. To avoid that
+problem, this commit resets long-term bandwidth sampling on loss
+recovery undo events.
+
+Signed-off-by: Neal Cardwell <ncardwell@google.com>
+Reviewed-by: Yuchung Cheng <ycheng@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_bbr.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/net/ipv4/tcp_bbr.c
++++ b/net/ipv4/tcp_bbr.c
+@@ -878,6 +878,7 @@ static u32 bbr_undo_cwnd(struct sock *sk
+       bbr->full_bw = 0;   /* spurious slow-down; reset full pipe detection */
+       bbr->full_bw_cnt = 0;
++      bbr_reset_lt_bw_sampling(sk);
+       return tcp_sk(sk)->snd_cwnd;
+ }
diff --git a/queue-4.14/tg3-fix-rx-hang-on-mtu-change-with-5717-5719.patch b/queue-4.14/tg3-fix-rx-hang-on-mtu-change-with-5717-5719.patch
new file mode 100644 (file)
index 0000000..3dee70b
--- /dev/null
@@ -0,0 +1,37 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Brian King <brking@linux.vnet.ibm.com>
+Date: Fri, 15 Dec 2017 15:21:50 -0600
+Subject: tg3: Fix rx hang on MTU change with 5717/5719
+
+From: Brian King <brking@linux.vnet.ibm.com>
+
+
+[ Upstream commit 748a240c589824e9121befb1cba5341c319885bc ]
+
+This fixes a hang issue seen when changing the MTU size from 1500 MTU
+to 9000 MTU on both 5717 and 5719 chips. In discussion with Broadcom,
+they've indicated that these chipsets have the same phy as the 57766
+chipset, so the same workarounds apply. This has been tested by IBM
+on both Power 8 and Power 9 systems as well as by Broadcom on x86
+hardware and has been confirmed to resolve the hang issue.
+
+Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/broadcom/tg3.c |    4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/broadcom/tg3.c
++++ b/drivers/net/ethernet/broadcom/tg3.c
+@@ -14227,7 +14227,9 @@ static int tg3_change_mtu(struct net_dev
+       /* Reset PHY, otherwise the read DMA engine will be in a mode that
+        * breaks all requests to 256 bytes.
+        */
+-      if (tg3_asic_rev(tp) == ASIC_REV_57766)
++      if (tg3_asic_rev(tp) == ASIC_REV_57766 ||
++          tg3_asic_rev(tp) == ASIC_REV_5717 ||
++          tg3_asic_rev(tp) == ASIC_REV_5719)
+               reset_phy = true;
+       err = tg3_restart_hw(tp, reset_phy);
diff --git a/queue-4.14/tipc-fix-hanging-poll-for-stream-sockets.patch b/queue-4.14/tipc-fix-hanging-poll-for-stream-sockets.patch
new file mode 100644 (file)
index 0000000..c6d3d60
--- /dev/null
@@ -0,0 +1,45 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@gmail.com>
+Date: Thu, 28 Dec 2017 12:03:06 +0100
+Subject: tipc: fix hanging poll() for stream sockets
+
+From: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@gmail.com>
+
+
+[ Upstream commit 517d7c79bdb39864e617960504bdc1aa560c75c6 ]
+
+In commit 42b531de17d2f6 ("tipc: Fix missing connection request
+handling"), we replaced unconditional wakeup() with conditional
+wakeup for clients with flags POLLIN | POLLRDNORM | POLLRDBAND.
+
+This breaks the applications which do a connect followed by poll
+with POLLOUT flag. These applications are not woken when the
+connection is ESTABLISHED and hence sleep forever.
+
+In this commit, we fix it by including the POLLOUT event for
+sockets in TIPC_CONNECTING state.
+
+Fixes: 42b531de17d2f6 ("tipc: Fix missing connection request handling")
+Acked-by: Jon Maloy <jon.maloy@ericsson.com>
+Signed-off-by: Parthasarathy Bhuvaragan <parthasarathy.bhuvaragan@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/tipc/socket.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/net/tipc/socket.c
++++ b/net/tipc/socket.c
+@@ -709,11 +709,11 @@ static unsigned int tipc_poll(struct fil
+       switch (sk->sk_state) {
+       case TIPC_ESTABLISHED:
++      case TIPC_CONNECTING:
+               if (!tsk->cong_link_cnt && !tsk_conn_cong(tsk))
+                       mask |= POLLOUT;
+               /* fall thru' */
+       case TIPC_LISTEN:
+-      case TIPC_CONNECTING:
+               if (!skb_queue_empty(&sk->sk_receive_queue))
+                       mask |= (POLLIN | POLLRDNORM);
+               break;
diff --git a/queue-4.14/vxlan-restore-dev-mtu-setting-based-on-lower-device.patch b/queue-4.14/vxlan-restore-dev-mtu-setting-based-on-lower-device.patch
new file mode 100644 (file)
index 0000000..0fc8111
--- /dev/null
@@ -0,0 +1,58 @@
+From foo@baz Sun Dec 31 11:12:48 CET 2017
+From: Alexey Kodanev <alexey.kodanev@oracle.com>
+Date: Thu, 14 Dec 2017 20:20:00 +0300
+Subject: vxlan: restore dev->mtu setting based on lower device
+
+From: Alexey Kodanev <alexey.kodanev@oracle.com>
+
+
+[ Upstream commit f870c1ff65a6d1f3a083f277280802ee09a5b44d ]
+
+Stefano Brivio says:
+    Commit a985343ba906 ("vxlan: refactor verification and
+    application of configuration") introduced a change in the
+    behaviour of initial MTU setting: earlier, the MTU for a link
+    created on top of a given lower device, without an initial MTU
+    specification, was set to the MTU of the lower device minus
+    headroom as a result of this path in vxlan_dev_configure():
+
+       if (!conf->mtu)
+               dev->mtu = lowerdev->mtu -
+                          (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
+
+    which is now gone. Now, the initial MTU, in absence of a
+    configured value, is simply set by ether_setup() to ETH_DATA_LEN
+    (1500 bytes).
+
+    This breaks userspace expectations in case the MTU of
+    the lower device is higher than 1500 bytes minus headroom.
+
+This patch restores the previous behaviour on newlink operation. Since
+max_mtu can be negative and we update dev->mtu directly, also check it
+for valid minimum.
+
+Reported-by: Junhan Yan <juyan@redhat.com>
+Fixes: a985343ba906 ("vxlan: refactor verification and application of configuration")
+Signed-off-by: Alexey Kodanev <alexey.kodanev@oracle.com>
+Acked-by: Stefano Brivio <sbrivio@redhat.com>
+Signed-off-by: Stefano Brivio <sbrivio@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/vxlan.c |    5 +++++
+ 1 file changed, 5 insertions(+)
+
+--- a/drivers/net/vxlan.c
++++ b/drivers/net/vxlan.c
+@@ -3105,6 +3105,11 @@ static void vxlan_config_apply(struct ne
+               max_mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM :
+                                          VXLAN_HEADROOM);
++              if (max_mtu < ETH_MIN_MTU)
++                      max_mtu = ETH_MIN_MTU;
++
++              if (!changelink && !conf->mtu)
++                      dev->mtu = max_mtu;
+       }
+       if (dev->mtu > max_mtu)