]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
4.17-stable patches
authorGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 27 Jul 2018 06:33:10 +0000 (08:33 +0200)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 27 Jul 2018 06:33:10 +0000 (08:33 +0200)
added patches:
bonding-set-default-miimon-value-for-non-arp-modes-if-not-set.patch
clk-meson-gxbb-set-fclk_div2-as-clk_is_critical.patch
ip-hash-fragments-consistently.patch
ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch
multicast-do-not-restore-deleted-record-source-filter-mode-to-new-one.patch
net-dsa-mv88e6xxx-fix-races-between-lock-and-irq-freeing.patch
net-ipv6-fix-linklocal-to-global-address-with-vrf.patch
net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch
net-mlx5-adjust-clock-overflow-work-period.patch
net-mlx5-e-switch-ubsan-fix-undefined-behavior-in-mlx5_eswitch_mode.patch
net-mlx5e-add-ingress-egress-indication-for-offloaded-tc-flows.patch
net-mlx5e-don-t-allow-arfs-for-encapsulated-packets.patch
net-mlx5e-fix-quota-counting-in-arfs-expire-flow.patch
net-mlx5e-only-allow-offloading-decap-egress-egdev-flows.patch
net-mlx5e-refine-ets-validation-function.patch
net-next-hinic-fix-a-problem-in-hinic_xmit_frame.patch
net-phy-consider-phy_ignore_interrupt-in-phy_start_aneg_priv.patch
net-skb_segment-should-not-return-null.patch
nfp-flower-ensure-dead-neighbour-entries-are-not-offloaded.patch
r8169-restore-previous-behavior-to-accept-bios-wol-settings.patch
rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch
sock-fix-sg-page-frag-coalescing-in-sk_alloc_sg.patch
tcp-add-tcp_ooo_try_coalesce-helper.patch
tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch
tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch
tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch
tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch
tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch
tcp-fix-dctcp-delayed-ack-schedule.patch
tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch
tcp-helpers-to-send-special-dctcp-ack.patch
tls-check-rcv_shutdown-in-tls_wait_data.patch
vxlan-add-new-fdb-alloc-and-create-helpers.patch
vxlan-fix-default-fdb-entry-netlink-notify-ordering-during-netdev-create.patch
vxlan-make-netlink-notify-in-vxlan_fdb_destroy-optional.patch

36 files changed:
queue-4.17/bonding-set-default-miimon-value-for-non-arp-modes-if-not-set.patch [new file with mode: 0644]
queue-4.17/clk-meson-gxbb-set-fclk_div2-as-clk_is_critical.patch [new file with mode: 0644]
queue-4.17/ip-hash-fragments-consistently.patch [new file with mode: 0644]
queue-4.17/ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch [new file with mode: 0644]
queue-4.17/multicast-do-not-restore-deleted-record-source-filter-mode-to-new-one.patch [new file with mode: 0644]
queue-4.17/net-dsa-mv88e6xxx-fix-races-between-lock-and-irq-freeing.patch [new file with mode: 0644]
queue-4.17/net-ipv6-fix-linklocal-to-global-address-with-vrf.patch [new file with mode: 0644]
queue-4.17/net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch [new file with mode: 0644]
queue-4.17/net-mlx5-adjust-clock-overflow-work-period.patch [new file with mode: 0644]
queue-4.17/net-mlx5-e-switch-ubsan-fix-undefined-behavior-in-mlx5_eswitch_mode.patch [new file with mode: 0644]
queue-4.17/net-mlx5e-add-ingress-egress-indication-for-offloaded-tc-flows.patch [new file with mode: 0644]
queue-4.17/net-mlx5e-don-t-allow-arfs-for-encapsulated-packets.patch [new file with mode: 0644]
queue-4.17/net-mlx5e-fix-quota-counting-in-arfs-expire-flow.patch [new file with mode: 0644]
queue-4.17/net-mlx5e-only-allow-offloading-decap-egress-egdev-flows.patch [new file with mode: 0644]
queue-4.17/net-mlx5e-refine-ets-validation-function.patch [new file with mode: 0644]
queue-4.17/net-next-hinic-fix-a-problem-in-hinic_xmit_frame.patch [new file with mode: 0644]
queue-4.17/net-phy-consider-phy_ignore_interrupt-in-phy_start_aneg_priv.patch [new file with mode: 0644]
queue-4.17/net-skb_segment-should-not-return-null.patch [new file with mode: 0644]
queue-4.17/nfp-flower-ensure-dead-neighbour-entries-are-not-offloaded.patch [new file with mode: 0644]
queue-4.17/r8169-restore-previous-behavior-to-accept-bios-wol-settings.patch [new file with mode: 0644]
queue-4.17/rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch [new file with mode: 0644]
queue-4.17/series
queue-4.17/sock-fix-sg-page-frag-coalescing-in-sk_alloc_sg.patch [new file with mode: 0644]
queue-4.17/tcp-add-tcp_ooo_try_coalesce-helper.patch [new file with mode: 0644]
queue-4.17/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch [new file with mode: 0644]
queue-4.17/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch [new file with mode: 0644]
queue-4.17/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch [new file with mode: 0644]
queue-4.17/tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch [new file with mode: 0644]
queue-4.17/tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch [new file with mode: 0644]
queue-4.17/tcp-fix-dctcp-delayed-ack-schedule.patch [new file with mode: 0644]
queue-4.17/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch [new file with mode: 0644]
queue-4.17/tcp-helpers-to-send-special-dctcp-ack.patch [new file with mode: 0644]
queue-4.17/tls-check-rcv_shutdown-in-tls_wait_data.patch [new file with mode: 0644]
queue-4.17/vxlan-add-new-fdb-alloc-and-create-helpers.patch [new file with mode: 0644]
queue-4.17/vxlan-fix-default-fdb-entry-netlink-notify-ordering-during-netdev-create.patch [new file with mode: 0644]
queue-4.17/vxlan-make-netlink-notify-in-vxlan_fdb_destroy-optional.patch [new file with mode: 0644]

diff --git a/queue-4.17/bonding-set-default-miimon-value-for-non-arp-modes-if-not-set.patch b/queue-4.17/bonding-set-default-miimon-value-for-non-arp-modes-if-not-set.patch
new file mode 100644 (file)
index 0000000..d8e0bfb
--- /dev/null
@@ -0,0 +1,103 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Jarod Wilson <jarod@redhat.com>
+Date: Wed, 18 Jul 2018 14:49:36 -0400
+Subject: bonding: set default miimon value for non-arp modes if not set
+
+From: Jarod Wilson <jarod@redhat.com>
+
+[ Upstream commit c1f897ce186a529a494441642125479d38727a3d ]
+
+For some time now, if you load the bonding driver and configure bond
+parameters via sysfs using minimal config options, such as specifying
+nothing but the mode, relying on defaults for everything else, modes
+that cannot use arp monitoring (802.3ad, balance-tlb, balance-alb) all
+wind up with both arp_interval=0 (as it should be) and miimon=0, which
+means the miimon monitor thread never actually runs. This is particularly
+problematic for 802.3ad.
+
+For example, from an LNST recipe I've set up:
+
+$ modprobe bonding max_bonds=0"
+$ echo "+t_bond0" > /sys/class/net/bonding_masters"
+$ ip link set t_bond0 down"
+$ echo "802.3ad" > /sys/class/net/t_bond0/bonding/mode"
+$ ip link set ens1f1 down"
+$ echo "+ens1f1" > /sys/class/net/t_bond0/bonding/slaves"
+$ ip link set ens1f0 down"
+$ echo "+ens1f0" > /sys/class/net/t_bond0/bonding/slaves"
+$ ethtool -i t_bond0"
+$ ip link set ens1f1 up"
+$ ip link set ens1f0 up"
+$ ip link set t_bond0 up"
+$ ip addr add 192.168.9.1/24 dev t_bond0"
+$ ip addr add 2002::1/64 dev t_bond0"
+
+This bond comes up okay, but things look slightly suspect in
+/proc/net/bonding/t_bond0 output:
+
+$ grep -i mii /proc/net/bonding/t_bond0
+MII Status: up
+MII Polling Interval (ms): 0
+MII Status: up
+MII Status: up
+
+Now, pull a cable on one of the ports in the bond, then reconnect it, and
+you'll see:
+
+Slave Interface: ens1f0
+MII Status: down
+Speed: 1000 Mbps
+Duplex: full
+
+I believe this became a major issue as of commit 4d2c0cda0744, which for
+802.3ad bonds, sets slave->link = BOND_LINK_DOWN, with a comment about
+relying on link monitoring via miimon to set it correctly, but since the
+miimon work queue never runs, the link just stays marked down.
+
+If we simply tweak bond_option_mode_set() slightly, we can check for the
+non-arp modes having no miimon value set, and insert BOND_DEFAULT_MIIMON,
+which gets things back in full working order. This problem exists as far
+back as 4.14, and might be worth fixing in all stable trees since, though
+the work-around is to simply specify an miimon value yourself.
+
+Reported-by: Bob Ball <ball@umich.edu>
+Signed-off-by: Jarod Wilson <jarod@redhat.com>
+Acked-by: Mahesh Bandewar <maheshb@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/bonding/bond_options.c |   23 ++++++++++++++---------
+ 1 file changed, 14 insertions(+), 9 deletions(-)
+
+--- a/drivers/net/bonding/bond_options.c
++++ b/drivers/net/bonding/bond_options.c
+@@ -743,15 +743,20 @@ const struct bond_option *bond_opt_get(u
+ static int bond_option_mode_set(struct bonding *bond,
+                               const struct bond_opt_value *newval)
+ {
+-      if (!bond_mode_uses_arp(newval->value) && bond->params.arp_interval) {
+-              netdev_dbg(bond->dev, "%s mode is incompatible with arp monitoring, start mii monitoring\n",
+-                         newval->string);
+-              /* disable arp monitoring */
+-              bond->params.arp_interval = 0;
+-              /* set miimon to default value */
+-              bond->params.miimon = BOND_DEFAULT_MIIMON;
+-              netdev_dbg(bond->dev, "Setting MII monitoring interval to %d\n",
+-                         bond->params.miimon);
++      if (!bond_mode_uses_arp(newval->value)) {
++              if (bond->params.arp_interval) {
++                      netdev_dbg(bond->dev, "%s mode is incompatible with arp monitoring, start mii monitoring\n",
++                                 newval->string);
++                      /* disable arp monitoring */
++                      bond->params.arp_interval = 0;
++              }
++
++              if (!bond->params.miimon) {
++                      /* set miimon to default value */
++                      bond->params.miimon = BOND_DEFAULT_MIIMON;
++                      netdev_dbg(bond->dev, "Setting MII monitoring interval to %d\n",
++                                 bond->params.miimon);
++              }
+       }
+       if (newval->value == BOND_MODE_ALB)
diff --git a/queue-4.17/clk-meson-gxbb-set-fclk_div2-as-clk_is_critical.patch b/queue-4.17/clk-meson-gxbb-set-fclk_div2-as-clk_is_critical.patch
new file mode 100644 (file)
index 0000000..dc0959e
--- /dev/null
@@ -0,0 +1,57 @@
+From c987ac6f1f088663b6dad39281071aeb31d450a8 Mon Sep 17 00:00:00 2001
+From: Neil Armstrong <narmstrong@baylibre.com>
+Date: Wed, 13 Jun 2018 14:20:21 +0200
+Subject: clk: meson-gxbb: set fclk_div2 as CLK_IS_CRITICAL
+
+From: Neil Armstrong <narmstrong@baylibre.com>
+
+commit c987ac6f1f088663b6dad39281071aeb31d450a8 upstream.
+
+On Amlogic Meson GXBB & GXL platforms, the SCPI Cortex-M4 Co-Processor
+seems to be dependent on the FCLK_DIV2 to be operationnal.
+
+The issue occurred since v4.17-rc1 by freezing the kernel boot when
+the 'schedutil' cpufreq governor was selected as default :
+
+  [   12.071837] scpi_protocol scpi: SCP Protocol 0.0 Firmware 0.0.0 version
+  domain-0 init dvfs: 4
+  [   12.087757] hctosys: unable to open rtc device (rtc0)
+  [   12.087907] cfg80211: Loading compiled-in X.509 certificates for regulatory database
+  [   12.102241] cfg80211: Loaded X.509 cert 'sforshee: 00b28ddf47aef9cea7'
+
+But when disabling the MMC driver, the boot finished but cpufreq failed to
+change the CPU frequency :
+
+  [   12.153045] cpufreq: __target_index: Failed to change cpu frequency: -5
+
+A bisect between v4.16 and v4.16-rc1 gave
+05f814402d61 ("clk: meson: add fdiv clock gates") to be the first bad commit.
+This commit added support for the missing clock gates before the fixed PLL
+fixed dividers (FCLK_DIVx) and the clock framework basically disabled
+all the unused fixed dividers, thus disabled a critical clock path for
+the SCPI Co-Processor.
+
+This patch simply sets the FCLK_DIV2 gate as critical to ensure
+nobody can disable it.
+
+Fixes: 05f814402d61 ("clk: meson: add fdiv clock gates")
+Signed-off-by: Neil Armstrong <narmstrong@baylibre.com>
+Tested-by: Kevin Hilman <khilman@baylibre.com>
+[few corrections in the commit description]
+Signed-off-by: Jerome Brunet <jbrunet@baylibre.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/clk/meson/gxbb.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/clk/meson/gxbb.c
++++ b/drivers/clk/meson/gxbb.c
+@@ -511,6 +511,7 @@ static struct clk_regmap gxbb_fclk_div2
+               .ops = &clk_regmap_gate_ops,
+               .parent_names = (const char *[]){ "fclk_div2_div" },
+               .num_parents = 1,
++              .flags = CLK_IS_CRITICAL,
+       },
+ };
diff --git a/queue-4.17/ip-hash-fragments-consistently.patch b/queue-4.17/ip-hash-fragments-consistently.patch
new file mode 100644 (file)
index 0000000..d363c46
--- /dev/null
@@ -0,0 +1,73 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Mon, 23 Jul 2018 16:50:48 +0200
+Subject: ip: hash fragments consistently
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+[ Upstream commit 3dd1c9a1270736029ffca670e9bd0265f4120600 ]
+
+The skb hash for locally generated ip[v6] fragments belonging
+to the same datagram can vary in several circumstances:
+* for connected UDP[v6] sockets, the first fragment get its hash
+  via set_owner_w()/skb_set_hash_from_sk()
+* for unconnected IPv6 UDPv6 sockets, the first fragment can get
+  its hash via ip6_make_flowlabel()/skb_get_hash_flowi6(), if
+  auto_flowlabel is enabled
+
+For the following frags the hash is usually computed via
+skb_get_hash().
+The above can cause OoO for unconnected IPv6 UDPv6 socket: in that
+scenario the egress tx queue can be selected on a per packet basis
+via the skb hash.
+It may also fool flow-oriented schedulers to place fragments belonging
+to the same datagram in different flows.
+
+Fix the issue by copying the skb hash from the head frag into
+the others at fragmentation time.
+
+Before this commit:
+perf probe -a "dev_queue_xmit skb skb->hash skb->l4_hash:b1@0/8 skb->sw_hash:b1@1/8"
+netperf -H $IPV4 -t UDP_STREAM -l 5 -- -m 2000 -n &
+perf record -e probe:dev_queue_xmit -e probe:skb_set_owner_w -a sleep 0.1
+perf script
+probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=3713014309 l4_hash=1 sw_hash=0
+probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=0 l4_hash=0 sw_hash=0
+
+After this commit:
+probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=2171763177 l4_hash=1 sw_hash=0
+probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=2171763177 l4_hash=1 sw_hash=0
+
+Fixes: b73c3d0e4f0e ("net: Save TX flow hash in sock and set in skbuf on xmit")
+Fixes: 67800f9b1f4e ("ipv6: Call skb_get_hash_flowi6 to get skb->hash in ip6_make_flowlabel")
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/ip_output.c  |    2 ++
+ net/ipv6/ip6_output.c |    2 ++
+ 2 files changed, 4 insertions(+)
+
+--- a/net/ipv4/ip_output.c
++++ b/net/ipv4/ip_output.c
+@@ -523,6 +523,8 @@ static void ip_copy_metadata(struct sk_b
+       to->dev = from->dev;
+       to->mark = from->mark;
++      skb_copy_hash(to, from);
++
+       /* Copy the flags to each fragment. */
+       IPCB(to)->flags = IPCB(from)->flags;
+--- a/net/ipv6/ip6_output.c
++++ b/net/ipv6/ip6_output.c
+@@ -596,6 +596,8 @@ static void ip6_copy_metadata(struct sk_
+       to->dev = from->dev;
+       to->mark = from->mark;
++      skb_copy_hash(to, from);
++
+ #ifdef CONFIG_NET_SCHED
+       to->tc_index = from->tc_index;
+ #endif
diff --git a/queue-4.17/ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch b/queue-4.17/ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch
new file mode 100644 (file)
index 0000000..37d85ca
--- /dev/null
@@ -0,0 +1,93 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Willem de Bruijn <willemb@google.com>
+Date: Mon, 23 Jul 2018 19:36:48 -0400
+Subject: ip: in cmsg IP(V6)_ORIGDSTADDR call pskb_may_pull
+
+From: Willem de Bruijn <willemb@google.com>
+
+[ Upstream commit 2efd4fca703a6707cad16ab486eaab8fc7f0fd49 ]
+
+Syzbot reported a read beyond the end of the skb head when returning
+IPV6_ORIGDSTADDR:
+
+  BUG: KMSAN: kernel-infoleak in put_cmsg+0x5ef/0x860 net/core/scm.c:242
+  CPU: 0 PID: 4501 Comm: syz-executor128 Not tainted 4.17.0+ #9
+  Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
+  Google 01/01/2011
+  Call Trace:
+    __dump_stack lib/dump_stack.c:77 [inline]
+    dump_stack+0x185/0x1d0 lib/dump_stack.c:113
+    kmsan_report+0x188/0x2a0 mm/kmsan/kmsan.c:1125
+    kmsan_internal_check_memory+0x138/0x1f0 mm/kmsan/kmsan.c:1219
+    kmsan_copy_to_user+0x7a/0x160 mm/kmsan/kmsan.c:1261
+    copy_to_user include/linux/uaccess.h:184 [inline]
+    put_cmsg+0x5ef/0x860 net/core/scm.c:242
+    ip6_datagram_recv_specific_ctl+0x1cf3/0x1eb0 net/ipv6/datagram.c:719
+    ip6_datagram_recv_ctl+0x41c/0x450 net/ipv6/datagram.c:733
+    rawv6_recvmsg+0x10fb/0x1460 net/ipv6/raw.c:521
+    [..]
+
+This logic and its ipv4 counterpart read the destination port from
+the packet at skb_transport_offset(skb) + 4.
+
+With MSG_MORE and a local SOCK_RAW sender, syzbot was able to cook a
+packet that stores headers exactly up to skb_transport_offset(skb) in
+the head and the remainder in a frag.
+
+Call pskb_may_pull before accessing the pointer to ensure that it lies
+in skb head.
+
+Link: http://lkml.kernel.org/r/CAF=yD-LEJwZj5a1-bAAj2Oy_hKmGygV6rsJ_WOrAYnv-fnayiQ@mail.gmail.com
+Reported-by: syzbot+9adb4b567003cac781f0@syzkaller.appspotmail.com
+Signed-off-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/ip_sockglue.c |    7 +++++--
+ net/ipv6/datagram.c    |    7 +++++--
+ 2 files changed, 10 insertions(+), 4 deletions(-)
+
+--- a/net/ipv4/ip_sockglue.c
++++ b/net/ipv4/ip_sockglue.c
+@@ -148,15 +148,18 @@ static void ip_cmsg_recv_dstaddr(struct
+ {
+       struct sockaddr_in sin;
+       const struct iphdr *iph = ip_hdr(skb);
+-      __be16 *ports = (__be16 *)skb_transport_header(skb);
++      __be16 *ports;
++      int end;
+-      if (skb_transport_offset(skb) + 4 > (int)skb->len)
++      end = skb_transport_offset(skb) + 4;
++      if (end > 0 && !pskb_may_pull(skb, end))
+               return;
+       /* All current transport protocols have the port numbers in the
+        * first four bytes of the transport header and this function is
+        * written with this assumption in mind.
+        */
++      ports = (__be16 *)skb_transport_header(skb);
+       sin.sin_family = AF_INET;
+       sin.sin_addr.s_addr = iph->daddr;
+--- a/net/ipv6/datagram.c
++++ b/net/ipv6/datagram.c
+@@ -700,13 +700,16 @@ void ip6_datagram_recv_specific_ctl(stru
+       }
+       if (np->rxopt.bits.rxorigdstaddr) {
+               struct sockaddr_in6 sin6;
+-              __be16 *ports = (__be16 *) skb_transport_header(skb);
++              __be16 *ports;
++              int end;
+-              if (skb_transport_offset(skb) + 4 <= (int)skb->len) {
++              end = skb_transport_offset(skb) + 4;
++              if (end <= 0 || pskb_may_pull(skb, end)) {
+                       /* All current transport protocols have the port numbers in the
+                        * first four bytes of the transport header and this function is
+                        * written with this assumption in mind.
+                        */
++                      ports = (__be16 *)skb_transport_header(skb);
+                       sin6.sin6_family = AF_INET6;
+                       sin6.sin6_addr = ipv6_hdr(skb)->daddr;
diff --git a/queue-4.17/multicast-do-not-restore-deleted-record-source-filter-mode-to-new-one.patch b/queue-4.17/multicast-do-not-restore-deleted-record-source-filter-mode-to-new-one.patch
new file mode 100644 (file)
index 0000000..d87fd8c
--- /dev/null
@@ -0,0 +1,56 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Hangbin Liu <liuhangbin@gmail.com>
+Date: Fri, 20 Jul 2018 14:04:27 +0800
+Subject: multicast: do not restore deleted record source filter mode to new one
+
+From: Hangbin Liu <liuhangbin@gmail.com>
+
+There are two scenarios that we will restore deleted records. The first is
+when device down and up(or unmap/remap). In this scenario the new filter
+mode is same with previous one. Because we get it from in_dev->mc_list and
+we do not touch it during device down and up.
+
+The other scenario is when a new socket join a group which was just delete
+and not finish sending status reports. In this scenario, we should use the
+current filter mode instead of restore old one. Here are 4 cases in total.
+
+old_socket        new_socket       before_fix       after_fix
+  IN(A)             IN(A)           ALLOW(A)         ALLOW(A)
+  IN(A)             EX( )           TO_IN( )         TO_EX( )
+  EX( )             IN(A)           TO_EX( )         ALLOW(A)
+  EX( )             EX( )           TO_EX( )         TO_EX( )
+
+Fixes: 24803f38a5c0b (igmp: do not remove igmp souce list info when set link down)
+Fixes: 1666d49e1d416 (mld: do not remove mld souce list info when set link down)
+Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/igmp.c  |    3 +--
+ net/ipv6/mcast.c |    3 +--
+ 2 files changed, 2 insertions(+), 4 deletions(-)
+
+--- a/net/ipv4/igmp.c
++++ b/net/ipv4/igmp.c
+@@ -1201,8 +1201,7 @@ static void igmpv3_del_delrec(struct in_
+       if (pmc) {
+               im->interface = pmc->interface;
+               im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+-              im->sfmode = pmc->sfmode;
+-              if (pmc->sfmode == MCAST_INCLUDE) {
++              if (im->sfmode == MCAST_INCLUDE) {
+                       im->tomb = pmc->tomb;
+                       im->sources = pmc->sources;
+                       for (psf = im->sources; psf; psf = psf->sf_next)
+--- a/net/ipv6/mcast.c
++++ b/net/ipv6/mcast.c
+@@ -771,8 +771,7 @@ static void mld_del_delrec(struct inet6_
+       if (pmc) {
+               im->idev = pmc->idev;
+               im->mca_crcount = idev->mc_qrv;
+-              im->mca_sfmode = pmc->mca_sfmode;
+-              if (pmc->mca_sfmode == MCAST_INCLUDE) {
++              if (im->mca_sfmode == MCAST_INCLUDE) {
+                       im->mca_tomb = pmc->mca_tomb;
+                       im->mca_sources = pmc->mca_sources;
+                       for (psf = im->mca_sources; psf; psf = psf->sf_next)
diff --git a/queue-4.17/net-dsa-mv88e6xxx-fix-races-between-lock-and-irq-freeing.patch b/queue-4.17/net-dsa-mv88e6xxx-fix-races-between-lock-and-irq-freeing.patch
new file mode 100644 (file)
index 0000000..e1aeb00
--- /dev/null
@@ -0,0 +1,98 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: "Uwe Kleine-König" <u.kleine-koenig@pengutronix.de>
+Date: Fri, 20 Jul 2018 11:53:15 +0200
+Subject: net: dsa: mv88e6xxx: fix races between lock and irq freeing
+
+From: "Uwe Kleine-König" <u.kleine-koenig@pengutronix.de>
+
+[ Upstream commit 3d82475ad46c0b65f2618b5f2bbb4cadbb5ac5d8 ]
+
+free_irq() waits until all handlers for this IRQ have completed. As the
+relevant handler (mv88e6xxx_g1_irq_thread_fn()) takes the chip's reg_lock
+it might never return if the thread calling free_irq() holds this lock.
+
+For the same reason kthread_cancel_delayed_work_sync() in the polling case
+must not hold this lock.
+
+Also first free the irq (or stop the worker respectively) such that
+mv88e6xxx_g1_irq_thread_work() isn't called any more before the irq
+mappings are dropped in mv88e6xxx_g1_irq_free_common() to prevent the
+worker thread to call handle_nested_irq(0) which results in a NULL-pointer
+exception.
+
+Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/dsa/mv88e6xxx/chip.c |   21 +++++++++++++--------
+ 1 file changed, 13 insertions(+), 8 deletions(-)
+
+--- a/drivers/net/dsa/mv88e6xxx/chip.c
++++ b/drivers/net/dsa/mv88e6xxx/chip.c
+@@ -341,6 +341,7 @@ static const struct irq_domain_ops mv88e
+       .xlate  = irq_domain_xlate_twocell,
+ };
++/* To be called with reg_lock held */
+ static void mv88e6xxx_g1_irq_free_common(struct mv88e6xxx_chip *chip)
+ {
+       int irq, virq;
+@@ -360,9 +361,15 @@ static void mv88e6xxx_g1_irq_free_common
+ static void mv88e6xxx_g1_irq_free(struct mv88e6xxx_chip *chip)
+ {
+-      mv88e6xxx_g1_irq_free_common(chip);
+-
++      /*
++       * free_irq must be called without reg_lock taken because the irq
++       * handler takes this lock, too.
++       */
+       free_irq(chip->irq, chip);
++
++      mutex_lock(&chip->reg_lock);
++      mv88e6xxx_g1_irq_free_common(chip);
++      mutex_unlock(&chip->reg_lock);
+ }
+ static int mv88e6xxx_g1_irq_setup_common(struct mv88e6xxx_chip *chip)
+@@ -467,10 +474,12 @@ static int mv88e6xxx_irq_poll_setup(stru
+ static void mv88e6xxx_irq_poll_free(struct mv88e6xxx_chip *chip)
+ {
+-      mv88e6xxx_g1_irq_free_common(chip);
+-
+       kthread_cancel_delayed_work_sync(&chip->irq_poll_work);
+       kthread_destroy_worker(chip->kworker);
++
++      mutex_lock(&chip->reg_lock);
++      mv88e6xxx_g1_irq_free_common(chip);
++      mutex_unlock(&chip->reg_lock);
+ }
+ int mv88e6xxx_wait(struct mv88e6xxx_chip *chip, int addr, int reg, u16 mask)
+@@ -4286,12 +4295,10 @@ out_g2_irq:
+       if (chip->info->g2_irqs > 0)
+               mv88e6xxx_g2_irq_free(chip);
+ out_g1_irq:
+-      mutex_lock(&chip->reg_lock);
+       if (chip->irq > 0)
+               mv88e6xxx_g1_irq_free(chip);
+       else
+               mv88e6xxx_irq_poll_free(chip);
+-      mutex_unlock(&chip->reg_lock);
+ out:
+       return err;
+ }
+@@ -4316,12 +4323,10 @@ static void mv88e6xxx_remove(struct mdio
+       if (chip->info->g2_irqs > 0)
+               mv88e6xxx_g2_irq_free(chip);
+-      mutex_lock(&chip->reg_lock);
+       if (chip->irq > 0)
+               mv88e6xxx_g1_irq_free(chip);
+       else
+               mv88e6xxx_irq_poll_free(chip);
+-      mutex_unlock(&chip->reg_lock);
+ }
+ static const struct of_device_id mv88e6xxx_of_match[] = {
diff --git a/queue-4.17/net-ipv6-fix-linklocal-to-global-address-with-vrf.patch b/queue-4.17/net-ipv6-fix-linklocal-to-global-address-with-vrf.patch
new file mode 100644 (file)
index 0000000..3a58b5e
--- /dev/null
@@ -0,0 +1,93 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: David Ahern <dsahern@gmail.com>
+Date: Thu, 19 Jul 2018 12:41:18 -0700
+Subject: net/ipv6: Fix linklocal to global address with VRF
+
+From: David Ahern <dsahern@gmail.com>
+
+[ Upstream commit 24b711edfc34bc45777a3f068812b7d1ed004a5d ]
+
+Example setup:
+    host: ip -6 addr add dev eth1 2001:db8:104::4
+           where eth1 is enslaved to a VRF
+
+    switch: ip -6 ro add 2001:db8:104::4/128 dev br1
+            where br1 only has an LLA
+
+           ping6 2001:db8:104::4
+           ssh   2001:db8:104::4
+
+(NOTE: UDP works fine if the PKTINFO has the address set to the global
+address and ifindex is set to the index of eth1 with a destination an
+LLA).
+
+For ICMP, icmp6_iif needs to be updated to check if skb->dev is an
+L3 master. If it is then return the ifindex from rt6i_idev similar
+to what is done for loopback.
+
+For TCP, restore the original tcp_v6_iif definition which is needed in
+most places and add a new tcp_v6_iif_l3_slave that considers the
+l3_slave variability. This latter check is only needed for socket
+lookups.
+
+Fixes: 9ff74384600a ("net: vrf: Handle ipv6 multicast and link-local addresses")
+Signed-off-by: David Ahern <dsahern@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/tcp.h   |    5 +++++
+ net/ipv6/icmp.c     |    5 +++--
+ net/ipv6/tcp_ipv6.c |    6 ++++--
+ 3 files changed, 12 insertions(+), 4 deletions(-)
+
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -829,6 +829,11 @@ struct tcp_skb_cb {
+  */
+ static inline int tcp_v6_iif(const struct sk_buff *skb)
+ {
++      return TCP_SKB_CB(skb)->header.h6.iif;
++}
++
++static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb)
++{
+       bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags);
+       return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif;
+--- a/net/ipv6/icmp.c
++++ b/net/ipv6/icmp.c
+@@ -402,9 +402,10 @@ static int icmp6_iif(const struct sk_buf
+       /* for local traffic to local address, skb dev is the loopback
+        * device. Check if there is a dst attached to the skb and if so
+-       * get the real device index.
++       * get the real device index. Same is needed for replies to a link
++       * local address on a device enslaved to an L3 master device
+        */
+-      if (unlikely(iif == LOOPBACK_IFINDEX)) {
++      if (unlikely(iif == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) {
+               const struct rt6_info *rt6 = skb_rt6_info(skb);
+               if (rt6)
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -934,7 +934,8 @@ static void tcp_v6_send_reset(const stru
+                                          &tcp_hashinfo, NULL, 0,
+                                          &ipv6h->saddr,
+                                          th->source, &ipv6h->daddr,
+-                                         ntohs(th->source), tcp_v6_iif(skb),
++                                         ntohs(th->source),
++                                         tcp_v6_iif_l3_slave(skb),
+                                          tcp_v6_sdif(skb));
+               if (!sk1)
+                       goto out;
+@@ -1605,7 +1606,8 @@ do_time_wait:
+                                           skb, __tcp_hdrlen(th),
+                                           &ipv6_hdr(skb)->saddr, th->source,
+                                           &ipv6_hdr(skb)->daddr,
+-                                          ntohs(th->dest), tcp_v6_iif(skb),
++                                          ntohs(th->dest),
++                                          tcp_v6_iif_l3_slave(skb),
+                                           sdif);
+               if (sk2) {
+                       struct inet_timewait_sock *tw = inet_twsk(sk);
diff --git a/queue-4.17/net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch b/queue-4.17/net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch
new file mode 100644 (file)
index 0000000..3a9a587
--- /dev/null
@@ -0,0 +1,40 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Jack Morgenstein <jackm@dev.mellanox.co.il>
+Date: Tue, 24 Jul 2018 14:27:55 +0300
+Subject: net/mlx4_core: Save the qpn from the input modifier in RST2INIT wrapper
+
+From: Jack Morgenstein <jackm@dev.mellanox.co.il>
+
+[ Upstream commit 958c696f5a7274d9447a458ad7aa70719b29a50a ]
+
+Function mlx4_RST2INIT_QP_wrapper saved the qp number passed in the qp
+context, rather than the one passed in the input modifier.
+
+However, the qp number in the qp context is not defined as a
+required parameter by the FW. Therefore, drivers may choose to not
+specify the qp number in the qp context for the reset-to-init transition.
+
+Thus, we must save the qp number passed in the command input modifier --
+which is always present. (This saved qp number is used as the input
+modifier for command 2RST_QP when a slave's qp's are destroyed).
+
+Fixes: c82e9aa0a8bc ("mlx4_core: resource tracking for HCA resources used by guests")
+Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
+Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx4/resource_tracker.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
++++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+@@ -2956,7 +2956,7 @@ int mlx4_RST2INIT_QP_wrapper(struct mlx4
+       u32 srqn = qp_get_srqn(qpc) & 0xffffff;
+       int use_srq = (qp_get_srqn(qpc) >> 24) & 1;
+       struct res_srq *srq;
+-      int local_qpn = be32_to_cpu(qpc->local_qpn) & 0xffffff;
++      int local_qpn = vhcr->in_modifier & 0xffffff;
+       err = adjust_qp_sched_queue(dev, slave, qpc, inbox);
+       if (err)
diff --git a/queue-4.17/net-mlx5-adjust-clock-overflow-work-period.patch b/queue-4.17/net-mlx5-adjust-clock-overflow-work-period.patch
new file mode 100644 (file)
index 0000000..a3d7520
--- /dev/null
@@ -0,0 +1,70 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Ariel Levkovich <lariel@mellanox.com>
+Date: Mon, 25 Jun 2018 19:12:02 +0300
+Subject: net/mlx5: Adjust clock overflow work period
+
+From: Ariel Levkovich <lariel@mellanox.com>
+
+[ Upstream commit 33180bee86a8940a84950edca46315cd9dd6deb5 ]
+
+When driver converts HW timestamp to wall clock time it subtracts
+the last saved cycle counter from the HW timestamp and converts the
+difference to nanoseconds.
+The conversion is done by multiplying the cycles difference with the
+clock multiplier value as a first step and therefore the cycles
+difference should be small enough so that the multiplication product
+doesn't exceed 64bit.
+
+The overflow handling routine is in charge of updating the last saved
+cycle counter in driver and it is called periodically using kernel
+delayed workqueue.
+
+The delay period for this work is calculated using the max HW cycle
+counter value (a 41 bit mask) as a base which doesn't take the 64bit
+limit into account so the delay period may be incorrect and too
+long to prevent a large difference between the HW counter and the last
+saved counter in SW.
+
+This change adjusts the work period for the HW clock overflow work by
+taking the minimum between the previous value and the quotient of max
+u64 value and the clock multiplier value.
+
+Fixes: ef9814deafd0 ("net/mlx5e: Add HW timestamping (TS) support")
+Signed-off-by: Ariel Levkovich <lariel@mellanox.com>
+Reviewed-by: Eran Ben Elisha <eranbe@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c |   12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c
+@@ -487,6 +487,7 @@ void mlx5_pps_event(struct mlx5_core_dev
+ void mlx5_init_clock(struct mlx5_core_dev *mdev)
+ {
+       struct mlx5_clock *clock = &mdev->clock;
++      u64 overflow_cycles;
+       u64 ns;
+       u64 frac = 0;
+       u32 dev_freq;
+@@ -510,10 +511,17 @@ void mlx5_init_clock(struct mlx5_core_de
+       /* Calculate period in seconds to call the overflow watchdog - to make
+        * sure counter is checked at least once every wrap around.
++       * The period is calculated as the minimum between max HW cycles count
++       * (The clock source mask) and max amount of cycles that can be
++       * multiplied by clock multiplier where the result doesn't exceed
++       * 64bits.
+        */
+-      ns = cyclecounter_cyc2ns(&clock->cycles, clock->cycles.mask,
++      overflow_cycles = div64_u64(~0ULL >> 1, clock->cycles.mult);
++      overflow_cycles = min(overflow_cycles, clock->cycles.mask >> 1);
++
++      ns = cyclecounter_cyc2ns(&clock->cycles, overflow_cycles,
+                                frac, &frac);
+-      do_div(ns, NSEC_PER_SEC / 2 / HZ);
++      do_div(ns, NSEC_PER_SEC / HZ);
+       clock->overflow_period = ns;
+       mdev->clock_info_page = alloc_page(GFP_KERNEL);
diff --git a/queue-4.17/net-mlx5-e-switch-ubsan-fix-undefined-behavior-in-mlx5_eswitch_mode.patch b/queue-4.17/net-mlx5-e-switch-ubsan-fix-undefined-behavior-in-mlx5_eswitch_mode.patch
new file mode 100644 (file)
index 0000000..83fca61
--- /dev/null
@@ -0,0 +1,50 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Saeed Mahameed <saeedm@mellanox.com>
+Date: Mon, 9 Jul 2018 16:41:40 -0700
+Subject: net/mlx5: E-Switch, UBSAN fix undefined behavior in mlx5_eswitch_mode
+
+From: Saeed Mahameed <saeedm@mellanox.com>
+
+[ Upstream commit 443a858158d35916e572b75667ca4924a6af2182 ]
+
+With debug kernel UBSAN detects the following issue, which might happen
+when eswitch instance is not created, fix this by testing the eswitch
+pointer before returning the eswitch mode, if not set return mode =
+SRIOV_NONE.
+
+[   32.528951] UBSAN: Undefined behaviour in drivers/net/ethernet/mellanox/mlx5/core/eswitch.c:2219:12
+[   32.528951] member access within null pointer of type 'struct mlx5_eswitch'
+[   32.528951] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.18.0-rc3-dirty #181
+[   32.528951] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014
+[   32.528951] Call Trace:
+[   32.528951]  dump_stack+0xc7/0x13b
+[   32.528951]  ? show_regs_print_info+0x5/0x5
+[   32.528951]  ? __pm_runtime_use_autosuspend+0x140/0x140
+[   32.528951]  ubsan_epilogue+0x9/0x49
+[   32.528951]  ubsan_type_mismatch_common+0x1f9/0x2c0
+[   32.528951]  ? ucs2_as_utf8+0x310/0x310
+[   32.528951]  ? device_initialize+0x229/0x2e0
+[   32.528951]  __ubsan_handle_type_mismatch+0x9f/0xc9
+[   32.528951]  ? __ubsan_handle_divrem_overflow+0x19b/0x19b
+[   32.578008]  ? ib_device_get_by_index+0xf0/0xf0
+[   32.578008]  mlx5_eswitch_mode+0x30/0x40
+[   32.578008]  mlx5_ib_add+0x1e0/0x4a0
+
+Fixes: 57cbd893c4c5 ("net/mlx5: E-Switch, Move representors definition to a global scope")
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Reviewed-by: Leon Romanovsky <leonro@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/eswitch.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c
+@@ -2221,6 +2221,6 @@ free_out:
+ u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw)
+ {
+-      return esw->mode;
++      return ESW_ALLOWED(esw) ? esw->mode : SRIOV_NONE;
+ }
+ EXPORT_SYMBOL_GPL(mlx5_eswitch_mode);
diff --git a/queue-4.17/net-mlx5e-add-ingress-egress-indication-for-offloaded-tc-flows.patch b/queue-4.17/net-mlx5e-add-ingress-egress-indication-for-offloaded-tc-flows.patch
new file mode 100644 (file)
index 0000000..04942c8
--- /dev/null
@@ -0,0 +1,266 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Or Gerlitz <ogerlitz@mellanox.com>
+Date: Wed, 18 Apr 2018 13:45:11 +0300
+Subject: net/mlx5e: Add ingress/egress indication for offloaded TC flows
+
+From: Or Gerlitz <ogerlitz@mellanox.com>
+
+[ Upstream commit 60bd4af814fec164c42bdd2efd7984b85d6b1e1e ]
+
+When an e-switch TC rule is offloaded through the egdev (egress
+device) mechanism, we treat this as egress, all other cases (NIC
+and e-switch) are considered ingress.
+
+This is a preparation step that will allow us to identify "wrong"
+stat/del offload calls made by the TC core on egdev based flows and
+ignore them.
+
+Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
+Signed-off-by: Jiri Pirko <jiri@mellanox.com>
+Reviewed-by: Paul Blakey <paulb@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en.h      |    3 -
+ drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   15 ++++----
+ drivers/net/ethernet/mellanox/mlx5/core/en_rep.c  |   32 +++++++++++++-----
+ drivers/net/ethernet/mellanox/mlx5/core/en_tc.c   |   38 ++++++++++++++++------
+ drivers/net/ethernet/mellanox/mlx5/core/en_tc.h   |   13 +++++--
+ 5 files changed, 70 insertions(+), 31 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
+@@ -1092,9 +1092,6 @@ int mlx5e_ethtool_get_ts_info(struct mlx
+ int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv,
+                              struct ethtool_flash *flash);
+-int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+-                          void *cb_priv);
+-
+ /* mlx5e generic netdev management API */
+ struct net_device*
+ mlx5e_create_netdev(struct mlx5_core_dev *mdev, const struct mlx5e_profile *profile,
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+@@ -3093,22 +3093,23 @@ out:
+ #ifdef CONFIG_MLX5_ESWITCH
+ static int mlx5e_setup_tc_cls_flower(struct mlx5e_priv *priv,
+-                                   struct tc_cls_flower_offload *cls_flower)
++                                   struct tc_cls_flower_offload *cls_flower,
++                                   int flags)
+ {
+       switch (cls_flower->command) {
+       case TC_CLSFLOWER_REPLACE:
+-              return mlx5e_configure_flower(priv, cls_flower);
++              return mlx5e_configure_flower(priv, cls_flower, flags);
+       case TC_CLSFLOWER_DESTROY:
+-              return mlx5e_delete_flower(priv, cls_flower);
++              return mlx5e_delete_flower(priv, cls_flower, flags);
+       case TC_CLSFLOWER_STATS:
+-              return mlx5e_stats_flower(priv, cls_flower);
++              return mlx5e_stats_flower(priv, cls_flower, flags);
+       default:
+               return -EOPNOTSUPP;
+       }
+ }
+-int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
+-                          void *cb_priv)
++static int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data,
++                                 void *cb_priv)
+ {
+       struct mlx5e_priv *priv = cb_priv;
+@@ -3117,7 +3118,7 @@ int mlx5e_setup_tc_block_cb(enum tc_setu
+       switch (type) {
+       case TC_SETUP_CLSFLOWER:
+-              return mlx5e_setup_tc_cls_flower(priv, type_data);
++              return mlx5e_setup_tc_cls_flower(priv, type_data, MLX5E_TC_INGRESS);
+       default:
+               return -EOPNOTSUPP;
+       }
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c
+@@ -723,15 +723,31 @@ static int mlx5e_rep_get_phys_port_name(
+ static int
+ mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv,
+-                            struct tc_cls_flower_offload *cls_flower)
++                            struct tc_cls_flower_offload *cls_flower, int flags)
+ {
+       switch (cls_flower->command) {
+       case TC_CLSFLOWER_REPLACE:
+-              return mlx5e_configure_flower(priv, cls_flower);
++              return mlx5e_configure_flower(priv, cls_flower, flags);
+       case TC_CLSFLOWER_DESTROY:
+-              return mlx5e_delete_flower(priv, cls_flower);
++              return mlx5e_delete_flower(priv, cls_flower, flags);
+       case TC_CLSFLOWER_STATS:
+-              return mlx5e_stats_flower(priv, cls_flower);
++              return mlx5e_stats_flower(priv, cls_flower, flags);
++      default:
++              return -EOPNOTSUPP;
++      }
++}
++
++static int mlx5e_rep_setup_tc_cb_egdev(enum tc_setup_type type, void *type_data,
++                                     void *cb_priv)
++{
++      struct mlx5e_priv *priv = cb_priv;
++
++      if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data))
++              return -EOPNOTSUPP;
++
++      switch (type) {
++      case TC_SETUP_CLSFLOWER:
++              return mlx5e_rep_setup_tc_cls_flower(priv, type_data, MLX5E_TC_EGRESS);
+       default:
+               return -EOPNOTSUPP;
+       }
+@@ -747,7 +763,7 @@ static int mlx5e_rep_setup_tc_cb(enum tc
+       switch (type) {
+       case TC_SETUP_CLSFLOWER:
+-              return mlx5e_rep_setup_tc_cls_flower(priv, type_data);
++              return mlx5e_rep_setup_tc_cls_flower(priv, type_data, MLX5E_TC_INGRESS);
+       default:
+               return -EOPNOTSUPP;
+       }
+@@ -1111,7 +1127,7 @@ mlx5e_vport_rep_load(struct mlx5_core_de
+       uplink_rpriv = mlx5_eswitch_get_uplink_priv(dev->priv.eswitch, REP_ETH);
+       upriv = netdev_priv(uplink_rpriv->netdev);
+-      err = tc_setup_cb_egdev_register(netdev, mlx5e_setup_tc_block_cb,
++      err = tc_setup_cb_egdev_register(netdev, mlx5e_rep_setup_tc_cb_egdev,
+                                        upriv);
+       if (err)
+               goto err_neigh_cleanup;
+@@ -1126,7 +1142,7 @@ mlx5e_vport_rep_load(struct mlx5_core_de
+       return 0;
+ err_egdev_cleanup:
+-      tc_setup_cb_egdev_unregister(netdev, mlx5e_setup_tc_block_cb,
++      tc_setup_cb_egdev_unregister(netdev, mlx5e_rep_setup_tc_cb_egdev,
+                                    upriv);
+ err_neigh_cleanup:
+@@ -1155,7 +1171,7 @@ mlx5e_vport_rep_unload(struct mlx5_eswit
+       uplink_rpriv = mlx5_eswitch_get_uplink_priv(priv->mdev->priv.eswitch,
+                                                   REP_ETH);
+       upriv = netdev_priv(uplink_rpriv->netdev);
+-      tc_setup_cb_egdev_unregister(netdev, mlx5e_setup_tc_block_cb,
++      tc_setup_cb_egdev_unregister(netdev, mlx5e_rep_setup_tc_cb_egdev,
+                                    upriv);
+       mlx5e_rep_neigh_cleanup(rpriv);
+       mlx5e_detach_netdev(priv);
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+@@ -61,12 +61,16 @@ struct mlx5_nic_flow_attr {
+       struct mlx5_flow_table  *hairpin_ft;
+ };
++#define MLX5E_TC_FLOW_BASE (MLX5E_TC_LAST_EXPORTED_BIT + 1)
++
+ enum {
+-      MLX5E_TC_FLOW_ESWITCH   = BIT(0),
+-      MLX5E_TC_FLOW_NIC       = BIT(1),
+-      MLX5E_TC_FLOW_OFFLOADED = BIT(2),
+-      MLX5E_TC_FLOW_HAIRPIN   = BIT(3),
+-      MLX5E_TC_FLOW_HAIRPIN_RSS = BIT(4),
++      MLX5E_TC_FLOW_INGRESS   = MLX5E_TC_INGRESS,
++      MLX5E_TC_FLOW_EGRESS    = MLX5E_TC_EGRESS,
++      MLX5E_TC_FLOW_ESWITCH   = BIT(MLX5E_TC_FLOW_BASE),
++      MLX5E_TC_FLOW_NIC       = BIT(MLX5E_TC_FLOW_BASE + 1),
++      MLX5E_TC_FLOW_OFFLOADED = BIT(MLX5E_TC_FLOW_BASE + 2),
++      MLX5E_TC_FLOW_HAIRPIN   = BIT(MLX5E_TC_FLOW_BASE + 3),
++      MLX5E_TC_FLOW_HAIRPIN_RSS = BIT(MLX5E_TC_FLOW_BASE + 4),
+ };
+ struct mlx5e_tc_flow {
+@@ -2566,8 +2570,20 @@ static int parse_tc_fdb_actions(struct m
+       return err;
+ }
++static void get_flags(int flags, u8 *flow_flags)
++{
++      u8 __flow_flags = 0;
++
++      if (flags & MLX5E_TC_INGRESS)
++              __flow_flags |= MLX5E_TC_FLOW_INGRESS;
++      if (flags & MLX5E_TC_EGRESS)
++              __flow_flags |= MLX5E_TC_FLOW_EGRESS;
++
++      *flow_flags = __flow_flags;
++}
++
+ int mlx5e_configure_flower(struct mlx5e_priv *priv,
+-                         struct tc_cls_flower_offload *f)
++                         struct tc_cls_flower_offload *f, int flags)
+ {
+       struct mlx5_eswitch *esw = priv->mdev->priv.eswitch;
+       struct mlx5e_tc_flow_parse_attr *parse_attr;
+@@ -2576,11 +2592,13 @@ int mlx5e_configure_flower(struct mlx5e_
+       int attr_size, err = 0;
+       u8 flow_flags = 0;
++      get_flags(flags, &flow_flags);
++
+       if (esw && esw->mode == SRIOV_OFFLOADS) {
+-              flow_flags = MLX5E_TC_FLOW_ESWITCH;
++              flow_flags |= MLX5E_TC_FLOW_ESWITCH;
+               attr_size  = sizeof(struct mlx5_esw_flow_attr);
+       } else {
+-              flow_flags = MLX5E_TC_FLOW_NIC;
++              flow_flags |= MLX5E_TC_FLOW_NIC;
+               attr_size  = sizeof(struct mlx5_nic_flow_attr);
+       }
+@@ -2639,7 +2657,7 @@ err_free:
+ }
+ int mlx5e_delete_flower(struct mlx5e_priv *priv,
+-                      struct tc_cls_flower_offload *f)
++                      struct tc_cls_flower_offload *f, int flags)
+ {
+       struct mlx5e_tc_flow *flow;
+       struct mlx5e_tc_table *tc = &priv->fs.tc;
+@@ -2659,7 +2677,7 @@ int mlx5e_delete_flower(struct mlx5e_pri
+ }
+ int mlx5e_stats_flower(struct mlx5e_priv *priv,
+-                     struct tc_cls_flower_offload *f)
++                     struct tc_cls_flower_offload *f, int flags)
+ {
+       struct mlx5e_tc_table *tc = &priv->fs.tc;
+       struct mlx5e_tc_flow *flow;
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h
+@@ -38,16 +38,23 @@
+ #define MLX5E_TC_FLOW_ID_MASK 0x0000ffff
+ #ifdef CONFIG_MLX5_ESWITCH
++
++enum {
++      MLX5E_TC_INGRESS = BIT(0),
++      MLX5E_TC_EGRESS  = BIT(1),
++      MLX5E_TC_LAST_EXPORTED_BIT = 1,
++};
++
+ int mlx5e_tc_init(struct mlx5e_priv *priv);
+ void mlx5e_tc_cleanup(struct mlx5e_priv *priv);
+ int mlx5e_configure_flower(struct mlx5e_priv *priv,
+-                         struct tc_cls_flower_offload *f);
++                         struct tc_cls_flower_offload *f, int flags);
+ int mlx5e_delete_flower(struct mlx5e_priv *priv,
+-                      struct tc_cls_flower_offload *f);
++                      struct tc_cls_flower_offload *f, int flags);
+ int mlx5e_stats_flower(struct mlx5e_priv *priv,
+-                     struct tc_cls_flower_offload *f);
++                     struct tc_cls_flower_offload *f, int flags);
+ struct mlx5e_encap_entry;
+ void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv,
diff --git a/queue-4.17/net-mlx5e-don-t-allow-arfs-for-encapsulated-packets.patch b/queue-4.17/net-mlx5e-don-t-allow-arfs-for-encapsulated-packets.patch
new file mode 100644 (file)
index 0000000..4713e6f
--- /dev/null
@@ -0,0 +1,33 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Eran Ben Elisha <eranbe@mellanox.com>
+Date: Sun, 8 Jul 2018 14:52:12 +0300
+Subject: net/mlx5e: Don't allow aRFS for encapsulated packets
+
+From: Eran Ben Elisha <eranbe@mellanox.com>
+
+[ Upstream commit d2e1c57bcf9a07cbb67f30ecf238f298799bce1c ]
+
+Driver is yet to support aRFS for encapsulated packets, return early
+error in such case.
+
+Fixes: 18c908e477dc ("net/mlx5e: Add accelerated RFS support")
+Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
+Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+@@ -711,6 +711,9 @@ int mlx5e_rx_flow_steer(struct net_devic
+           skb->protocol != htons(ETH_P_IPV6))
+               return -EPROTONOSUPPORT;
++      if (skb->encapsulation)
++              return -EPROTONOSUPPORT;
++
+       arfs_t = arfs_get_table(arfs, arfs_get_ip_proto(skb), skb->protocol);
+       if (!arfs_t)
+               return -EPROTONOSUPPORT;
diff --git a/queue-4.17/net-mlx5e-fix-quota-counting-in-arfs-expire-flow.patch b/queue-4.17/net-mlx5e-fix-quota-counting-in-arfs-expire-flow.patch
new file mode 100644 (file)
index 0000000..9d3b0d3
--- /dev/null
@@ -0,0 +1,41 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Eran Ben Elisha <eranbe@mellanox.com>
+Date: Sun, 8 Jul 2018 13:08:55 +0300
+Subject: net/mlx5e: Fix quota counting in aRFS expire flow
+
+From: Eran Ben Elisha <eranbe@mellanox.com>
+
+[ Upstream commit 2630bae8018823c3b88788b69fb9f16ea3b4a11e ]
+
+Quota should follow the amount of rules which do expire, and not the
+number of rules that were examined, fixed that.
+
+Fixes: 18c908e477dc ("net/mlx5e: Add accelerated RFS support")
+Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
+Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
+Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+@@ -381,14 +381,14 @@ static void arfs_may_expire_flow(struct
+       HLIST_HEAD(del_list);
+       spin_lock_bh(&priv->fs.arfs.arfs_lock);
+       mlx5e_for_each_arfs_rule(arfs_rule, htmp, priv->fs.arfs.arfs_tables, i, j) {
+-              if (quota++ > MLX5E_ARFS_EXPIRY_QUOTA)
+-                      break;
+               if (!work_pending(&arfs_rule->arfs_work) &&
+                   rps_may_expire_flow(priv->netdev,
+                                       arfs_rule->rxq, arfs_rule->flow_id,
+                                       arfs_rule->filter_id)) {
+                       hlist_del_init(&arfs_rule->hlist);
+                       hlist_add_head(&arfs_rule->hlist, &del_list);
++                      if (quota++ > MLX5E_ARFS_EXPIRY_QUOTA)
++                              break;
+               }
+       }
+       spin_unlock_bh(&priv->fs.arfs.arfs_lock);
diff --git a/queue-4.17/net-mlx5e-only-allow-offloading-decap-egress-egdev-flows.patch b/queue-4.17/net-mlx5e-only-allow-offloading-decap-egress-egdev-flows.patch
new file mode 100644 (file)
index 0000000..2954922
--- /dev/null
@@ -0,0 +1,39 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Roi Dayan <roid@mellanox.com>
+Date: Thu, 12 Jul 2018 18:25:59 +0300
+Subject: net/mlx5e: Only allow offloading decap egress (egdev) flows
+
+From: Roi Dayan <roid@mellanox.com>
+
+[ Upstream commit 7e29392eee7a1e3318eeb1099807264a49f60e33 ]
+
+We get egress rules through the egdev mechanism when the ingress device
+is not supporting offload, with the expected use-case of tunnel decap
+ingress rule set on shared tunnel device.
+
+Make sure to offload egress/egdev rules only if decap action (tunnel key
+unset) exists there and err otherwise.
+
+Fixes: 717503b9cf57 ("net: sched: convert cls_flower->egress_dev users to tc_setup_cb_egdev infra")
+Signed-off-by: Roi Dayan <roid@mellanox.com>
+Signed-off-by: Paul Blakey <paulb@mellanox.com>
+Reviewed-by: Or Gerlitz <ogerlitz@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_tc.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+@@ -1894,6 +1894,10 @@ static bool actions_match_supported(stru
+       else
+               actions = flow->nic_attr->action;
++      if (flow->flags & MLX5E_TC_FLOW_EGRESS &&
++          !(actions & MLX5_FLOW_CONTEXT_ACTION_DECAP))
++              return false;
++
+       if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR)
+               return modify_header_match_supported(&parse_attr->spec, exts);
diff --git a/queue-4.17/net-mlx5e-refine-ets-validation-function.patch b/queue-4.17/net-mlx5e-refine-ets-validation-function.patch
new file mode 100644 (file)
index 0000000..4de3267
--- /dev/null
@@ -0,0 +1,74 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Shay Agroskin <shayag@mellanox.com>
+Date: Wed, 27 Jun 2018 15:43:07 +0300
+Subject: net/mlx5e: Refine ets validation function
+
+From: Shay Agroskin <shayag@mellanox.com>
+
+[ Upstream commit e279d634f3d57452eb106a0c0e99a6add3fba1a6 ]
+
+Removed an error message received when configuring ETS total
+bandwidth to be zero.
+Our hardware doesn't support such configuration, so we shall
+reject it in the driver. Nevertheless, we removed the error message
+in order to eliminate error messages caused by old userspace tools
+who try to pass such configuration.
+
+Fixes: ff0891915cd7 ("net/mlx5e: Fix ETS BW check")
+Signed-off-by: Shay Agroskin <shayag@mellanox.com>
+Reviewed-by: Huy Nguyen <huyn@mellanox.com>
+Reviewed-by: Eran Ben Elisha <eranbe@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c |   17 ++++++++---------
+ 1 file changed, 8 insertions(+), 9 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c
+@@ -272,7 +272,8 @@ int mlx5e_dcbnl_ieee_setets_core(struct
+ }
+ static int mlx5e_dbcnl_validate_ets(struct net_device *netdev,
+-                                  struct ieee_ets *ets)
++                                  struct ieee_ets *ets,
++                                  bool zero_sum_allowed)
+ {
+       bool have_ets_tc = false;
+       int bw_sum = 0;
+@@ -297,8 +298,9 @@ static int mlx5e_dbcnl_validate_ets(stru
+       }
+       if (have_ets_tc && bw_sum != 100) {
+-              netdev_err(netdev,
+-                         "Failed to validate ETS: BW sum is illegal\n");
++              if (bw_sum || (!bw_sum && !zero_sum_allowed))
++                      netdev_err(netdev,
++                                 "Failed to validate ETS: BW sum is illegal\n");
+               return -EINVAL;
+       }
+       return 0;
+@@ -313,7 +315,7 @@ static int mlx5e_dcbnl_ieee_setets(struc
+       if (!MLX5_CAP_GEN(priv->mdev, ets))
+               return -EOPNOTSUPP;
+-      err = mlx5e_dbcnl_validate_ets(netdev, ets);
++      err = mlx5e_dbcnl_validate_ets(netdev, ets, false);
+       if (err)
+               return err;
+@@ -613,12 +615,9 @@ static u8 mlx5e_dcbnl_setall(struct net_
+                         ets.prio_tc[i]);
+       }
+-      err = mlx5e_dbcnl_validate_ets(netdev, &ets);
+-      if (err) {
+-              netdev_err(netdev,
+-                         "%s, Failed to validate ETS: %d\n", __func__, err);
++      err = mlx5e_dbcnl_validate_ets(netdev, &ets, true);
++      if (err)
+               goto out;
+-      }
+       err = mlx5e_dcbnl_ieee_setets_core(priv, &ets);
+       if (err) {
diff --git a/queue-4.17/net-next-hinic-fix-a-problem-in-hinic_xmit_frame.patch b/queue-4.17/net-next-hinic-fix-a-problem-in-hinic_xmit_frame.patch
new file mode 100644 (file)
index 0000000..82fe5b4
--- /dev/null
@@ -0,0 +1,37 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Zhao Chen <zhaochen6@huawei.com>
+Date: Wed, 18 Jul 2018 00:33:18 -0400
+Subject: net-next/hinic: fix a problem in hinic_xmit_frame()
+
+From: Zhao Chen <zhaochen6@huawei.com>
+
+[ Upstream commit f7482683f1f4925c60941dbbd0813ceaa069d106 ]
+
+The calculation of "wqe_size" is not correct when the tx queue is busy in
+hinic_xmit_frame().
+
+When there are no free WQEs, the tx flow will unmap the skb buffer, then
+ring the doorbell for the pending packets. But the "wqe_size" which is used
+to calculate the doorbell address is not correct. The wqe size should be
+cleared to 0, otherwise, it will cause a doorbell error.
+
+This patch fixes the problem.
+
+Reported-by: Zhou Wang <wangzhou1@hisilicon.com>
+Signed-off-by: Zhao Chen <zhaochen6@huawei.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/huawei/hinic/hinic_tx.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c
++++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c
+@@ -229,6 +229,7 @@ netdev_tx_t hinic_xmit_frame(struct sk_b
+               txq->txq_stats.tx_busy++;
+               u64_stats_update_end(&txq->txq_stats.syncp);
+               err = NETDEV_TX_BUSY;
++              wqe_size = 0;
+               goto flush_skbs;
+       }
diff --git a/queue-4.17/net-phy-consider-phy_ignore_interrupt-in-phy_start_aneg_priv.patch b/queue-4.17/net-phy-consider-phy_ignore_interrupt-in-phy_start_aneg_priv.patch
new file mode 100644 (file)
index 0000000..3bb5a09
--- /dev/null
@@ -0,0 +1,32 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Heiner Kallweit <hkallweit1@gmail.com>
+Date: Thu, 19 Jul 2018 08:15:16 +0200
+Subject: net: phy: consider PHY_IGNORE_INTERRUPT in phy_start_aneg_priv
+
+From: Heiner Kallweit <hkallweit1@gmail.com>
+
+[ Upstream commit 215d08a85b9acf5e1fe9dbf50f1774cde333efef ]
+
+The situation described in the comment can occur also with
+PHY_IGNORE_INTERRUPT, therefore change the condition to include it.
+
+Fixes: f555f34fdc58 ("net: phy: fix auto-negotiation stall due to unavailable interrupt")
+Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/phy/phy.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/phy/phy.c
++++ b/drivers/net/phy/phy.c
+@@ -514,7 +514,7 @@ static int phy_start_aneg_priv(struct ph
+        * negotiation may already be done and aneg interrupt may not be
+        * generated.
+        */
+-      if (phy_interrupt_is_valid(phydev) && (phydev->state == PHY_AN)) {
++      if (phydev->irq != PHY_POLL && phydev->state == PHY_AN) {
+               err = phy_aneg_done(phydev);
+               if (err > 0) {
+                       trigger = true;
diff --git a/queue-4.17/net-skb_segment-should-not-return-null.patch b/queue-4.17/net-skb_segment-should-not-return-null.patch
new file mode 100644 (file)
index 0000000..8f2831a
--- /dev/null
@@ -0,0 +1,139 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Eric Dumazet <edumazet@google.com>
+Date: Thu, 19 Jul 2018 16:04:38 -0700
+Subject: net: skb_segment() should not return NULL
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit ff907a11a0d68a749ce1a321f4505c03bf72190c ]
+
+syzbot caught a NULL deref [1], caused by skb_segment()
+
+skb_segment() has many "goto err;" that assume the @err variable
+contains -ENOMEM.
+
+A successful call to __skb_linearize() should not clear @err,
+otherwise a subsequent memory allocation error could return NULL.
+
+While we are at it, we might use -EINVAL instead of -ENOMEM when
+MAX_SKB_FRAGS limit is reached.
+
+[1]
+kasan: CONFIG_KASAN_INLINE enabled
+kasan: GPF could be caused by NULL-ptr deref or user memory access
+general protection fault: 0000 [#1] SMP KASAN
+CPU: 0 PID: 13285 Comm: syz-executor3 Not tainted 4.18.0-rc4+ #146
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+RIP: 0010:tcp_gso_segment+0x3dc/0x1780 net/ipv4/tcp_offload.c:106
+Code: f0 ff ff 0f 87 1c fd ff ff e8 00 88 0b fb 48 8b 75 d0 48 b9 00 00 00 00 00 fc ff df 48 8d be 90 00 00 00 48 89 f8 48 c1 e8 03 <0f> b6 14 08 48 8d 86 94 00 00 00 48 89 c6 83 e0 07 48 c1 ee 03 0f
+RSP: 0018:ffff88019b7fd060 EFLAGS: 00010206
+RAX: 0000000000000012 RBX: 0000000000000020 RCX: dffffc0000000000
+RDX: 0000000000040000 RSI: 0000000000000000 RDI: 0000000000000090
+RBP: ffff88019b7fd0f0 R08: ffff88019510e0c0 R09: ffffed003b5c46d6
+R10: ffffed003b5c46d6 R11: ffff8801dae236b3 R12: 0000000000000001
+R13: ffff8801d6c581f4 R14: 0000000000000000 R15: ffff8801d6c58128
+FS:  00007fcae64d6700(0000) GS:ffff8801dae00000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00000000004e8664 CR3: 00000001b669b000 CR4: 00000000001406f0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+Call Trace:
+ tcp4_gso_segment+0x1c3/0x440 net/ipv4/tcp_offload.c:54
+ inet_gso_segment+0x64e/0x12d0 net/ipv4/af_inet.c:1342
+ inet_gso_segment+0x64e/0x12d0 net/ipv4/af_inet.c:1342
+ skb_mac_gso_segment+0x3b5/0x740 net/core/dev.c:2792
+ __skb_gso_segment+0x3c3/0x880 net/core/dev.c:2865
+ skb_gso_segment include/linux/netdevice.h:4099 [inline]
+ validate_xmit_skb+0x640/0xf30 net/core/dev.c:3104
+ __dev_queue_xmit+0xc14/0x3910 net/core/dev.c:3561
+ dev_queue_xmit+0x17/0x20 net/core/dev.c:3602
+ neigh_hh_output include/net/neighbour.h:473 [inline]
+ neigh_output include/net/neighbour.h:481 [inline]
+ ip_finish_output2+0x1063/0x1860 net/ipv4/ip_output.c:229
+ ip_finish_output+0x841/0xfa0 net/ipv4/ip_output.c:317
+ NF_HOOK_COND include/linux/netfilter.h:276 [inline]
+ ip_output+0x223/0x880 net/ipv4/ip_output.c:405
+ dst_output include/net/dst.h:444 [inline]
+ ip_local_out+0xc5/0x1b0 net/ipv4/ip_output.c:124
+ iptunnel_xmit+0x567/0x850 net/ipv4/ip_tunnel_core.c:91
+ ip_tunnel_xmit+0x1598/0x3af1 net/ipv4/ip_tunnel.c:778
+ ipip_tunnel_xmit+0x264/0x2c0 net/ipv4/ipip.c:308
+ __netdev_start_xmit include/linux/netdevice.h:4148 [inline]
+ netdev_start_xmit include/linux/netdevice.h:4157 [inline]
+ xmit_one net/core/dev.c:3034 [inline]
+ dev_hard_start_xmit+0x26c/0xc30 net/core/dev.c:3050
+ __dev_queue_xmit+0x29ef/0x3910 net/core/dev.c:3569
+ dev_queue_xmit+0x17/0x20 net/core/dev.c:3602
+ neigh_direct_output+0x15/0x20 net/core/neighbour.c:1403
+ neigh_output include/net/neighbour.h:483 [inline]
+ ip_finish_output2+0xa67/0x1860 net/ipv4/ip_output.c:229
+ ip_finish_output+0x841/0xfa0 net/ipv4/ip_output.c:317
+ NF_HOOK_COND include/linux/netfilter.h:276 [inline]
+ ip_output+0x223/0x880 net/ipv4/ip_output.c:405
+ dst_output include/net/dst.h:444 [inline]
+ ip_local_out+0xc5/0x1b0 net/ipv4/ip_output.c:124
+ ip_queue_xmit+0x9df/0x1f80 net/ipv4/ip_output.c:504
+ tcp_transmit_skb+0x1bf9/0x3f10 net/ipv4/tcp_output.c:1168
+ tcp_write_xmit+0x1641/0x5c20 net/ipv4/tcp_output.c:2363
+ __tcp_push_pending_frames+0xb2/0x290 net/ipv4/tcp_output.c:2536
+ tcp_push+0x638/0x8c0 net/ipv4/tcp.c:735
+ tcp_sendmsg_locked+0x2ec5/0x3f00 net/ipv4/tcp.c:1410
+ tcp_sendmsg+0x2f/0x50 net/ipv4/tcp.c:1447
+ inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798
+ sock_sendmsg_nosec net/socket.c:641 [inline]
+ sock_sendmsg+0xd5/0x120 net/socket.c:651
+ __sys_sendto+0x3d7/0x670 net/socket.c:1797
+ __do_sys_sendto net/socket.c:1809 [inline]
+ __se_sys_sendto net/socket.c:1805 [inline]
+ __x64_sys_sendto+0xe1/0x1a0 net/socket.c:1805
+ do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
+ entry_SYSCALL_64_after_hwframe+0x49/0xbe
+RIP: 0033:0x455ab9
+Code: 1d ba fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb b9 fb ff c3 66 2e 0f 1f 84 00 00 00 00
+RSP: 002b:00007fcae64d5c68 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
+RAX: ffffffffffffffda RBX: 00007fcae64d66d4 RCX: 0000000000455ab9
+RDX: 0000000000000001 RSI: 0000000020000200 RDI: 0000000000000013
+RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
+R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000014
+R13: 00000000004c1145 R14: 00000000004d1818 R15: 0000000000000006
+Modules linked in:
+Dumping ftrace buffer:
+   (ftrace buffer empty)
+
+Fixes: ddff00d42043 ("net: Move skb_has_shared_frag check out of GRE code and into segmentation")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Alexander Duyck <alexander.h.duyck@intel.com>
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/skbuff.c |   10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -3705,6 +3705,7 @@ normal:
+                               net_warn_ratelimited(
+                                       "skb_segment: too many frags: %u %u\n",
+                                       pos, mss);
++                              err = -EINVAL;
+                               goto err;
+                       }
+@@ -3738,11 +3739,10 @@ skip_fraglist:
+ perform_csum_check:
+               if (!csum) {
+-                      if (skb_has_shared_frag(nskb)) {
+-                              err = __skb_linearize(nskb);
+-                              if (err)
+-                                      goto err;
+-                      }
++                      if (skb_has_shared_frag(nskb) &&
++                          __skb_linearize(nskb))
++                              goto err;
++
+                       if (!nskb->remcsum_offload)
+                               nskb->ip_summed = CHECKSUM_NONE;
+                       SKB_GSO_CB(nskb)->csum =
diff --git a/queue-4.17/nfp-flower-ensure-dead-neighbour-entries-are-not-offloaded.patch b/queue-4.17/nfp-flower-ensure-dead-neighbour-entries-are-not-offloaded.patch
new file mode 100644 (file)
index 0000000..829cf36
--- /dev/null
@@ -0,0 +1,37 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: John Hurley <john.hurley@netronome.com>
+Date: Fri, 20 Jul 2018 21:07:54 -0700
+Subject: nfp: flower: ensure dead neighbour entries are not offloaded
+
+From: John Hurley <john.hurley@netronome.com>
+
+[ Upstream commit b809ec869b2cf2af053ffd99e5a46ab600e94aa2 ]
+
+Previously only the neighbour state was checked to decide if an offloaded
+entry should be removed. However, there can be situations when the entry
+is dead but still marked as valid. This can lead to dead entries not
+being removed from fw tables or even incorrect data being added.
+
+Check the entry dead bit before deciding if it should be added to or
+removed from fw neighbour tables.
+
+Fixes: 8e6a9046b66a ("nfp: flower vxlan neighbour offload")
+Signed-off-by: John Hurley <john.hurley@netronome.com>
+Reviewed-by: Jakub Kicinski <jakub.kicinski@netronome.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c
++++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c
+@@ -317,7 +317,7 @@ nfp_tun_write_neigh(struct net_device *n
+       payload.dst_ipv4 = flow->daddr;
+       /* If entry has expired send dst IP with all other fields 0. */
+-      if (!(neigh->nud_state & NUD_VALID)) {
++      if (!(neigh->nud_state & NUD_VALID) || neigh->dead) {
+               nfp_tun_del_route_from_cache(app, payload.dst_ipv4);
+               /* Trigger ARP to verify invalid neighbour state. */
+               neigh_event_send(neigh, NULL);
diff --git a/queue-4.17/r8169-restore-previous-behavior-to-accept-bios-wol-settings.patch b/queue-4.17/r8169-restore-previous-behavior-to-accept-bios-wol-settings.patch
new file mode 100644 (file)
index 0000000..b38990a
--- /dev/null
@@ -0,0 +1,40 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Heiner Kallweit <hkallweit1@gmail.com>
+Date: Tue, 24 Jul 2018 22:21:04 +0200
+Subject: r8169: restore previous behavior to accept BIOS WoL settings
+
+From: Heiner Kallweit <hkallweit1@gmail.com>
+
+[ Upstream commit 18041b523692038d41751fd8046638c356d77a36 ]
+
+Commit 7edf6d314cd0 tried to resolve an inconsistency (BIOS WoL
+settings are accepted, but device isn't wakeup-enabled) resulting
+from a previous broken-BIOS workaround by making disabled WoL the
+default.
+This however had some side effects, most likely due to a broken BIOS
+some systems don't properly resume from suspend when the MagicPacket
+WoL bit isn't set in the chip, see
+https://bugzilla.kernel.org/show_bug.cgi?id=200195
+Therefore restore the WoL behavior from 4.16.
+
+Reported-by: Albert Astals Cid <aacid@kde.org>
+Fixes: 7edf6d314cd0 ("r8169: disable WOL per default")
+Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/realtek/r8169.c |    3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/realtek/r8169.c
++++ b/drivers/net/ethernet/realtek/r8169.c
+@@ -8272,8 +8272,7 @@ static int rtl_init_one(struct pci_dev *
+               return rc;
+       }
+-      /* override BIOS settings, use userspace tools to enable WOL */
+-      __rtl8169_set_wol(tp, 0);
++      tp->saved_wolopts = __rtl8169_get_wol(tp);
+       if (rtl_tbi_enabled(tp)) {
+               tp->set_speed = rtl8169_set_speed_tbi;
diff --git a/queue-4.17/rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch b/queue-4.17/rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch
new file mode 100644 (file)
index 0000000..fb62fd8
--- /dev/null
@@ -0,0 +1,65 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Roopa Prabhu <roopa@cumulusnetworks.com>
+Date: Fri, 20 Jul 2018 13:21:01 -0700
+Subject: rtnetlink: add rtnl_link_state check in rtnl_configure_link
+
+From: Roopa Prabhu <roopa@cumulusnetworks.com>
+
+[ Upstream commit 5025f7f7d506fba9b39e7fe8ca10f6f34cb9bc2d ]
+
+rtnl_configure_link sets dev->rtnl_link_state to
+RTNL_LINK_INITIALIZED and unconditionally calls
+__dev_notify_flags to notify user-space of dev flags.
+
+current call sequence for rtnl_configure_link
+rtnetlink_newlink
+    rtnl_link_ops->newlink
+    rtnl_configure_link (unconditionally notifies userspace of
+                         default and new dev flags)
+
+If a newlink handler wants to call rtnl_configure_link
+early, we will end up with duplicate notifications to
+user-space.
+
+This patch fixes rtnl_configure_link to check rtnl_link_state
+and call __dev_notify_flags with gchanges = 0 if already
+RTNL_LINK_INITIALIZED.
+
+Later in the series, this patch will help the following sequence
+where a driver implementing newlink can call rtnl_configure_link
+to initialize the link early.
+
+makes the following call sequence work:
+rtnetlink_newlink
+    rtnl_link_ops->newlink (vxlan) -> rtnl_configure_link (initializes
+                                                link and notifies
+                                                user-space of default
+                                                dev flags)
+    rtnl_configure_link (updates dev flags if requested by user ifm
+                         and notifies user-space of new dev flags)
+
+Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/rtnetlink.c |    9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+--- a/net/core/rtnetlink.c
++++ b/net/core/rtnetlink.c
+@@ -2749,9 +2749,12 @@ int rtnl_configure_link(struct net_devic
+                       return err;
+       }
+-      dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
+-
+-      __dev_notify_flags(dev, old_flags, ~0U);
++      if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) {
++              __dev_notify_flags(dev, old_flags, 0U);
++      } else {
++              dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
++              __dev_notify_flags(dev, old_flags, ~0U);
++      }
+       return 0;
+ }
+ EXPORT_SYMBOL(rtnl_configure_link);
index ae69565c2d79f8425849a4fcb078f6c52efcba36..5bf12943b4fde01affe8acba85caa65389d01977 100644 (file)
@@ -9,3 +9,38 @@ xen-pvh-set-up-gs-segment-for-stack-canary.patch
 kvm-ppc-check-if-iommu-page-is-contained-in-the-pinned-physical-page.patch
 drm-nouveau-drm-nouveau-fix-runtime-pm-leak-in-nv50_disp_atomic_commit.patch
 drm-nouveau-set-driver_atomic-cap-earlier-to-fix-debugfs.patch
+clk-meson-gxbb-set-fclk_div2-as-clk_is_critical.patch
+bonding-set-default-miimon-value-for-non-arp-modes-if-not-set.patch
+ip-hash-fragments-consistently.patch
+ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch
+net-dsa-mv88e6xxx-fix-races-between-lock-and-irq-freeing.patch
+net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch
+net-next-hinic-fix-a-problem-in-hinic_xmit_frame.patch
+net-skb_segment-should-not-return-null.patch
+tcp-fix-dctcp-delayed-ack-schedule.patch
+tcp-helpers-to-send-special-dctcp-ack.patch
+tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch
+tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch
+net-mlx5-e-switch-ubsan-fix-undefined-behavior-in-mlx5_eswitch_mode.patch
+r8169-restore-previous-behavior-to-accept-bios-wol-settings.patch
+tls-check-rcv_shutdown-in-tls_wait_data.patch
+net-mlx5e-add-ingress-egress-indication-for-offloaded-tc-flows.patch
+net-mlx5e-only-allow-offloading-decap-egress-egdev-flows.patch
+net-mlx5e-refine-ets-validation-function.patch
+nfp-flower-ensure-dead-neighbour-entries-are-not-offloaded.patch
+sock-fix-sg-page-frag-coalescing-in-sk_alloc_sg.patch
+net-phy-consider-phy_ignore_interrupt-in-phy_start_aneg_priv.patch
+multicast-do-not-restore-deleted-record-source-filter-mode-to-new-one.patch
+net-ipv6-fix-linklocal-to-global-address-with-vrf.patch
+net-mlx5e-don-t-allow-arfs-for-encapsulated-packets.patch
+net-mlx5e-fix-quota-counting-in-arfs-expire-flow.patch
+net-mlx5-adjust-clock-overflow-work-period.patch
+rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch
+vxlan-add-new-fdb-alloc-and-create-helpers.patch
+vxlan-make-netlink-notify-in-vxlan_fdb_destroy-optional.patch
+vxlan-fix-default-fdb-entry-netlink-notify-ordering-during-netdev-create.patch
+tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch
+tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch
+tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch
+tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch
+tcp-add-tcp_ooo_try_coalesce-helper.patch
diff --git a/queue-4.17/sock-fix-sg-page-frag-coalescing-in-sk_alloc_sg.patch b/queue-4.17/sock-fix-sg-page-frag-coalescing-in-sk_alloc_sg.patch
new file mode 100644 (file)
index 0000000..47df266
--- /dev/null
@@ -0,0 +1,40 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Daniel Borkmann <daniel@iogearbox.net>
+Date: Mon, 23 Jul 2018 22:37:54 +0200
+Subject: sock: fix sg page frag coalescing in sk_alloc_sg
+
+From: Daniel Borkmann <daniel@iogearbox.net>
+
+[ Upstream commit 144fe2bfd236dc814eae587aea7e2af03dbdd755 ]
+
+Current sg coalescing logic in sk_alloc_sg() (latter is used by tls and
+sockmap) is not quite correct in that we do fetch the previous sg entry,
+however the subsequent check whether the refilled page frag from the
+socket is still the same as from the last entry with prior offset and
+length matching the start of the current buffer is comparing always the
+first sg list entry instead of the prior one.
+
+Fixes: 3c4d7559159b ("tls: kernel TLS support")
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Acked-by: Dave Watson <davejwatson@fb.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/sock.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -2270,9 +2270,9 @@ int sk_alloc_sg(struct sock *sk, int len
+               pfrag->offset += use;
+               sge = sg + sg_curr - 1;
+-              if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page &&
+-                  sg->offset + sg->length == orig_offset) {
+-                      sg->length += use;
++              if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page &&
++                  sge->offset + sge->length == orig_offset) {
++                      sge->length += use;
+               } else {
+                       sge = sg + sg_curr;
+                       sg_unmark_end(sge);
diff --git a/queue-4.17/tcp-add-tcp_ooo_try_coalesce-helper.patch b/queue-4.17/tcp-add-tcp_ooo_try_coalesce-helper.patch
new file mode 100644 (file)
index 0000000..a8ec886
--- /dev/null
@@ -0,0 +1,74 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 23 Jul 2018 09:28:21 -0700
+Subject: tcp: add tcp_ooo_try_coalesce() helper
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c ]
+
+In case skb in out_or_order_queue is the result of
+multiple skbs coalescing, we would like to get a proper gso_segs
+counter tracking, so that future tcp_drop() can report an accurate
+number.
+
+I chose to not implement this tracking for skbs in receive queue,
+since they are not dropped, unless socket is disconnected.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Acked-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_input.c |   25 +++++++++++++++++++++----
+ 1 file changed, 21 insertions(+), 4 deletions(-)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4299,6 +4299,23 @@ static bool tcp_try_coalesce(struct sock
+       return true;
+ }
++static bool tcp_ooo_try_coalesce(struct sock *sk,
++                           struct sk_buff *to,
++                           struct sk_buff *from,
++                           bool *fragstolen)
++{
++      bool res = tcp_try_coalesce(sk, to, from, fragstolen);
++
++      /* In case tcp_drop() is called later, update to->gso_segs */
++      if (res) {
++              u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) +
++                             max_t(u16, 1, skb_shinfo(from)->gso_segs);
++
++              skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF);
++      }
++      return res;
++}
++
+ static void tcp_drop(struct sock *sk, struct sk_buff *skb)
+ {
+       sk_drops_add(sk, skb);
+@@ -4422,8 +4439,8 @@ static void tcp_data_queue_ofo(struct so
+       /* In the typical case, we are adding an skb to the end of the list.
+        * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup.
+        */
+-      if (tcp_try_coalesce(sk, tp->ooo_last_skb,
+-                           skb, &fragstolen)) {
++      if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb,
++                               skb, &fragstolen)) {
+ coalesce_done:
+               tcp_grow_window(sk, skb);
+               kfree_skb_partial(skb, fragstolen);
+@@ -4473,8 +4490,8 @@ coalesce_done:
+                               tcp_drop(sk, skb1);
+                               goto merge_right;
+                       }
+-              } else if (tcp_try_coalesce(sk, skb1,
+-                                          skb, &fragstolen)) {
++              } else if (tcp_ooo_try_coalesce(sk, skb1,
++                                              skb, &fragstolen)) {
+                       goto coalesce_done;
+               }
+               p = &parent->rb_right;
diff --git a/queue-4.17/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch b/queue-4.17/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch
new file mode 100644 (file)
index 0000000..56b9c8a
--- /dev/null
@@ -0,0 +1,46 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 23 Jul 2018 09:28:18 -0700
+Subject: tcp: avoid collapses in tcp_prune_queue() if possible
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit f4a3313d8e2ca9fd8d8f45e40a2903ba782607e7 ]
+
+Right after a TCP flow is created, receiving tiny out of order
+packets allways hit the condition :
+
+if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+       tcp_clamp_window(sk);
+
+tcp_clamp_window() increases sk_rcvbuf to match sk_rmem_alloc
+(guarded by tcp_rmem[2])
+
+Calling tcp_collapse_ofo_queue() in this case is not useful,
+and offers a O(N^2) surface attack to malicious peers.
+
+Better not attempt anything before full queue capacity is reached,
+forcing attacker to spend lots of resource and allow us to more
+easily detect the abuse.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Acked-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_input.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4936,6 +4936,9 @@ static int tcp_prune_queue(struct sock *
+       else if (tcp_under_memory_pressure(sk))
+               tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
++      if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
++              return 0;
++
+       tcp_collapse_ofo_queue(sk);
+       if (!skb_queue_empty(&sk->sk_receive_queue))
+               tcp_collapse(sk, &sk->sk_receive_queue, NULL,
diff --git a/queue-4.17/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch b/queue-4.17/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch
new file mode 100644 (file)
index 0000000..02821eb
--- /dev/null
@@ -0,0 +1,42 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 23 Jul 2018 09:28:20 -0700
+Subject: tcp: call tcp_drop() from tcp_data_queue_ofo()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 8541b21e781a22dce52a74fef0b9bed00404a1cd ]
+
+In order to be able to give better diagnostics and detect
+malicious traffic, we need to have better sk->sk_drops tracking.
+
+Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Acked-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_input.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4451,7 +4451,7 @@ coalesce_done:
+                               /* All the bits are present. Drop. */
+                               NET_INC_STATS(sock_net(sk),
+                                             LINUX_MIB_TCPOFOMERGE);
+-                              __kfree_skb(skb);
++                              tcp_drop(sk, skb);
+                               skb = NULL;
+                               tcp_dsack_set(sk, seq, end_seq);
+                               goto add_sack;
+@@ -4470,7 +4470,7 @@ coalesce_done:
+                                                TCP_SKB_CB(skb1)->end_seq);
+                               NET_INC_STATS(sock_net(sk),
+                                             LINUX_MIB_TCPOFOMERGE);
+-                              __kfree_skb(skb1);
++                              tcp_drop(sk, skb1);
+                               goto merge_right;
+                       }
+               } else if (tcp_try_coalesce(sk, skb1,
diff --git a/queue-4.17/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch b/queue-4.17/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch
new file mode 100644 (file)
index 0000000..0993a8b
--- /dev/null
@@ -0,0 +1,72 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 23 Jul 2018 09:28:19 -0700
+Subject: tcp: detect malicious patterns in tcp_collapse_ofo_queue()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 3d4bf93ac12003f9b8e1e2de37fe27983deebdcf ]
+
+In case an attacker feeds tiny packets completely out of order,
+tcp_collapse_ofo_queue() might scan the whole rb-tree, performing
+expensive copies, but not changing socket memory usage at all.
+
+1) Do not attempt to collapse tiny skbs.
+2) Add logic to exit early when too many tiny skbs are detected.
+
+We prefer not doing aggressive collapsing (which copies packets)
+for pathological flows, and revert to tcp_prune_ofo_queue() which
+will be less expensive.
+
+In the future, we might add the possibility of terminating flows
+that are proven to be malicious.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_input.c |   15 +++++++++++++--
+ 1 file changed, 13 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4834,6 +4834,7 @@ end:
+ static void tcp_collapse_ofo_queue(struct sock *sk)
+ {
+       struct tcp_sock *tp = tcp_sk(sk);
++      u32 range_truesize, sum_tiny = 0;
+       struct sk_buff *skb, *head;
+       u32 start, end;
+@@ -4845,6 +4846,7 @@ new_range:
+       }
+       start = TCP_SKB_CB(skb)->seq;
+       end = TCP_SKB_CB(skb)->end_seq;
++      range_truesize = skb->truesize;
+       for (head = skb;;) {
+               skb = skb_rb_next(skb);
+@@ -4855,11 +4857,20 @@ new_range:
+               if (!skb ||
+                   after(TCP_SKB_CB(skb)->seq, end) ||
+                   before(TCP_SKB_CB(skb)->end_seq, start)) {
+-                      tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+-                                   head, skb, start, end);
++                      /* Do not attempt collapsing tiny skbs */
++                      if (range_truesize != head->truesize ||
++                          end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
++                              tcp_collapse(sk, NULL, &tp->out_of_order_queue,
++                                           head, skb, start, end);
++                      } else {
++                              sum_tiny += range_truesize;
++                              if (sum_tiny > sk->sk_rcvbuf >> 3)
++                                      return;
++                      }
+                       goto new_range;
+               }
++              range_truesize += skb->truesize;
+               if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
+                       start = TCP_SKB_CB(skb)->seq;
+               if (after(TCP_SKB_CB(skb)->end_seq, end))
diff --git a/queue-4.17/tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch b/queue-4.17/tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch
new file mode 100644 (file)
index 0000000..0bf5168
--- /dev/null
@@ -0,0 +1,138 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Yuchung Cheng <ycheng@google.com>
+Date: Wed, 18 Jul 2018 13:56:35 -0700
+Subject: tcp: do not cancel delay-AcK on DCTCP special ACK
+
+From: Yuchung Cheng <ycheng@google.com>
+
+[ Upstream commit 27cde44a259c380a3c09066fc4b42de7dde9b1ad ]
+
+Currently when a DCTCP receiver delays an ACK and receive a
+data packet with a different CE mark from the previous one's, it
+sends two immediate ACKs acking previous and latest sequences
+respectly (for ECN accounting).
+
+Previously sending the first ACK may mark off the delayed ACK timer
+(tcp_event_ack_sent). This may subsequently prevent sending the
+second ACK to acknowledge the latest sequence (tcp_ack_snd_check).
+The culprit is that tcp_send_ack() assumes it always acknowleges
+the latest sequence, which is not true for the first special ACK.
+
+The fix is to not make the assumption in tcp_send_ack and check the
+actual ack sequence before cancelling the delayed ACK. Further it's
+safer to pass the ack sequence number as a local variable into
+tcp_send_ack routine, instead of intercepting tp->rcv_nxt to avoid
+future bugs like this.
+
+Reported-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/tcp.h     |    1 +
+ net/ipv4/tcp_dctcp.c  |   34 ++++------------------------------
+ net/ipv4/tcp_output.c |   11 ++++++++---
+ 3 files changed, 13 insertions(+), 33 deletions(-)
+
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -535,6 +535,7 @@ void tcp_send_fin(struct sock *sk);
+ void tcp_send_active_reset(struct sock *sk, gfp_t priority);
+ int tcp_send_synack(struct sock *);
+ void tcp_push_one(struct sock *, unsigned int mss_now);
++void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
+ void tcp_send_ack(struct sock *sk);
+ void tcp_send_delayed_ack(struct sock *sk);
+ void tcp_send_loss_probe(struct sock *sk);
+--- a/net/ipv4/tcp_dctcp.c
++++ b/net/ipv4/tcp_dctcp.c
+@@ -135,21 +135,8 @@ static void dctcp_ce_state_0_to_1(struct
+        * ACK has not sent yet.
+        */
+       if (!ca->ce_state &&
+-          inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
+-              u32 tmp_rcv_nxt;
+-
+-              /* Save current rcv_nxt. */
+-              tmp_rcv_nxt = tp->rcv_nxt;
+-
+-              /* Generate previous ack with CE=0. */
+-              tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+-              tp->rcv_nxt = ca->prior_rcv_nxt;
+-
+-              tcp_send_ack(sk);
+-
+-              /* Recover current rcv_nxt. */
+-              tp->rcv_nxt = tmp_rcv_nxt;
+-      }
++          inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
++              __tcp_send_ack(sk, ca->prior_rcv_nxt);
+       ca->prior_rcv_nxt = tp->rcv_nxt;
+       ca->ce_state = 1;
+@@ -166,21 +153,8 @@ static void dctcp_ce_state_1_to_0(struct
+        * ACK has not sent yet.
+        */
+       if (ca->ce_state &&
+-          inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
+-              u32 tmp_rcv_nxt;
+-
+-              /* Save current rcv_nxt. */
+-              tmp_rcv_nxt = tp->rcv_nxt;
+-
+-              /* Generate previous ack with CE=1. */
+-              tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+-              tp->rcv_nxt = ca->prior_rcv_nxt;
+-
+-              tcp_send_ack(sk);
+-
+-              /* Recover current rcv_nxt. */
+-              tp->rcv_nxt = tmp_rcv_nxt;
+-      }
++          inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
++              __tcp_send_ack(sk, ca->prior_rcv_nxt);
+       ca->prior_rcv_nxt = tp->rcv_nxt;
+       ca->ce_state = 0;
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -160,8 +160,13 @@ static void tcp_event_data_sent(struct t
+ }
+ /* Account for an ACK we sent. */
+-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
++static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
++                                    u32 rcv_nxt)
+ {
++      struct tcp_sock *tp = tcp_sk(sk);
++
++      if (unlikely(rcv_nxt != tp->rcv_nxt))
++              return;  /* Special ACK sent by DCTCP to reflect ECN */
+       tcp_dec_quickack_mode(sk, pkts);
+       inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+ }
+@@ -1149,7 +1154,7 @@ static int __tcp_transmit_skb(struct soc
+       icsk->icsk_af_ops->send_check(sk, skb);
+       if (likely(tcb->tcp_flags & TCPHDR_ACK))
+-              tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
++              tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
+       if (skb->len != tcp_header_size) {
+               tcp_event_data_sent(tp, sk);
+@@ -3627,12 +3632,12 @@ void __tcp_send_ack(struct sock *sk, u32
+       /* Send it off, this clears delayed acks for us. */
+       __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
+ }
++EXPORT_SYMBOL_GPL(__tcp_send_ack);
+ void tcp_send_ack(struct sock *sk)
+ {
+       __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
+ }
+-EXPORT_SYMBOL_GPL(tcp_send_ack);
+ /* This routine sends a packet with an out of date sequence
+  * number. It assumes the other end will try to ack it.
diff --git a/queue-4.17/tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch b/queue-4.17/tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch
new file mode 100644 (file)
index 0000000..2619d45
--- /dev/null
@@ -0,0 +1,138 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Yuchung Cheng <ycheng@google.com>
+Date: Wed, 18 Jul 2018 13:56:36 -0700
+Subject: tcp: do not delay ACK in DCTCP upon CE status change
+
+From: Yuchung Cheng <ycheng@google.com>
+
+[ Upstream commit a0496ef2c23b3b180902dd185d0d63ccbc624cf8 ]
+
+Per DCTCP RFC8257 (Section 3.2) the ACK reflecting the CE status change
+has to be sent immediately so the sender can respond quickly:
+
+""" When receiving packets, the CE codepoint MUST be processed as follows:
+
+   1.  If the CE codepoint is set and DCTCP.CE is false, set DCTCP.CE to
+       true and send an immediate ACK.
+
+   2.  If the CE codepoint is not set and DCTCP.CE is true, set DCTCP.CE
+       to false and send an immediate ACK.
+"""
+
+Previously DCTCP implementation may continue to delay the ACK. This
+patch fixes that to implement the RFC by forcing an immediate ACK.
+
+Tested with this packetdrill script provided by Larry Brakmo
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < [ect0] SEW 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.100 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+0.110 < [ect0] . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+   +0 setsockopt(4, SOL_SOCKET, SO_DEBUG, [1], 4) = 0
+
+0.200 < [ect0] . 1:1001(1000) ack 1 win 257
+0.200 > [ect01] . 1:1(0) ack 1001
+
+0.200 write(4, ..., 1) = 1
+0.200 > [ect01] P. 1:2(1) ack 1001
+
+0.200 < [ect0] . 1001:2001(1000) ack 2 win 257
++0.005 < [ce] . 2001:3001(1000) ack 2 win 257
+
++0.000 > [ect01] . 2:2(0) ack 2001
+// Previously the ACK below would be delayed by 40ms
++0.000 > [ect01] E. 2:2(0) ack 3001
+
++0.500 < F. 9501:9501(0) ack 4 win 257
+
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/tcp.h    |    1 +
+ net/ipv4/tcp_dctcp.c |   30 ++++++++++++++++++------------
+ net/ipv4/tcp_input.c |    3 ++-
+ 3 files changed, 21 insertions(+), 13 deletions(-)
+
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -342,6 +342,7 @@ ssize_t tcp_splice_read(struct socket *s
+                       struct pipe_inode_info *pipe, size_t len,
+                       unsigned int flags);
++void tcp_enter_quickack_mode(struct sock *sk);
+ static inline void tcp_dec_quickack_mode(struct sock *sk,
+                                        const unsigned int pkts)
+ {
+--- a/net/ipv4/tcp_dctcp.c
++++ b/net/ipv4/tcp_dctcp.c
+@@ -131,12 +131,15 @@ static void dctcp_ce_state_0_to_1(struct
+       struct dctcp *ca = inet_csk_ca(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+-      /* State has changed from CE=0 to CE=1 and delayed
+-       * ACK has not sent yet.
+-       */
+-      if (!ca->ce_state &&
+-          inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+-              __tcp_send_ack(sk, ca->prior_rcv_nxt);
++      if (!ca->ce_state) {
++              /* State has changed from CE=0 to CE=1, force an immediate
++               * ACK to reflect the new CE state. If an ACK was delayed,
++               * send that first to reflect the prior CE state.
++               */
++              if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
++                      __tcp_send_ack(sk, ca->prior_rcv_nxt);
++              tcp_enter_quickack_mode(sk);
++      }
+       ca->prior_rcv_nxt = tp->rcv_nxt;
+       ca->ce_state = 1;
+@@ -149,12 +152,15 @@ static void dctcp_ce_state_1_to_0(struct
+       struct dctcp *ca = inet_csk_ca(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+-      /* State has changed from CE=1 to CE=0 and delayed
+-       * ACK has not sent yet.
+-       */
+-      if (ca->ce_state &&
+-          inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+-              __tcp_send_ack(sk, ca->prior_rcv_nxt);
++      if (ca->ce_state) {
++              /* State has changed from CE=1 to CE=0, force an immediate
++               * ACK to reflect the new CE state. If an ACK was delayed,
++               * send that first to reflect the prior CE state.
++               */
++              if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
++                      __tcp_send_ack(sk, ca->prior_rcv_nxt);
++              tcp_enter_quickack_mode(sk);
++      }
+       ca->prior_rcv_nxt = tp->rcv_nxt;
+       ca->ce_state = 0;
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -195,13 +195,14 @@ static void tcp_incr_quickack(struct soc
+               icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+ }
+-static void tcp_enter_quickack_mode(struct sock *sk)
++void tcp_enter_quickack_mode(struct sock *sk)
+ {
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       tcp_incr_quickack(sk);
+       icsk->icsk_ack.pingpong = 0;
+       icsk->icsk_ack.ato = TCP_ATO_MIN;
+ }
++EXPORT_SYMBOL(tcp_enter_quickack_mode);
+ /* Send ACKs quickly, if "quick" count is not exhausted
+  * and the session is not interactive.
diff --git a/queue-4.17/tcp-fix-dctcp-delayed-ack-schedule.patch b/queue-4.17/tcp-fix-dctcp-delayed-ack-schedule.patch
new file mode 100644 (file)
index 0000000..bea6352
--- /dev/null
@@ -0,0 +1,98 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Yuchung Cheng <ycheng@google.com>
+Date: Thu, 12 Jul 2018 06:04:52 -0700
+Subject: tcp: fix dctcp delayed ACK schedule
+
+From: Yuchung Cheng <ycheng@google.com>
+
+[ Upstream commit b0c05d0e99d98d7f0cd41efc1eeec94efdc3325d ]
+
+Previously, when a data segment was sent an ACK was piggybacked
+on the data segment without generating a CA_EVENT_NON_DELAYED_ACK
+event to notify congestion control modules. So the DCTCP
+ca->delayed_ack_reserved flag could incorrectly stay set when
+in fact there were no delayed ACKs being reserved. This could result
+in sending a special ECN notification ACK that carries an older
+ACK sequence, when in fact there was no need for such an ACK.
+DCTCP keeps track of the delayed ACK status with its own separate
+state ca->delayed_ack_reserved. Previously it may accidentally cancel
+the delayed ACK without updating this field upon sending a special
+ACK that carries an older ACK sequence. This inconsistency would
+lead to DCTCP receiver never acknowledging the latest data until the
+sender times out and retries in some cases.
+
+Packetdrill script (provided by Larry Brakmo)
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < [ect0] SEW 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.100 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+0.110 < [ect0] . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+0.200 < [ect0] . 1:1001(1000) ack 1 win 257
+0.200 > [ect01] . 1:1(0) ack 1001
+
+0.200 write(4, ..., 1) = 1
+0.200 > [ect01] P. 1:2(1) ack 1001
+
+0.200 < [ect0] . 1001:2001(1000) ack 2 win 257
+0.200 write(4, ..., 1) = 1
+0.200 > [ect01] P. 2:3(1) ack 2001
+
+0.200 < [ect0] . 2001:3001(1000) ack 3 win 257
+0.200 < [ect0] . 3001:4001(1000) ack 3 win 257
+0.200 > [ect01] . 3:3(0) ack 4001
+
+0.210 < [ce] P. 4001:4501(500) ack 3 win 257
+
++0.001 read(4, ..., 4500) = 4500
++0 write(4, ..., 1) = 1
++0 > [ect01] PE. 3:4(1) ack 4501
+
++0.010 < [ect0] W. 4501:5501(1000) ack 4 win 257
+// Previously the ACK sequence below would be 4501, causing a long RTO
++0.040~+0.045 > [ect01] . 4:4(0) ack 5501   // delayed ack
+
++0.311 < [ect0] . 5501:6501(1000) ack 4 win 257  // More data
++0 > [ect01] . 4:4(0) ack 6501     // now acks everything
+
++0.500 < F. 9501:9501(0) ack 4 win 257
+
+Reported-by: Larry Brakmo <brakmo@fb.com>
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Acked-by: Lawrence Brakmo <brakmo@fb.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_dctcp.c |    6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/tcp_dctcp.c
++++ b/net/ipv4/tcp_dctcp.c
+@@ -134,7 +134,8 @@ static void dctcp_ce_state_0_to_1(struct
+       /* State has changed from CE=0 to CE=1 and delayed
+        * ACK has not sent yet.
+        */
+-      if (!ca->ce_state && ca->delayed_ack_reserved) {
++      if (!ca->ce_state &&
++          inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
+               u32 tmp_rcv_nxt;
+               /* Save current rcv_nxt. */
+@@ -164,7 +165,8 @@ static void dctcp_ce_state_1_to_0(struct
+       /* State has changed from CE=1 to CE=0 and delayed
+        * ACK has not sent yet.
+        */
+-      if (ca->ce_state && ca->delayed_ack_reserved) {
++      if (ca->ce_state &&
++          inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
+               u32 tmp_rcv_nxt;
+               /* Save current rcv_nxt. */
diff --git a/queue-4.17/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch b/queue-4.17/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch
new file mode 100644 (file)
index 0000000..efdd47d
--- /dev/null
@@ -0,0 +1,76 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 23 Jul 2018 09:28:17 -0700
+Subject: tcp: free batches of packets in tcp_prune_ofo_queue()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 72cd43ba64fc172a443410ce01645895850844c8 ]
+
+Juha-Matti Tilli reported that malicious peers could inject tiny
+packets in out_of_order_queue, forcing very expensive calls
+to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for
+every incoming packet. out_of_order_queue rb-tree can contain
+thousands of nodes, iterating over all of them is not nice.
+
+Before linux-4.9, we would have pruned all packets in ofo_queue
+in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skbs
+truesize, but is about 7000 packets with tcp_rmem[2] default of 6 MB.
+
+Since we plan to increase tcp_rmem[2] in the future to cope with
+modern BDP, we cannot revert to the old behavior without great pain.
+
+Strategy taken in this patch is to purge ~12.5 % of the queue capacity.
+
+Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
+Acked-by: Yuchung Cheng <ycheng@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_input.c |   15 +++++++++++----
+ 1 file changed, 11 insertions(+), 4 deletions(-)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4874,6 +4874,7 @@ new_range:
+  * 2) not add too big latencies if thousands of packets sit there.
+  *    (But if application shrinks SO_RCVBUF, we could still end up
+  *     freeing whole queue here)
++ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
+  *
+  * Return true if queue has shrunk.
+  */
+@@ -4881,20 +4882,26 @@ static bool tcp_prune_ofo_queue(struct s
+ {
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct rb_node *node, *prev;
++      int goal;
+       if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
+               return false;
+       NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
++      goal = sk->sk_rcvbuf >> 3;
+       node = &tp->ooo_last_skb->rbnode;
+       do {
+               prev = rb_prev(node);
+               rb_erase(node, &tp->out_of_order_queue);
++              goal -= rb_to_skb(node)->truesize;
+               tcp_drop(sk, rb_to_skb(node));
+-              sk_mem_reclaim(sk);
+-              if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+-                  !tcp_under_memory_pressure(sk))
+-                      break;
++              if (!prev || goal <= 0) {
++                      sk_mem_reclaim(sk);
++                      if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
++                          !tcp_under_memory_pressure(sk))
++                              break;
++                      goal = sk->sk_rcvbuf >> 3;
++              }
+               node = prev;
+       } while (node);
+       tp->ooo_last_skb = rb_to_skb(prev);
diff --git a/queue-4.17/tcp-helpers-to-send-special-dctcp-ack.patch b/queue-4.17/tcp-helpers-to-send-special-dctcp-ack.patch
new file mode 100644 (file)
index 0000000..6c56541
--- /dev/null
@@ -0,0 +1,79 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Yuchung Cheng <ycheng@google.com>
+Date: Wed, 18 Jul 2018 13:56:34 -0700
+Subject: tcp: helpers to send special DCTCP ack
+
+From: Yuchung Cheng <ycheng@google.com>
+
+[ Upstream commit 2987babb6982306509380fc11b450227a844493b ]
+
+Refactor and create helpers to send the special ACK in DCTCP.
+
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_output.c |   22 +++++++++++++++++-----
+ 1 file changed, 17 insertions(+), 5 deletions(-)
+
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -1031,8 +1031,8 @@ static void tcp_update_skb_after_send(st
+  * We are working here with either a clone of the original
+  * SKB, or a fresh unique copy made by the retransmit engine.
+  */
+-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+-                          gfp_t gfp_mask)
++static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
++                            int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
+ {
+       const struct inet_connection_sock *icsk = inet_csk(sk);
+       struct inet_sock *inet;
+@@ -1108,7 +1108,7 @@ static int tcp_transmit_skb(struct sock
+       th->source              = inet->inet_sport;
+       th->dest                = inet->inet_dport;
+       th->seq                 = htonl(tcb->seq);
+-      th->ack_seq             = htonl(tp->rcv_nxt);
++      th->ack_seq             = htonl(rcv_nxt);
+       *(((__be16 *)th) + 6)   = htons(((tcp_header_size >> 2) << 12) |
+                                       tcb->tcp_flags);
+@@ -1186,6 +1186,13 @@ static int tcp_transmit_skb(struct sock
+       return err;
+ }
++static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
++                          gfp_t gfp_mask)
++{
++      return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
++                                tcp_sk(sk)->rcv_nxt);
++}
++
+ /* This routine just queues the buffer for sending.
+  *
+  * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
+@@ -3583,7 +3590,7 @@ void tcp_send_delayed_ack(struct sock *s
+ }
+ /* This routine sends an ack and also updates the window. */
+-void tcp_send_ack(struct sock *sk)
++void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
+ {
+       struct sk_buff *buff;
+@@ -3618,7 +3625,12 @@ void tcp_send_ack(struct sock *sk)
+       skb_set_tcp_pure_ack(buff);
+       /* Send it off, this clears delayed acks for us. */
+-      tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
++      __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
++}
++
++void tcp_send_ack(struct sock *sk)
++{
++      __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
+ }
+ EXPORT_SYMBOL_GPL(tcp_send_ack);
diff --git a/queue-4.17/tls-check-rcv_shutdown-in-tls_wait_data.patch b/queue-4.17/tls-check-rcv_shutdown-in-tls_wait_data.patch
new file mode 100644 (file)
index 0000000..75a1f9c
--- /dev/null
@@ -0,0 +1,37 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Doron Roberts-Kedes <doronrk@fb.com>
+Date: Wed, 18 Jul 2018 16:22:27 -0700
+Subject: tls: check RCV_SHUTDOWN in tls_wait_data
+
+From: Doron Roberts-Kedes <doronrk@fb.com>
+
+[ Upstream commit fcf4793e278edede8fcd748198d12128037e526c ]
+
+The current code does not check sk->sk_shutdown & RCV_SHUTDOWN.
+tls_sw_recvmsg may return a positive value in the case where bytes have
+already been copied when the socket is shutdown. sk->sk_err has been
+cleared, causing the tls_wait_data to hang forever on a subsequent
+invocation. Checking sk->sk_shutdown & RCV_SHUTDOWN, as in tcp_recvmsg,
+fixes this problem.
+
+Fixes: c46234ebb4d1 ("tls: RX path for ktls")
+Acked-by: Dave Watson <davejwatson@fb.com>
+Signed-off-by: Doron Roberts-Kedes <doronrk@fb.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/tls/tls_sw.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -646,6 +646,9 @@ static struct sk_buff *tls_wait_data(str
+                       return NULL;
+               }
++              if (sk->sk_shutdown & RCV_SHUTDOWN)
++                      return NULL;
++
+               if (sock_flag(sk, SOCK_DONE))
+                       return NULL;
diff --git a/queue-4.17/vxlan-add-new-fdb-alloc-and-create-helpers.patch b/queue-4.17/vxlan-add-new-fdb-alloc-and-create-helpers.patch
new file mode 100644 (file)
index 0000000..4e7e716
--- /dev/null
@@ -0,0 +1,169 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Roopa Prabhu <roopa@cumulusnetworks.com>
+Date: Fri, 20 Jul 2018 13:21:02 -0700
+Subject: vxlan: add new fdb alloc and create helpers
+
+From: Roopa Prabhu <roopa@cumulusnetworks.com>
+
+[ Upstream commit 7431016b107c95cb5b2014aa1901fcb115f746bc ]
+
+- Add new vxlan_fdb_alloc helper
+- rename existing vxlan_fdb_create into vxlan_fdb_update:
+        because it really creates or updates an existing
+        fdb entry
+- move new fdb creation into a separate vxlan_fdb_create
+
+Main motivation for this change is to introduce the ability
+to decouple vxlan fdb creation and notify, used in a later patch.
+
+Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/vxlan.c |   91 +++++++++++++++++++++++++++++++++++-----------------
+ 1 file changed, 62 insertions(+), 29 deletions(-)
+
+--- a/drivers/net/vxlan.c
++++ b/drivers/net/vxlan.c
+@@ -636,9 +636,62 @@ static int vxlan_gro_complete(struct soc
+       return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr));
+ }
+-/* Add new entry to forwarding table -- assumes lock held */
++static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan,
++                                       const u8 *mac, __u16 state,
++                                       __be32 src_vni, __u8 ndm_flags)
++{
++      struct vxlan_fdb *f;
++
++      f = kmalloc(sizeof(*f), GFP_ATOMIC);
++      if (!f)
++              return NULL;
++      f->state = state;
++      f->flags = ndm_flags;
++      f->updated = f->used = jiffies;
++      f->vni = src_vni;
++      INIT_LIST_HEAD(&f->remotes);
++      memcpy(f->eth_addr, mac, ETH_ALEN);
++
++      return f;
++}
++
+ static int vxlan_fdb_create(struct vxlan_dev *vxlan,
+                           const u8 *mac, union vxlan_addr *ip,
++                          __u16 state, __be16 port, __be32 src_vni,
++                          __be32 vni, __u32 ifindex, __u8 ndm_flags,
++                          struct vxlan_fdb **fdb)
++{
++      struct vxlan_rdst *rd = NULL;
++      struct vxlan_fdb *f;
++      int rc;
++
++      if (vxlan->cfg.addrmax &&
++          vxlan->addrcnt >= vxlan->cfg.addrmax)
++              return -ENOSPC;
++
++      netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
++      f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags);
++      if (!f)
++              return -ENOMEM;
++
++      rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
++      if (rc < 0) {
++              kfree(f);
++              return rc;
++      }
++
++      ++vxlan->addrcnt;
++      hlist_add_head_rcu(&f->hlist,
++                         vxlan_fdb_head(vxlan, mac, src_vni));
++
++      *fdb = f;
++
++      return 0;
++}
++
++/* Add new entry to forwarding table -- assumes lock held */
++static int vxlan_fdb_update(struct vxlan_dev *vxlan,
++                          const u8 *mac, union vxlan_addr *ip,
+                           __u16 state, __u16 flags,
+                           __be16 port, __be32 src_vni, __be32 vni,
+                           __u32 ifindex, __u8 ndm_flags)
+@@ -687,37 +740,17 @@ static int vxlan_fdb_create(struct vxlan
+               if (!(flags & NLM_F_CREATE))
+                       return -ENOENT;
+-              if (vxlan->cfg.addrmax &&
+-                  vxlan->addrcnt >= vxlan->cfg.addrmax)
+-                      return -ENOSPC;
+-
+               /* Disallow replace to add a multicast entry */
+               if ((flags & NLM_F_REPLACE) &&
+                   (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac)))
+                       return -EOPNOTSUPP;
+               netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip);
+-              f = kmalloc(sizeof(*f), GFP_ATOMIC);
+-              if (!f)
+-                      return -ENOMEM;
+-
+-              notify = 1;
+-              f->state = state;
+-              f->flags = ndm_flags;
+-              f->updated = f->used = jiffies;
+-              f->vni = src_vni;
+-              INIT_LIST_HEAD(&f->remotes);
+-              memcpy(f->eth_addr, mac, ETH_ALEN);
+-
+-              rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd);
+-              if (rc < 0) {
+-                      kfree(f);
++              rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni,
++                                    vni, ifindex, ndm_flags, &f);
++              if (rc < 0)
+                       return rc;
+-              }
+-
+-              ++vxlan->addrcnt;
+-              hlist_add_head_rcu(&f->hlist,
+-                                 vxlan_fdb_head(vxlan, mac, src_vni));
++              notify = 1;
+       }
+       if (notify) {
+@@ -863,7 +896,7 @@ static int vxlan_fdb_add(struct ndmsg *n
+               return -EAFNOSUPPORT;
+       spin_lock_bh(&vxlan->hash_lock);
+-      err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags,
++      err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags,
+                              port, src_vni, vni, ifindex, ndm->ndm_flags);
+       spin_unlock_bh(&vxlan->hash_lock);
+@@ -1006,7 +1039,7 @@ static bool vxlan_snoop(struct net_devic
+               /* close off race between vxlan_flush and incoming packets */
+               if (netif_running(dev))
+-                      vxlan_fdb_create(vxlan, src_mac, src_ip,
++                      vxlan_fdb_update(vxlan, src_mac, src_ip,
+                                        NUD_REACHABLE,
+                                        NLM_F_EXCL|NLM_F_CREATE,
+                                        vxlan->cfg.dst_port,
+@@ -3165,7 +3198,7 @@ static int __vxlan_dev_create(struct net
+       /* create an fdb entry for a valid default destination */
+       if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) {
+-              err = vxlan_fdb_create(vxlan, all_zeros_mac,
++              err = vxlan_fdb_update(vxlan, all_zeros_mac,
+                                      &vxlan->default_dst.remote_ip,
+                                      NUD_REACHABLE | NUD_PERMANENT,
+                                      NLM_F_EXCL | NLM_F_CREATE,
+@@ -3439,7 +3472,7 @@ static int vxlan_changelink(struct net_d
+                                          old_dst.remote_ifindex, 0);
+               if (!vxlan_addr_any(&dst->remote_ip)) {
+-                      err = vxlan_fdb_create(vxlan, all_zeros_mac,
++                      err = vxlan_fdb_update(vxlan, all_zeros_mac,
+                                              &dst->remote_ip,
+                                              NUD_REACHABLE | NUD_PERMANENT,
+                                              NLM_F_CREATE | NLM_F_APPEND,
diff --git a/queue-4.17/vxlan-fix-default-fdb-entry-netlink-notify-ordering-during-netdev-create.patch b/queue-4.17/vxlan-fix-default-fdb-entry-netlink-notify-ordering-during-netdev-create.patch
new file mode 100644 (file)
index 0000000..344f0fc
--- /dev/null
@@ -0,0 +1,116 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Roopa Prabhu <roopa@cumulusnetworks.com>
+Date: Fri, 20 Jul 2018 13:21:04 -0700
+Subject: vxlan: fix default fdb entry netlink notify ordering during netdev create
+
+From: Roopa Prabhu <roopa@cumulusnetworks.com>
+
+[ Upstream commit e99465b952861533d9ba748fdbecc96d9a36da3e ]
+
+Problem:
+In vxlan_newlink, a default fdb entry is added before register_netdev.
+The default fdb creation function also notifies user-space of the
+fdb entry on the vxlan device which user-space does not know about yet.
+(RTM_NEWNEIGH goes before RTM_NEWLINK for the same ifindex).
+
+This patch fixes the user-space netlink notification ordering issue
+with the following changes:
+- decouple fdb notify from fdb create.
+- Move fdb notify after register_netdev.
+- Call rtnl_configure_link in vxlan newlink handler to notify
+userspace about the newlink before fdb notify and
+hence avoiding the user-space race.
+
+Fixes: afbd8bae9c79 ("vxlan: add implicit fdb entry for default destination")
+Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/vxlan.c |   29 +++++++++++++++++++++--------
+ 1 file changed, 21 insertions(+), 8 deletions(-)
+
+--- a/drivers/net/vxlan.c
++++ b/drivers/net/vxlan.c
+@@ -3190,6 +3190,7 @@ static int __vxlan_dev_create(struct net
+ {
+       struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+       struct vxlan_dev *vxlan = netdev_priv(dev);
++      struct vxlan_fdb *f = NULL;
+       int err;
+       err = vxlan_dev_configure(net, dev, conf, false, extack);
+@@ -3200,27 +3201,38 @@ static int __vxlan_dev_create(struct net
+       /* create an fdb entry for a valid default destination */
+       if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) {
+-              err = vxlan_fdb_update(vxlan, all_zeros_mac,
++              err = vxlan_fdb_create(vxlan, all_zeros_mac,
+                                      &vxlan->default_dst.remote_ip,
+                                      NUD_REACHABLE | NUD_PERMANENT,
+-                                     NLM_F_EXCL | NLM_F_CREATE,
+                                      vxlan->cfg.dst_port,
+                                      vxlan->default_dst.remote_vni,
+                                      vxlan->default_dst.remote_vni,
+                                      vxlan->default_dst.remote_ifindex,
+-                                     NTF_SELF);
++                                     NTF_SELF, &f);
+               if (err)
+                       return err;
+       }
+       err = register_netdevice(dev);
++      if (err)
++              goto errout;
++
++      err = rtnl_configure_link(dev, NULL);
+       if (err) {
+-              vxlan_fdb_delete_default(vxlan, vxlan->default_dst.remote_vni);
+-              return err;
++              unregister_netdevice(dev);
++              goto errout;
+       }
++      /* notify default fdb entry */
++      if (f)
++              vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH);
++
+       list_add(&vxlan->next, &vn->vxlan_list);
+       return 0;
++errout:
++      if (f)
++              vxlan_fdb_destroy(vxlan, f, false);
++      return err;
+ }
+ static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[],
+@@ -3449,6 +3461,7 @@ static int vxlan_changelink(struct net_d
+       struct vxlan_rdst *dst = &vxlan->default_dst;
+       struct vxlan_rdst old_dst;
+       struct vxlan_config conf;
++      struct vxlan_fdb *f = NULL;
+       int err;
+       err = vxlan_nl2conf(tb, data,
+@@ -3474,19 +3487,19 @@ static int vxlan_changelink(struct net_d
+                                          old_dst.remote_ifindex, 0);
+               if (!vxlan_addr_any(&dst->remote_ip)) {
+-                      err = vxlan_fdb_update(vxlan, all_zeros_mac,
++                      err = vxlan_fdb_create(vxlan, all_zeros_mac,
+                                              &dst->remote_ip,
+                                              NUD_REACHABLE | NUD_PERMANENT,
+-                                             NLM_F_CREATE | NLM_F_APPEND,
+                                              vxlan->cfg.dst_port,
+                                              dst->remote_vni,
+                                              dst->remote_vni,
+                                              dst->remote_ifindex,
+-                                             NTF_SELF);
++                                             NTF_SELF, &f);
+                       if (err) {
+                               spin_unlock_bh(&vxlan->hash_lock);
+                               return err;
+                       }
++                      vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH);
+               }
+               spin_unlock_bh(&vxlan->hash_lock);
+       }
diff --git a/queue-4.17/vxlan-make-netlink-notify-in-vxlan_fdb_destroy-optional.patch b/queue-4.17/vxlan-make-netlink-notify-in-vxlan_fdb_destroy-optional.patch
new file mode 100644 (file)
index 0000000..1b1fcc5
--- /dev/null
@@ -0,0 +1,75 @@
+From foo@baz Fri Jul 27 08:31:26 CEST 2018
+From: Roopa Prabhu <roopa@cumulusnetworks.com>
+Date: Fri, 20 Jul 2018 13:21:03 -0700
+Subject: vxlan: make netlink notify in vxlan_fdb_destroy optional
+
+From: Roopa Prabhu <roopa@cumulusnetworks.com>
+
+[ Upstream commit f6e053858671bb156b6e44ad66418acc8c7f4e77 ]
+
+Add a new option do_notify to vxlan_fdb_destroy to make
+sending netlink notify optional. Used by a later patch.
+
+Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/vxlan.c |   14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+--- a/drivers/net/vxlan.c
++++ b/drivers/net/vxlan.c
+@@ -774,13 +774,15 @@ static void vxlan_fdb_free(struct rcu_he
+       kfree(f);
+ }
+-static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
++static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f,
++                            bool do_notify)
+ {
+       netdev_dbg(vxlan->dev,
+                   "delete %pM\n", f->eth_addr);
+       --vxlan->addrcnt;
+-      vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_DELNEIGH);
++      if (do_notify)
++              vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_DELNEIGH);
+       hlist_del_rcu(&f->hlist);
+       call_rcu(&f->rcu, vxlan_fdb_free);
+@@ -930,7 +932,7 @@ static int __vxlan_fdb_delete(struct vxl
+               goto out;
+       }
+-      vxlan_fdb_destroy(vxlan, f);
++      vxlan_fdb_destroy(vxlan, f, true);
+ out:
+       return 0;
+@@ -2393,7 +2395,7 @@ static void vxlan_cleanup(struct timer_l
+                                          "garbage collect %pM\n",
+                                          f->eth_addr);
+                               f->state = NUD_STALE;
+-                              vxlan_fdb_destroy(vxlan, f);
++                              vxlan_fdb_destroy(vxlan, f, true);
+                       } else if (time_before(timeout, next_timer))
+                               next_timer = timeout;
+               }
+@@ -2444,7 +2446,7 @@ static void vxlan_fdb_delete_default(str
+       spin_lock_bh(&vxlan->hash_lock);
+       f = __vxlan_find_mac(vxlan, all_zeros_mac, vni);
+       if (f)
+-              vxlan_fdb_destroy(vxlan, f);
++              vxlan_fdb_destroy(vxlan, f, true);
+       spin_unlock_bh(&vxlan->hash_lock);
+ }
+@@ -2498,7 +2500,7 @@ static void vxlan_flush(struct vxlan_dev
+                               continue;
+                       /* the all_zeros_mac entry is deleted at vxlan_uninit */
+                       if (!is_zero_ether_addr(f->eth_addr))
+-                              vxlan_fdb_destroy(vxlan, f);
++                              vxlan_fdb_destroy(vxlan, f, true);
+               }
+       }
+       spin_unlock_bh(&vxlan->hash_lock);