From: Greg Kroah-Hartman Date: Fri, 27 Jul 2018 06:33:10 +0000 (+0200) Subject: 4.17-stable patches X-Git-Tag: v3.18.117~12 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7dfb01171e863b698eb4f1134220da91afe4bf1a;p=thirdparty%2Fkernel%2Fstable-queue.git 4.17-stable patches added patches: bonding-set-default-miimon-value-for-non-arp-modes-if-not-set.patch clk-meson-gxbb-set-fclk_div2-as-clk_is_critical.patch ip-hash-fragments-consistently.patch ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch multicast-do-not-restore-deleted-record-source-filter-mode-to-new-one.patch net-dsa-mv88e6xxx-fix-races-between-lock-and-irq-freeing.patch net-ipv6-fix-linklocal-to-global-address-with-vrf.patch net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch net-mlx5-adjust-clock-overflow-work-period.patch net-mlx5-e-switch-ubsan-fix-undefined-behavior-in-mlx5_eswitch_mode.patch net-mlx5e-add-ingress-egress-indication-for-offloaded-tc-flows.patch net-mlx5e-don-t-allow-arfs-for-encapsulated-packets.patch net-mlx5e-fix-quota-counting-in-arfs-expire-flow.patch net-mlx5e-only-allow-offloading-decap-egress-egdev-flows.patch net-mlx5e-refine-ets-validation-function.patch net-next-hinic-fix-a-problem-in-hinic_xmit_frame.patch net-phy-consider-phy_ignore_interrupt-in-phy_start_aneg_priv.patch net-skb_segment-should-not-return-null.patch nfp-flower-ensure-dead-neighbour-entries-are-not-offloaded.patch r8169-restore-previous-behavior-to-accept-bios-wol-settings.patch rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch sock-fix-sg-page-frag-coalescing-in-sk_alloc_sg.patch tcp-add-tcp_ooo_try_coalesce-helper.patch tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch tcp-fix-dctcp-delayed-ack-schedule.patch 
tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch tcp-helpers-to-send-special-dctcp-ack.patch tls-check-rcv_shutdown-in-tls_wait_data.patch vxlan-add-new-fdb-alloc-and-create-helpers.patch vxlan-fix-default-fdb-entry-netlink-notify-ordering-during-netdev-create.patch vxlan-make-netlink-notify-in-vxlan_fdb_destroy-optional.patch --- diff --git a/queue-4.17/bonding-set-default-miimon-value-for-non-arp-modes-if-not-set.patch b/queue-4.17/bonding-set-default-miimon-value-for-non-arp-modes-if-not-set.patch new file mode 100644 index 00000000000..d8e0bfbcf56 --- /dev/null +++ b/queue-4.17/bonding-set-default-miimon-value-for-non-arp-modes-if-not-set.patch @@ -0,0 +1,103 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Jarod Wilson +Date: Wed, 18 Jul 2018 14:49:36 -0400 +Subject: bonding: set default miimon value for non-arp modes if not set + +From: Jarod Wilson + +[ Upstream commit c1f897ce186a529a494441642125479d38727a3d ] + +For some time now, if you load the bonding driver and configure bond +parameters via sysfs using minimal config options, such as specifying +nothing but the mode, relying on defaults for everything else, modes +that cannot use arp monitoring (802.3ad, balance-tlb, balance-alb) all +wind up with both arp_interval=0 (as it should be) and miimon=0, which +means the miimon monitor thread never actually runs. This is particularly +problematic for 802.3ad. 
+ +For example, from an LNST recipe I've set up: + +$ modprobe bonding max_bonds=0" +$ echo "+t_bond0" > /sys/class/net/bonding_masters" +$ ip link set t_bond0 down" +$ echo "802.3ad" > /sys/class/net/t_bond0/bonding/mode" +$ ip link set ens1f1 down" +$ echo "+ens1f1" > /sys/class/net/t_bond0/bonding/slaves" +$ ip link set ens1f0 down" +$ echo "+ens1f0" > /sys/class/net/t_bond0/bonding/slaves" +$ ethtool -i t_bond0" +$ ip link set ens1f1 up" +$ ip link set ens1f0 up" +$ ip link set t_bond0 up" +$ ip addr add 192.168.9.1/24 dev t_bond0" +$ ip addr add 2002::1/64 dev t_bond0" + +This bond comes up okay, but things look slightly suspect in +/proc/net/bonding/t_bond0 output: + +$ grep -i mii /proc/net/bonding/t_bond0 +MII Status: up +MII Polling Interval (ms): 0 +MII Status: up +MII Status: up + +Now, pull a cable on one of the ports in the bond, then reconnect it, and +you'll see: + +Slave Interface: ens1f0 +MII Status: down +Speed: 1000 Mbps +Duplex: full + +I believe this became a major issue as of commit 4d2c0cda0744, which for +802.3ad bonds, sets slave->link = BOND_LINK_DOWN, with a comment about +relying on link monitoring via miimon to set it correctly, but since the +miimon work queue never runs, the link just stays marked down. + +If we simply tweak bond_option_mode_set() slightly, we can check for the +non-arp modes having no miimon value set, and insert BOND_DEFAULT_MIIMON, +which gets things back in full working order. This problem exists as far +back as 4.14, and might be worth fixing in all stable trees since, though +the work-around is to simply specify an miimon value yourself. + +Reported-by: Bob Ball +Signed-off-by: Jarod Wilson +Acked-by: Mahesh Bandewar +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/bonding/bond_options.c | 23 ++++++++++++++--------- + 1 file changed, 14 insertions(+), 9 deletions(-) + +--- a/drivers/net/bonding/bond_options.c ++++ b/drivers/net/bonding/bond_options.c +@@ -743,15 +743,20 @@ const struct bond_option *bond_opt_get(u + static int bond_option_mode_set(struct bonding *bond, + const struct bond_opt_value *newval) + { +- if (!bond_mode_uses_arp(newval->value) && bond->params.arp_interval) { +- netdev_dbg(bond->dev, "%s mode is incompatible with arp monitoring, start mii monitoring\n", +- newval->string); +- /* disable arp monitoring */ +- bond->params.arp_interval = 0; +- /* set miimon to default value */ +- bond->params.miimon = BOND_DEFAULT_MIIMON; +- netdev_dbg(bond->dev, "Setting MII monitoring interval to %d\n", +- bond->params.miimon); ++ if (!bond_mode_uses_arp(newval->value)) { ++ if (bond->params.arp_interval) { ++ netdev_dbg(bond->dev, "%s mode is incompatible with arp monitoring, start mii monitoring\n", ++ newval->string); ++ /* disable arp monitoring */ ++ bond->params.arp_interval = 0; ++ } ++ ++ if (!bond->params.miimon) { ++ /* set miimon to default value */ ++ bond->params.miimon = BOND_DEFAULT_MIIMON; ++ netdev_dbg(bond->dev, "Setting MII monitoring interval to %d\n", ++ bond->params.miimon); ++ } + } + + if (newval->value == BOND_MODE_ALB) diff --git a/queue-4.17/clk-meson-gxbb-set-fclk_div2-as-clk_is_critical.patch b/queue-4.17/clk-meson-gxbb-set-fclk_div2-as-clk_is_critical.patch new file mode 100644 index 00000000000..dc0959e37f3 --- /dev/null +++ b/queue-4.17/clk-meson-gxbb-set-fclk_div2-as-clk_is_critical.patch @@ -0,0 +1,57 @@ +From c987ac6f1f088663b6dad39281071aeb31d450a8 Mon Sep 17 00:00:00 2001 +From: Neil Armstrong +Date: Wed, 13 Jun 2018 14:20:21 +0200 +Subject: clk: meson-gxbb: set fclk_div2 as CLK_IS_CRITICAL + +From: Neil Armstrong + +commit c987ac6f1f088663b6dad39281071aeb31d450a8 upstream. 
+ +On Amlogic Meson GXBB & GXL platforms, the SCPI Cortex-M4 Co-Processor +seems to be dependent on the FCLK_DIV2 to be operational. + +The issue occurred since v4.17-rc1 by freezing the kernel boot when +the 'schedutil' cpufreq governor was selected as default : + + [ 12.071837] scpi_protocol scpi: SCP Protocol 0.0 Firmware 0.0.0 version + domain-0 init dvfs: 4 + [ 12.087757] hctosys: unable to open rtc device (rtc0) + [ 12.087907] cfg80211: Loading compiled-in X.509 certificates for regulatory database + [ 12.102241] cfg80211: Loaded X.509 cert 'sforshee: 00b28ddf47aef9cea7' + +But when disabling the MMC driver, the boot finished but cpufreq failed to +change the CPU frequency : + + [ 12.153045] cpufreq: __target_index: Failed to change cpu frequency: -5 + +A bisect between v4.16 and v4.16-rc1 gave +05f814402d61 ("clk: meson: add fdiv clock gates") to be the first bad commit. +This commit added support for the missing clock gates before the fixed PLL +fixed dividers (FCLK_DIVx) and the clock framework basically disabled +all the unused fixed dividers, thus disabled a critical clock path for +the SCPI Co-Processor. + +This patch simply sets the FCLK_DIV2 gate as critical to ensure +nobody can disable it. 
+ +Fixes: 05f814402d61 ("clk: meson: add fdiv clock gates") +Signed-off-by: Neil Armstrong +Tested-by: Kevin Hilman +[few corrections in the commit description] +Signed-off-by: Jerome Brunet +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/clk/meson/gxbb.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/clk/meson/gxbb.c ++++ b/drivers/clk/meson/gxbb.c +@@ -511,6 +511,7 @@ static struct clk_regmap gxbb_fclk_div2 + .ops = &clk_regmap_gate_ops, + .parent_names = (const char *[]){ "fclk_div2_div" }, + .num_parents = 1, ++ .flags = CLK_IS_CRITICAL, + }, + }; + diff --git a/queue-4.17/ip-hash-fragments-consistently.patch b/queue-4.17/ip-hash-fragments-consistently.patch new file mode 100644 index 00000000000..d363c4669cc --- /dev/null +++ b/queue-4.17/ip-hash-fragments-consistently.patch @@ -0,0 +1,73 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Paolo Abeni +Date: Mon, 23 Jul 2018 16:50:48 +0200 +Subject: ip: hash fragments consistently + +From: Paolo Abeni + +[ Upstream commit 3dd1c9a1270736029ffca670e9bd0265f4120600 ] + +The skb hash for locally generated ip[v6] fragments belonging +to the same datagram can vary in several circumstances: +* for connected UDP[v6] sockets, the first fragment get its hash + via set_owner_w()/skb_set_hash_from_sk() +* for unconnected IPv6 UDPv6 sockets, the first fragment can get + its hash via ip6_make_flowlabel()/skb_get_hash_flowi6(), if + auto_flowlabel is enabled + +For the following frags the hash is usually computed via +skb_get_hash(). +The above can cause OoO for unconnected IPv6 UDPv6 socket: in that +scenario the egress tx queue can be selected on a per packet basis +via the skb hash. +It may also fool flow-oriented schedulers to place fragments belonging +to the same datagram in different flows. + +Fix the issue by copying the skb hash from the head frag into +the others at fragmentation time. 
+ +Before this commit: +perf probe -a "dev_queue_xmit skb skb->hash skb->l4_hash:b1@0/8 skb->sw_hash:b1@1/8" +netperf -H $IPV4 -t UDP_STREAM -l 5 -- -m 2000 -n & +perf record -e probe:dev_queue_xmit -e probe:skb_set_owner_w -a sleep 0.1 +perf script +probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=3713014309 l4_hash=1 sw_hash=0 +probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=0 l4_hash=0 sw_hash=0 + +After this commit: +probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=2171763177 l4_hash=1 sw_hash=0 +probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=2171763177 l4_hash=1 sw_hash=0 + +Fixes: b73c3d0e4f0e ("net: Save TX flow hash in sock and set in skbuf on xmit") +Fixes: 67800f9b1f4e ("ipv6: Call skb_get_hash_flowi6 to get skb->hash in ip6_make_flowlabel") +Signed-off-by: Paolo Abeni +Reviewed-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ip_output.c | 2 ++ + net/ipv6/ip6_output.c | 2 ++ + 2 files changed, 4 insertions(+) + +--- a/net/ipv4/ip_output.c ++++ b/net/ipv4/ip_output.c +@@ -523,6 +523,8 @@ static void ip_copy_metadata(struct sk_b + to->dev = from->dev; + to->mark = from->mark; + ++ skb_copy_hash(to, from); ++ + /* Copy the flags to each fragment. 
*/ + IPCB(to)->flags = IPCB(from)->flags; + +--- a/net/ipv6/ip6_output.c ++++ b/net/ipv6/ip6_output.c +@@ -596,6 +596,8 @@ static void ip6_copy_metadata(struct sk_ + to->dev = from->dev; + to->mark = from->mark; + ++ skb_copy_hash(to, from); ++ + #ifdef CONFIG_NET_SCHED + to->tc_index = from->tc_index; + #endif diff --git a/queue-4.17/ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch b/queue-4.17/ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch new file mode 100644 index 00000000000..37d85ca123a --- /dev/null +++ b/queue-4.17/ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch @@ -0,0 +1,93 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Willem de Bruijn +Date: Mon, 23 Jul 2018 19:36:48 -0400 +Subject: ip: in cmsg IP(V6)_ORIGDSTADDR call pskb_may_pull + +From: Willem de Bruijn + +[ Upstream commit 2efd4fca703a6707cad16ab486eaab8fc7f0fd49 ] + +Syzbot reported a read beyond the end of the skb head when returning +IPV6_ORIGDSTADDR: + + BUG: KMSAN: kernel-infoleak in put_cmsg+0x5ef/0x860 net/core/scm.c:242 + CPU: 0 PID: 4501 Comm: syz-executor128 Not tainted 4.17.0+ #9 + Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS + Google 01/01/2011 + Call Trace: + __dump_stack lib/dump_stack.c:77 [inline] + dump_stack+0x185/0x1d0 lib/dump_stack.c:113 + kmsan_report+0x188/0x2a0 mm/kmsan/kmsan.c:1125 + kmsan_internal_check_memory+0x138/0x1f0 mm/kmsan/kmsan.c:1219 + kmsan_copy_to_user+0x7a/0x160 mm/kmsan/kmsan.c:1261 + copy_to_user include/linux/uaccess.h:184 [inline] + put_cmsg+0x5ef/0x860 net/core/scm.c:242 + ip6_datagram_recv_specific_ctl+0x1cf3/0x1eb0 net/ipv6/datagram.c:719 + ip6_datagram_recv_ctl+0x41c/0x450 net/ipv6/datagram.c:733 + rawv6_recvmsg+0x10fb/0x1460 net/ipv6/raw.c:521 + [..] + +This logic and its ipv4 counterpart read the destination port from +the packet at skb_transport_offset(skb) + 4. 
+ +With MSG_MORE and a local SOCK_RAW sender, syzbot was able to cook a +packet that stores headers exactly up to skb_transport_offset(skb) in +the head and the remainder in a frag. + +Call pskb_may_pull before accessing the pointer to ensure that it lies +in skb head. + +Link: http://lkml.kernel.org/r/CAF=yD-LEJwZj5a1-bAAj2Oy_hKmGygV6rsJ_WOrAYnv-fnayiQ@mail.gmail.com +Reported-by: syzbot+9adb4b567003cac781f0@syzkaller.appspotmail.com +Signed-off-by: Willem de Bruijn +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ip_sockglue.c | 7 +++++-- + net/ipv6/datagram.c | 7 +++++-- + 2 files changed, 10 insertions(+), 4 deletions(-) + +--- a/net/ipv4/ip_sockglue.c ++++ b/net/ipv4/ip_sockglue.c +@@ -148,15 +148,18 @@ static void ip_cmsg_recv_dstaddr(struct + { + struct sockaddr_in sin; + const struct iphdr *iph = ip_hdr(skb); +- __be16 *ports = (__be16 *)skb_transport_header(skb); ++ __be16 *ports; ++ int end; + +- if (skb_transport_offset(skb) + 4 > (int)skb->len) ++ end = skb_transport_offset(skb) + 4; ++ if (end > 0 && !pskb_may_pull(skb, end)) + return; + + /* All current transport protocols have the port numbers in the + * first four bytes of the transport header and this function is + * written with this assumption in mind. + */ ++ ports = (__be16 *)skb_transport_header(skb); + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = iph->daddr; +--- a/net/ipv6/datagram.c ++++ b/net/ipv6/datagram.c +@@ -700,13 +700,16 @@ void ip6_datagram_recv_specific_ctl(stru + } + if (np->rxopt.bits.rxorigdstaddr) { + struct sockaddr_in6 sin6; +- __be16 *ports = (__be16 *) skb_transport_header(skb); ++ __be16 *ports; ++ int end; + +- if (skb_transport_offset(skb) + 4 <= (int)skb->len) { ++ end = skb_transport_offset(skb) + 4; ++ if (end <= 0 || pskb_may_pull(skb, end)) { + /* All current transport protocols have the port numbers in the + * first four bytes of the transport header and this function is + * written with this assumption in mind. 
+ */ ++ ports = (__be16 *)skb_transport_header(skb); + + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = ipv6_hdr(skb)->daddr; diff --git a/queue-4.17/multicast-do-not-restore-deleted-record-source-filter-mode-to-new-one.patch b/queue-4.17/multicast-do-not-restore-deleted-record-source-filter-mode-to-new-one.patch new file mode 100644 index 00000000000..d87fd8c6db3 --- /dev/null +++ b/queue-4.17/multicast-do-not-restore-deleted-record-source-filter-mode-to-new-one.patch @@ -0,0 +1,56 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Hangbin Liu +Date: Fri, 20 Jul 2018 14:04:27 +0800 +Subject: multicast: do not restore deleted record source filter mode to new one + +From: Hangbin Liu + +There are two scenarios that we will restore deleted records. The first is +when device down and up(or unmap/remap). In this scenario the new filter +mode is same with previous one. Because we get it from in_dev->mc_list and +we do not touch it during device down and up. + +The other scenario is when a new socket join a group which was just delete +and not finish sending status reports. In this scenario, we should use the +current filter mode instead of restore old one. Here are 4 cases in total. + +old_socket new_socket before_fix after_fix + IN(A) IN(A) ALLOW(A) ALLOW(A) + IN(A) EX( ) TO_IN( ) TO_EX( ) + EX( ) IN(A) TO_EX( ) ALLOW(A) + EX( ) EX( ) TO_EX( ) TO_EX( ) + +Fixes: 24803f38a5c0b (igmp: do not remove igmp souce list info when set link down) +Fixes: 1666d49e1d416 (mld: do not remove mld souce list info when set link down) +Signed-off-by: Hangbin Liu +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/igmp.c | 3 +-- + net/ipv6/mcast.c | 3 +-- + 2 files changed, 2 insertions(+), 4 deletions(-) + +--- a/net/ipv4/igmp.c ++++ b/net/ipv4/igmp.c +@@ -1201,8 +1201,7 @@ static void igmpv3_del_delrec(struct in_ + if (pmc) { + im->interface = pmc->interface; + im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv; +- im->sfmode = pmc->sfmode; +- if (pmc->sfmode == MCAST_INCLUDE) { ++ if (im->sfmode == MCAST_INCLUDE) { + im->tomb = pmc->tomb; + im->sources = pmc->sources; + for (psf = im->sources; psf; psf = psf->sf_next) +--- a/net/ipv6/mcast.c ++++ b/net/ipv6/mcast.c +@@ -771,8 +771,7 @@ static void mld_del_delrec(struct inet6_ + if (pmc) { + im->idev = pmc->idev; + im->mca_crcount = idev->mc_qrv; +- im->mca_sfmode = pmc->mca_sfmode; +- if (pmc->mca_sfmode == MCAST_INCLUDE) { ++ if (im->mca_sfmode == MCAST_INCLUDE) { + im->mca_tomb = pmc->mca_tomb; + im->mca_sources = pmc->mca_sources; + for (psf = im->mca_sources; psf; psf = psf->sf_next) diff --git a/queue-4.17/net-dsa-mv88e6xxx-fix-races-between-lock-and-irq-freeing.patch b/queue-4.17/net-dsa-mv88e6xxx-fix-races-between-lock-and-irq-freeing.patch new file mode 100644 index 00000000000..e1aeb00bc7e --- /dev/null +++ b/queue-4.17/net-dsa-mv88e6xxx-fix-races-between-lock-and-irq-freeing.patch @@ -0,0 +1,98 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: "Uwe Kleine-König" +Date: Fri, 20 Jul 2018 11:53:15 +0200 +Subject: net: dsa: mv88e6xxx: fix races between lock and irq freeing + +From: "Uwe Kleine-König" + +[ Upstream commit 3d82475ad46c0b65f2618b5f2bbb4cadbb5ac5d8 ] + +free_irq() waits until all handlers for this IRQ have completed. As the +relevant handler (mv88e6xxx_g1_irq_thread_fn()) takes the chip's reg_lock +it might never return if the thread calling free_irq() holds this lock. + +For the same reason kthread_cancel_delayed_work_sync() in the polling case +must not hold this lock. 
+ +Also first free the irq (or stop the worker respectively) such that +mv88e6xxx_g1_irq_thread_work() isn't called any more before the irq +mappings are dropped in mv88e6xxx_g1_irq_free_common() to prevent the +worker thread to call handle_nested_irq(0) which results in a NULL-pointer +exception. + +Signed-off-by: Uwe Kleine-König +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/dsa/mv88e6xxx/chip.c | 21 +++++++++++++-------- + 1 file changed, 13 insertions(+), 8 deletions(-) + +--- a/drivers/net/dsa/mv88e6xxx/chip.c ++++ b/drivers/net/dsa/mv88e6xxx/chip.c +@@ -341,6 +341,7 @@ static const struct irq_domain_ops mv88e + .xlate = irq_domain_xlate_twocell, + }; + ++/* To be called with reg_lock held */ + static void mv88e6xxx_g1_irq_free_common(struct mv88e6xxx_chip *chip) + { + int irq, virq; +@@ -360,9 +361,15 @@ static void mv88e6xxx_g1_irq_free_common + + static void mv88e6xxx_g1_irq_free(struct mv88e6xxx_chip *chip) + { +- mv88e6xxx_g1_irq_free_common(chip); +- ++ /* ++ * free_irq must be called without reg_lock taken because the irq ++ * handler takes this lock, too. 
++ */ + free_irq(chip->irq, chip); ++ ++ mutex_lock(&chip->reg_lock); ++ mv88e6xxx_g1_irq_free_common(chip); ++ mutex_unlock(&chip->reg_lock); + } + + static int mv88e6xxx_g1_irq_setup_common(struct mv88e6xxx_chip *chip) +@@ -467,10 +474,12 @@ static int mv88e6xxx_irq_poll_setup(stru + + static void mv88e6xxx_irq_poll_free(struct mv88e6xxx_chip *chip) + { +- mv88e6xxx_g1_irq_free_common(chip); +- + kthread_cancel_delayed_work_sync(&chip->irq_poll_work); + kthread_destroy_worker(chip->kworker); ++ ++ mutex_lock(&chip->reg_lock); ++ mv88e6xxx_g1_irq_free_common(chip); ++ mutex_unlock(&chip->reg_lock); + } + + int mv88e6xxx_wait(struct mv88e6xxx_chip *chip, int addr, int reg, u16 mask) +@@ -4286,12 +4295,10 @@ out_g2_irq: + if (chip->info->g2_irqs > 0) + mv88e6xxx_g2_irq_free(chip); + out_g1_irq: +- mutex_lock(&chip->reg_lock); + if (chip->irq > 0) + mv88e6xxx_g1_irq_free(chip); + else + mv88e6xxx_irq_poll_free(chip); +- mutex_unlock(&chip->reg_lock); + out: + return err; + } +@@ -4316,12 +4323,10 @@ static void mv88e6xxx_remove(struct mdio + if (chip->info->g2_irqs > 0) + mv88e6xxx_g2_irq_free(chip); + +- mutex_lock(&chip->reg_lock); + if (chip->irq > 0) + mv88e6xxx_g1_irq_free(chip); + else + mv88e6xxx_irq_poll_free(chip); +- mutex_unlock(&chip->reg_lock); + } + + static const struct of_device_id mv88e6xxx_of_match[] = { diff --git a/queue-4.17/net-ipv6-fix-linklocal-to-global-address-with-vrf.patch b/queue-4.17/net-ipv6-fix-linklocal-to-global-address-with-vrf.patch new file mode 100644 index 00000000000..3a58b5e160a --- /dev/null +++ b/queue-4.17/net-ipv6-fix-linklocal-to-global-address-with-vrf.patch @@ -0,0 +1,93 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: David Ahern +Date: Thu, 19 Jul 2018 12:41:18 -0700 +Subject: net/ipv6: Fix linklocal to global address with VRF + +From: David Ahern + +[ Upstream commit 24b711edfc34bc45777a3f068812b7d1ed004a5d ] + +Example setup: + host: ip -6 addr add dev eth1 2001:db8:104::4 + where eth1 is enslaved to a VRF + + 
switch: ip -6 ro add 2001:db8:104::4/128 dev br1 + where br1 only has an LLA + + ping6 2001:db8:104::4 + ssh 2001:db8:104::4 + +(NOTE: UDP works fine if the PKTINFO has the address set to the global +address and ifindex is set to the index of eth1 with a destination an +LLA). + +For ICMP, icmp6_iif needs to be updated to check if skb->dev is an +L3 master. If it is then return the ifindex from rt6i_idev similar +to what is done for loopback. + +For TCP, restore the original tcp_v6_iif definition which is needed in +most places and add a new tcp_v6_iif_l3_slave that considers the +l3_slave variability. This latter check is only needed for socket +lookups. + +Fixes: 9ff74384600a ("net: vrf: Handle ipv6 multicast and link-local addresses") +Signed-off-by: David Ahern +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/tcp.h | 5 +++++ + net/ipv6/icmp.c | 5 +++-- + net/ipv6/tcp_ipv6.c | 6 ++++-- + 3 files changed, 12 insertions(+), 4 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -829,6 +829,11 @@ struct tcp_skb_cb { + */ + static inline int tcp_v6_iif(const struct sk_buff *skb) + { ++ return TCP_SKB_CB(skb)->header.h6.iif; ++} ++ ++static inline int tcp_v6_iif_l3_slave(const struct sk_buff *skb) ++{ + bool l3_slave = ipv6_l3mdev_skb(TCP_SKB_CB(skb)->header.h6.flags); + + return l3_slave ? skb->skb_iif : TCP_SKB_CB(skb)->header.h6.iif; +--- a/net/ipv6/icmp.c ++++ b/net/ipv6/icmp.c +@@ -402,9 +402,10 @@ static int icmp6_iif(const struct sk_buf + + /* for local traffic to local address, skb dev is the loopback + * device. Check if there is a dst attached to the skb and if so +- * get the real device index. ++ * get the real device index. 
Same is needed for replies to a link ++ * local address on a device enslaved to an L3 master device + */ +- if (unlikely(iif == LOOPBACK_IFINDEX)) { ++ if (unlikely(iif == LOOPBACK_IFINDEX || netif_is_l3_master(skb->dev))) { + const struct rt6_info *rt6 = skb_rt6_info(skb); + + if (rt6) +--- a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -934,7 +934,8 @@ static void tcp_v6_send_reset(const stru + &tcp_hashinfo, NULL, 0, + &ipv6h->saddr, + th->source, &ipv6h->daddr, +- ntohs(th->source), tcp_v6_iif(skb), ++ ntohs(th->source), ++ tcp_v6_iif_l3_slave(skb), + tcp_v6_sdif(skb)); + if (!sk1) + goto out; +@@ -1605,7 +1606,8 @@ do_time_wait: + skb, __tcp_hdrlen(th), + &ipv6_hdr(skb)->saddr, th->source, + &ipv6_hdr(skb)->daddr, +- ntohs(th->dest), tcp_v6_iif(skb), ++ ntohs(th->dest), ++ tcp_v6_iif_l3_slave(skb), + sdif); + if (sk2) { + struct inet_timewait_sock *tw = inet_twsk(sk); diff --git a/queue-4.17/net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch b/queue-4.17/net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch new file mode 100644 index 00000000000..3a9a587114d --- /dev/null +++ b/queue-4.17/net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch @@ -0,0 +1,40 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Jack Morgenstein +Date: Tue, 24 Jul 2018 14:27:55 +0300 +Subject: net/mlx4_core: Save the qpn from the input modifier in RST2INIT wrapper + +From: Jack Morgenstein + +[ Upstream commit 958c696f5a7274d9447a458ad7aa70719b29a50a ] + +Function mlx4_RST2INIT_QP_wrapper saved the qp number passed in the qp +context, rather than the one passed in the input modifier. + +However, the qp number in the qp context is not defined as a +required parameter by the FW. Therefore, drivers may choose to not +specify the qp number in the qp context for the reset-to-init transition. + +Thus, we must save the qp number passed in the command input modifier -- +which is always present. 
(This saved qp number is used as the input +modifier for command 2RST_QP when a slave's qp's are destroyed). + +Fixes: c82e9aa0a8bc ("mlx4_core: resource tracking for HCA resources used by guests") +Signed-off-by: Jack Morgenstein +Signed-off-by: Tariq Toukan +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx4/resource_tracker.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c ++++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +@@ -2956,7 +2956,7 @@ int mlx4_RST2INIT_QP_wrapper(struct mlx4 + u32 srqn = qp_get_srqn(qpc) & 0xffffff; + int use_srq = (qp_get_srqn(qpc) >> 24) & 1; + struct res_srq *srq; +- int local_qpn = be32_to_cpu(qpc->local_qpn) & 0xffffff; ++ int local_qpn = vhcr->in_modifier & 0xffffff; + + err = adjust_qp_sched_queue(dev, slave, qpc, inbox); + if (err) diff --git a/queue-4.17/net-mlx5-adjust-clock-overflow-work-period.patch b/queue-4.17/net-mlx5-adjust-clock-overflow-work-period.patch new file mode 100644 index 00000000000..a3d7520f773 --- /dev/null +++ b/queue-4.17/net-mlx5-adjust-clock-overflow-work-period.patch @@ -0,0 +1,70 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Ariel Levkovich +Date: Mon, 25 Jun 2018 19:12:02 +0300 +Subject: net/mlx5: Adjust clock overflow work period + +From: Ariel Levkovich + +[ Upstream commit 33180bee86a8940a84950edca46315cd9dd6deb5 ] + +When driver converts HW timestamp to wall clock time it subtracts +the last saved cycle counter from the HW timestamp and converts the +difference to nanoseconds. +The conversion is done by multiplying the cycles difference with the +clock multiplier value as a first step and therefore the cycles +difference should be small enough so that the multiplication product +doesn't exceed 64bit. 
+ +The overflow handling routine is in charge of updating the last saved +cycle counter in driver and it is called periodically using kernel +delayed workqueue. + +The delay period for this work is calculated using the max HW cycle +counter value (a 41 bit mask) as a base which doesn't take the 64bit +limit into account so the delay period may be incorrect and too +long to prevent a large difference between the HW counter and the last +saved counter in SW. + +This change adjusts the work period for the HW clock overflow work by +taking the minimum between the previous value and the quotient of max +u64 value and the clock multiplier value. + +Fixes: ef9814deafd0 ("net/mlx5e: Add HW timestamping (TS) support") +Signed-off-by: Ariel Levkovich +Reviewed-by: Eran Ben Elisha +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c | 12 ++++++++++-- + 1 file changed, 10 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/clock.c +@@ -487,6 +487,7 @@ void mlx5_pps_event(struct mlx5_core_dev + void mlx5_init_clock(struct mlx5_core_dev *mdev) + { + struct mlx5_clock *clock = &mdev->clock; ++ u64 overflow_cycles; + u64 ns; + u64 frac = 0; + u32 dev_freq; +@@ -510,10 +511,17 @@ void mlx5_init_clock(struct mlx5_core_de + + /* Calculate period in seconds to call the overflow watchdog - to make + * sure counter is checked at least once every wrap around. ++ * The period is calculated as the minimum between max HW cycles count ++ * (The clock source mask) and max amount of cycles that can be ++ * multiplied by clock multiplier where the result doesn't exceed ++ * 64bits. 
+ */ +- ns = cyclecounter_cyc2ns(&clock->cycles, clock->cycles.mask, ++ overflow_cycles = div64_u64(~0ULL >> 1, clock->cycles.mult); ++ overflow_cycles = min(overflow_cycles, clock->cycles.mask >> 1); ++ ++ ns = cyclecounter_cyc2ns(&clock->cycles, overflow_cycles, + frac, &frac); +- do_div(ns, NSEC_PER_SEC / 2 / HZ); ++ do_div(ns, NSEC_PER_SEC / HZ); + clock->overflow_period = ns; + + mdev->clock_info_page = alloc_page(GFP_KERNEL); diff --git a/queue-4.17/net-mlx5-e-switch-ubsan-fix-undefined-behavior-in-mlx5_eswitch_mode.patch b/queue-4.17/net-mlx5-e-switch-ubsan-fix-undefined-behavior-in-mlx5_eswitch_mode.patch new file mode 100644 index 00000000000..83fca61b84d --- /dev/null +++ b/queue-4.17/net-mlx5-e-switch-ubsan-fix-undefined-behavior-in-mlx5_eswitch_mode.patch @@ -0,0 +1,50 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Saeed Mahameed +Date: Mon, 9 Jul 2018 16:41:40 -0700 +Subject: net/mlx5: E-Switch, UBSAN fix undefined behavior in mlx5_eswitch_mode + +From: Saeed Mahameed + +[ Upstream commit 443a858158d35916e572b75667ca4924a6af2182 ] + +With debug kernel UBSAN detects the following issue, which might happen +when eswitch instance is not created, fix this by testing the eswitch +pointer before returning the eswitch mode, if not set return mode = +SRIOV_NONE. + +[ 32.528951] UBSAN: Undefined behaviour in drivers/net/ethernet/mellanox/mlx5/core/eswitch.c:2219:12 +[ 32.528951] member access within null pointer of type 'struct mlx5_eswitch' +[ 32.528951] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 4.18.0-rc3-dirty #181 +[ 32.528951] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.11.0-0-g63451fca13-prebuilt.qemu-project.org 04/01/2014 +[ 32.528951] Call Trace: +[ 32.528951] dump_stack+0xc7/0x13b +[ 32.528951] ? show_regs_print_info+0x5/0x5 +[ 32.528951] ? __pm_runtime_use_autosuspend+0x140/0x140 +[ 32.528951] ubsan_epilogue+0x9/0x49 +[ 32.528951] ubsan_type_mismatch_common+0x1f9/0x2c0 +[ 32.528951] ? 
ucs2_as_utf8+0x310/0x310 +[ 32.528951] ? device_initialize+0x229/0x2e0 +[ 32.528951] __ubsan_handle_type_mismatch+0x9f/0xc9 +[ 32.528951] ? __ubsan_handle_divrem_overflow+0x19b/0x19b +[ 32.578008] ? ib_device_get_by_index+0xf0/0xf0 +[ 32.578008] mlx5_eswitch_mode+0x30/0x40 +[ 32.578008] mlx5_ib_add+0x1e0/0x4a0 + +Fixes: 57cbd893c4c5 ("net/mlx5: E-Switch, Move representors definition to a global scope") +Signed-off-by: Saeed Mahameed +Reviewed-by: Leon Romanovsky +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/eswitch.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch.c +@@ -2221,6 +2221,6 @@ free_out: + + u8 mlx5_eswitch_mode(struct mlx5_eswitch *esw) + { +- return esw->mode; ++ return ESW_ALLOWED(esw) ? esw->mode : SRIOV_NONE; + } + EXPORT_SYMBOL_GPL(mlx5_eswitch_mode); diff --git a/queue-4.17/net-mlx5e-add-ingress-egress-indication-for-offloaded-tc-flows.patch b/queue-4.17/net-mlx5e-add-ingress-egress-indication-for-offloaded-tc-flows.patch new file mode 100644 index 00000000000..04942c827d9 --- /dev/null +++ b/queue-4.17/net-mlx5e-add-ingress-egress-indication-for-offloaded-tc-flows.patch @@ -0,0 +1,266 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Or Gerlitz +Date: Wed, 18 Apr 2018 13:45:11 +0300 +Subject: net/mlx5e: Add ingress/egress indication for offloaded TC flows + +From: Or Gerlitz + +[ Upstream commit 60bd4af814fec164c42bdd2efd7984b85d6b1e1e ] + +When an e-switch TC rule is offloaded through the egdev (egress +device) mechanism, we treat this as egress, all other cases (NIC +and e-switch) are considred ingress. + +This is preparation step that will allow us to identify "wrong" +stat/del offload calls made by the TC core on egdev based flows and +ignore them. 
+ +Signed-off-by: Or Gerlitz +Signed-off-by: Jiri Pirko +Reviewed-by: Paul Blakey +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en.h | 3 - + drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 15 ++++---- + drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 32 +++++++++++++----- + drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 38 ++++++++++++++++------ + drivers/net/ethernet/mellanox/mlx5/core/en_tc.h | 13 +++++-- + 5 files changed, 70 insertions(+), 31 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h +@@ -1092,9 +1092,6 @@ int mlx5e_ethtool_get_ts_info(struct mlx + int mlx5e_ethtool_flash_device(struct mlx5e_priv *priv, + struct ethtool_flash *flash); + +-int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, +- void *cb_priv); +- + /* mlx5e generic netdev management API */ + struct net_device* + mlx5e_create_netdev(struct mlx5_core_dev *mdev, const struct mlx5e_profile *profile, +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +@@ -3093,22 +3093,23 @@ out: + + #ifdef CONFIG_MLX5_ESWITCH + static int mlx5e_setup_tc_cls_flower(struct mlx5e_priv *priv, +- struct tc_cls_flower_offload *cls_flower) ++ struct tc_cls_flower_offload *cls_flower, ++ int flags) + { + switch (cls_flower->command) { + case TC_CLSFLOWER_REPLACE: +- return mlx5e_configure_flower(priv, cls_flower); ++ return mlx5e_configure_flower(priv, cls_flower, flags); + case TC_CLSFLOWER_DESTROY: +- return mlx5e_delete_flower(priv, cls_flower); ++ return mlx5e_delete_flower(priv, cls_flower, flags); + case TC_CLSFLOWER_STATS: +- return mlx5e_stats_flower(priv, cls_flower); ++ return mlx5e_stats_flower(priv, cls_flower, flags); + default: + return -EOPNOTSUPP; + } + } + +-int mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, +- void *cb_priv) ++static int 
mlx5e_setup_tc_block_cb(enum tc_setup_type type, void *type_data, ++ void *cb_priv) + { + struct mlx5e_priv *priv = cb_priv; + +@@ -3117,7 +3118,7 @@ int mlx5e_setup_tc_block_cb(enum tc_setu + + switch (type) { + case TC_SETUP_CLSFLOWER: +- return mlx5e_setup_tc_cls_flower(priv, type_data); ++ return mlx5e_setup_tc_cls_flower(priv, type_data, MLX5E_TC_INGRESS); + default: + return -EOPNOTSUPP; + } +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +@@ -723,15 +723,31 @@ static int mlx5e_rep_get_phys_port_name( + + static int + mlx5e_rep_setup_tc_cls_flower(struct mlx5e_priv *priv, +- struct tc_cls_flower_offload *cls_flower) ++ struct tc_cls_flower_offload *cls_flower, int flags) + { + switch (cls_flower->command) { + case TC_CLSFLOWER_REPLACE: +- return mlx5e_configure_flower(priv, cls_flower); ++ return mlx5e_configure_flower(priv, cls_flower, flags); + case TC_CLSFLOWER_DESTROY: +- return mlx5e_delete_flower(priv, cls_flower); ++ return mlx5e_delete_flower(priv, cls_flower, flags); + case TC_CLSFLOWER_STATS: +- return mlx5e_stats_flower(priv, cls_flower); ++ return mlx5e_stats_flower(priv, cls_flower, flags); ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++static int mlx5e_rep_setup_tc_cb_egdev(enum tc_setup_type type, void *type_data, ++ void *cb_priv) ++{ ++ struct mlx5e_priv *priv = cb_priv; ++ ++ if (!tc_cls_can_offload_and_chain0(priv->netdev, type_data)) ++ return -EOPNOTSUPP; ++ ++ switch (type) { ++ case TC_SETUP_CLSFLOWER: ++ return mlx5e_rep_setup_tc_cls_flower(priv, type_data, MLX5E_TC_EGRESS); + default: + return -EOPNOTSUPP; + } +@@ -747,7 +763,7 @@ static int mlx5e_rep_setup_tc_cb(enum tc + + switch (type) { + case TC_SETUP_CLSFLOWER: +- return mlx5e_rep_setup_tc_cls_flower(priv, type_data); ++ return mlx5e_rep_setup_tc_cls_flower(priv, type_data, MLX5E_TC_INGRESS); + default: + return -EOPNOTSUPP; + } +@@ -1111,7 +1127,7 @@ mlx5e_vport_rep_load(struct mlx5_core_de + + uplink_rpriv = 
mlx5_eswitch_get_uplink_priv(dev->priv.eswitch, REP_ETH); + upriv = netdev_priv(uplink_rpriv->netdev); +- err = tc_setup_cb_egdev_register(netdev, mlx5e_setup_tc_block_cb, ++ err = tc_setup_cb_egdev_register(netdev, mlx5e_rep_setup_tc_cb_egdev, + upriv); + if (err) + goto err_neigh_cleanup; +@@ -1126,7 +1142,7 @@ mlx5e_vport_rep_load(struct mlx5_core_de + return 0; + + err_egdev_cleanup: +- tc_setup_cb_egdev_unregister(netdev, mlx5e_setup_tc_block_cb, ++ tc_setup_cb_egdev_unregister(netdev, mlx5e_rep_setup_tc_cb_egdev, + upriv); + + err_neigh_cleanup: +@@ -1155,7 +1171,7 @@ mlx5e_vport_rep_unload(struct mlx5_eswit + uplink_rpriv = mlx5_eswitch_get_uplink_priv(priv->mdev->priv.eswitch, + REP_ETH); + upriv = netdev_priv(uplink_rpriv->netdev); +- tc_setup_cb_egdev_unregister(netdev, mlx5e_setup_tc_block_cb, ++ tc_setup_cb_egdev_unregister(netdev, mlx5e_rep_setup_tc_cb_egdev, + upriv); + mlx5e_rep_neigh_cleanup(rpriv); + mlx5e_detach_netdev(priv); +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +@@ -61,12 +61,16 @@ struct mlx5_nic_flow_attr { + struct mlx5_flow_table *hairpin_ft; + }; + ++#define MLX5E_TC_FLOW_BASE (MLX5E_TC_LAST_EXPORTED_BIT + 1) ++ + enum { +- MLX5E_TC_FLOW_ESWITCH = BIT(0), +- MLX5E_TC_FLOW_NIC = BIT(1), +- MLX5E_TC_FLOW_OFFLOADED = BIT(2), +- MLX5E_TC_FLOW_HAIRPIN = BIT(3), +- MLX5E_TC_FLOW_HAIRPIN_RSS = BIT(4), ++ MLX5E_TC_FLOW_INGRESS = MLX5E_TC_INGRESS, ++ MLX5E_TC_FLOW_EGRESS = MLX5E_TC_EGRESS, ++ MLX5E_TC_FLOW_ESWITCH = BIT(MLX5E_TC_FLOW_BASE), ++ MLX5E_TC_FLOW_NIC = BIT(MLX5E_TC_FLOW_BASE + 1), ++ MLX5E_TC_FLOW_OFFLOADED = BIT(MLX5E_TC_FLOW_BASE + 2), ++ MLX5E_TC_FLOW_HAIRPIN = BIT(MLX5E_TC_FLOW_BASE + 3), ++ MLX5E_TC_FLOW_HAIRPIN_RSS = BIT(MLX5E_TC_FLOW_BASE + 4), + }; + + struct mlx5e_tc_flow { +@@ -2566,8 +2570,20 @@ static int parse_tc_fdb_actions(struct m + return err; + } + ++static void get_flags(int flags, u8 *flow_flags) ++{ ++ u8 __flow_flags = 0; ++ ++ if (flags & 
MLX5E_TC_INGRESS) ++ __flow_flags |= MLX5E_TC_FLOW_INGRESS; ++ if (flags & MLX5E_TC_EGRESS) ++ __flow_flags |= MLX5E_TC_FLOW_EGRESS; ++ ++ *flow_flags = __flow_flags; ++} ++ + int mlx5e_configure_flower(struct mlx5e_priv *priv, +- struct tc_cls_flower_offload *f) ++ struct tc_cls_flower_offload *f, int flags) + { + struct mlx5_eswitch *esw = priv->mdev->priv.eswitch; + struct mlx5e_tc_flow_parse_attr *parse_attr; +@@ -2576,11 +2592,13 @@ int mlx5e_configure_flower(struct mlx5e_ + int attr_size, err = 0; + u8 flow_flags = 0; + ++ get_flags(flags, &flow_flags); ++ + if (esw && esw->mode == SRIOV_OFFLOADS) { +- flow_flags = MLX5E_TC_FLOW_ESWITCH; ++ flow_flags |= MLX5E_TC_FLOW_ESWITCH; + attr_size = sizeof(struct mlx5_esw_flow_attr); + } else { +- flow_flags = MLX5E_TC_FLOW_NIC; ++ flow_flags |= MLX5E_TC_FLOW_NIC; + attr_size = sizeof(struct mlx5_nic_flow_attr); + } + +@@ -2639,7 +2657,7 @@ err_free: + } + + int mlx5e_delete_flower(struct mlx5e_priv *priv, +- struct tc_cls_flower_offload *f) ++ struct tc_cls_flower_offload *f, int flags) + { + struct mlx5e_tc_flow *flow; + struct mlx5e_tc_table *tc = &priv->fs.tc; +@@ -2659,7 +2677,7 @@ int mlx5e_delete_flower(struct mlx5e_pri + } + + int mlx5e_stats_flower(struct mlx5e_priv *priv, +- struct tc_cls_flower_offload *f) ++ struct tc_cls_flower_offload *f, int flags) + { + struct mlx5e_tc_table *tc = &priv->fs.tc; + struct mlx5e_tc_flow *flow; +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.h +@@ -38,16 +38,23 @@ + #define MLX5E_TC_FLOW_ID_MASK 0x0000ffff + + #ifdef CONFIG_MLX5_ESWITCH ++ ++enum { ++ MLX5E_TC_INGRESS = BIT(0), ++ MLX5E_TC_EGRESS = BIT(1), ++ MLX5E_TC_LAST_EXPORTED_BIT = 1, ++}; ++ + int mlx5e_tc_init(struct mlx5e_priv *priv); + void mlx5e_tc_cleanup(struct mlx5e_priv *priv); + + int mlx5e_configure_flower(struct mlx5e_priv *priv, +- struct tc_cls_flower_offload *f); ++ struct tc_cls_flower_offload *f, int flags); + int mlx5e_delete_flower(struct 
mlx5e_priv *priv, +- struct tc_cls_flower_offload *f); ++ struct tc_cls_flower_offload *f, int flags); + + int mlx5e_stats_flower(struct mlx5e_priv *priv, +- struct tc_cls_flower_offload *f); ++ struct tc_cls_flower_offload *f, int flags); + + struct mlx5e_encap_entry; + void mlx5e_tc_encap_flows_add(struct mlx5e_priv *priv, diff --git a/queue-4.17/net-mlx5e-don-t-allow-arfs-for-encapsulated-packets.patch b/queue-4.17/net-mlx5e-don-t-allow-arfs-for-encapsulated-packets.patch new file mode 100644 index 00000000000..4713e6fb77e --- /dev/null +++ b/queue-4.17/net-mlx5e-don-t-allow-arfs-for-encapsulated-packets.patch @@ -0,0 +1,33 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Eran Ben Elisha +Date: Sun, 8 Jul 2018 14:52:12 +0300 +Subject: net/mlx5e: Don't allow aRFS for encapsulated packets + +From: Eran Ben Elisha + +[ Upstream commit d2e1c57bcf9a07cbb67f30ecf238f298799bce1c ] + +Driver is yet to support aRFS for encapsulated packets, return early +error in such case. + +Fixes: 18c908e477dc ("net/mlx5e: Add accelerated RFS support") +Signed-off-by: Eran Ben Elisha +Reviewed-by: Tariq Toukan +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +@@ -711,6 +711,9 @@ int mlx5e_rx_flow_steer(struct net_devic + skb->protocol != htons(ETH_P_IPV6)) + return -EPROTONOSUPPORT; + ++ if (skb->encapsulation) ++ return -EPROTONOSUPPORT; ++ + arfs_t = arfs_get_table(arfs, arfs_get_ip_proto(skb), skb->protocol); + if (!arfs_t) + return -EPROTONOSUPPORT; diff --git a/queue-4.17/net-mlx5e-fix-quota-counting-in-arfs-expire-flow.patch b/queue-4.17/net-mlx5e-fix-quota-counting-in-arfs-expire-flow.patch new file mode 100644 index 00000000000..9d3b0d3f71a --- /dev/null +++ b/queue-4.17/net-mlx5e-fix-quota-counting-in-arfs-expire-flow.patch @@ -0,0 +1,41 @@ 
+From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Eran Ben Elisha +Date: Sun, 8 Jul 2018 13:08:55 +0300 +Subject: net/mlx5e: Fix quota counting in aRFS expire flow + +From: Eran Ben Elisha + +[ Upstream commit 2630bae8018823c3b88788b69fb9f16ea3b4a11e ] + +Quota should follow the amount of rules which do expire, and not the +number of rules that were examined, fixed that. + +Fixes: 18c908e477dc ("net/mlx5e: Add accelerated RFS support") +Signed-off-by: Eran Ben Elisha +Reviewed-by: Maor Gottlieb +Reviewed-by: Tariq Toukan +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +@@ -381,14 +381,14 @@ static void arfs_may_expire_flow(struct + HLIST_HEAD(del_list); + spin_lock_bh(&priv->fs.arfs.arfs_lock); + mlx5e_for_each_arfs_rule(arfs_rule, htmp, priv->fs.arfs.arfs_tables, i, j) { +- if (quota++ > MLX5E_ARFS_EXPIRY_QUOTA) +- break; + if (!work_pending(&arfs_rule->arfs_work) && + rps_may_expire_flow(priv->netdev, + arfs_rule->rxq, arfs_rule->flow_id, + arfs_rule->filter_id)) { + hlist_del_init(&arfs_rule->hlist); + hlist_add_head(&arfs_rule->hlist, &del_list); ++ if (quota++ > MLX5E_ARFS_EXPIRY_QUOTA) ++ break; + } + } + spin_unlock_bh(&priv->fs.arfs.arfs_lock); diff --git a/queue-4.17/net-mlx5e-only-allow-offloading-decap-egress-egdev-flows.patch b/queue-4.17/net-mlx5e-only-allow-offloading-decap-egress-egdev-flows.patch new file mode 100644 index 00000000000..29549223330 --- /dev/null +++ b/queue-4.17/net-mlx5e-only-allow-offloading-decap-egress-egdev-flows.patch @@ -0,0 +1,39 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Roi Dayan +Date: Thu, 12 Jul 2018 18:25:59 +0300 +Subject: net/mlx5e: Only allow offloading decap egress (egdev) flows + +From: Roi Dayan + +[ Upstream commit 7e29392eee7a1e3318eeb1099807264a49f60e33 
] + +We get egress rules through the egdev mechanism when the ingress device +is not supporting offload, with the expected use-case of tunnel decap +ingress rule set on shared tunnel device. + +Make sure to offload egress/egdev rules only if decap action (tunnel key +unset) exists there and err otherwise. + +Fixes: 717503b9cf57 ("net: sched: convert cls_flower->egress_dev users to tc_setup_cb_egdev infra") +Signed-off-by: Roi Dayan +Signed-off-by: Paul Blakey +Reviewed-by: Or Gerlitz +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +@@ -1894,6 +1894,10 @@ static bool actions_match_supported(stru + else + actions = flow->nic_attr->action; + ++ if (flow->flags & MLX5E_TC_FLOW_EGRESS && ++ !(actions & MLX5_FLOW_CONTEXT_ACTION_DECAP)) ++ return false; ++ + if (actions & MLX5_FLOW_CONTEXT_ACTION_MOD_HDR) + return modify_header_match_supported(&parse_attr->spec, exts); + diff --git a/queue-4.17/net-mlx5e-refine-ets-validation-function.patch b/queue-4.17/net-mlx5e-refine-ets-validation-function.patch new file mode 100644 index 00000000000..4de326750c0 --- /dev/null +++ b/queue-4.17/net-mlx5e-refine-ets-validation-function.patch @@ -0,0 +1,74 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Shay Agroskin +Date: Wed, 27 Jun 2018 15:43:07 +0300 +Subject: net/mlx5e: Refine ets validation function + +From: Shay Agroskin + +[ Upstream commit e279d634f3d57452eb106a0c0e99a6add3fba1a6 ] + +Removed an error message received when configuring ETS total +bandwidth to be zero. +Our hardware doesn't support such configuration, so we shall +reject it in the driver. Nevertheless, we removed the error message +in order to eliminate error messages caused by old userspace tools +who try to pass such configuration. 
+ +Fixes: ff0891915cd7 ("net/mlx5e: Fix ETS BW check") +Signed-off-by: Shay Agroskin +Reviewed-by: Huy Nguyen +Reviewed-by: Eran Ben Elisha +Signed-off-by: Saeed Mahameed +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c | 17 ++++++++--------- + 1 file changed, 8 insertions(+), 9 deletions(-) + +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_dcbnl.c +@@ -272,7 +272,8 @@ int mlx5e_dcbnl_ieee_setets_core(struct + } + + static int mlx5e_dbcnl_validate_ets(struct net_device *netdev, +- struct ieee_ets *ets) ++ struct ieee_ets *ets, ++ bool zero_sum_allowed) + { + bool have_ets_tc = false; + int bw_sum = 0; +@@ -297,8 +298,9 @@ static int mlx5e_dbcnl_validate_ets(stru + } + + if (have_ets_tc && bw_sum != 100) { +- netdev_err(netdev, +- "Failed to validate ETS: BW sum is illegal\n"); ++ if (bw_sum || (!bw_sum && !zero_sum_allowed)) ++ netdev_err(netdev, ++ "Failed to validate ETS: BW sum is illegal\n"); + return -EINVAL; + } + return 0; +@@ -313,7 +315,7 @@ static int mlx5e_dcbnl_ieee_setets(struc + if (!MLX5_CAP_GEN(priv->mdev, ets)) + return -EOPNOTSUPP; + +- err = mlx5e_dbcnl_validate_ets(netdev, ets); ++ err = mlx5e_dbcnl_validate_ets(netdev, ets, false); + if (err) + return err; + +@@ -613,12 +615,9 @@ static u8 mlx5e_dcbnl_setall(struct net_ + ets.prio_tc[i]); + } + +- err = mlx5e_dbcnl_validate_ets(netdev, &ets); +- if (err) { +- netdev_err(netdev, +- "%s, Failed to validate ETS: %d\n", __func__, err); ++ err = mlx5e_dbcnl_validate_ets(netdev, &ets, true); ++ if (err) + goto out; +- } + + err = mlx5e_dcbnl_ieee_setets_core(priv, &ets); + if (err) { diff --git a/queue-4.17/net-next-hinic-fix-a-problem-in-hinic_xmit_frame.patch b/queue-4.17/net-next-hinic-fix-a-problem-in-hinic_xmit_frame.patch new file mode 100644 index 00000000000..82fe5b4a6f3 --- /dev/null +++ b/queue-4.17/net-next-hinic-fix-a-problem-in-hinic_xmit_frame.patch @@ -0,0 +1,37 @@ +From foo@baz Fri 
Jul 27 08:31:26 CEST 2018 +From: Zhao Chen +Date: Wed, 18 Jul 2018 00:33:18 -0400 +Subject: net-next/hinic: fix a problem in hinic_xmit_frame() + +From: Zhao Chen + +[ Upstream commit f7482683f1f4925c60941dbbd0813ceaa069d106 ] + +The calculation of "wqe_size" is not correct when the tx queue is busy in +hinic_xmit_frame(). + +When there are no free WQEs, the tx flow will unmap the skb buffer, then +ring the doorbell for the pending packets. But the "wqe_size" which is used +to calculate the doorbell address is not correct. The wqe size should be +cleared to 0, otherwise, it will cause a doorbell error. + +This patch fixes the problem. + +Reported-by: Zhou Wang +Signed-off-by: Zhao Chen +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/huawei/hinic/hinic_tx.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/drivers/net/ethernet/huawei/hinic/hinic_tx.c ++++ b/drivers/net/ethernet/huawei/hinic/hinic_tx.c +@@ -229,6 +229,7 @@ netdev_tx_t hinic_xmit_frame(struct sk_b + txq->txq_stats.tx_busy++; + u64_stats_update_end(&txq->txq_stats.syncp); + err = NETDEV_TX_BUSY; ++ wqe_size = 0; + goto flush_skbs; + } + diff --git a/queue-4.17/net-phy-consider-phy_ignore_interrupt-in-phy_start_aneg_priv.patch b/queue-4.17/net-phy-consider-phy_ignore_interrupt-in-phy_start_aneg_priv.patch new file mode 100644 index 00000000000..3bb5a093b39 --- /dev/null +++ b/queue-4.17/net-phy-consider-phy_ignore_interrupt-in-phy_start_aneg_priv.patch @@ -0,0 +1,32 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Heiner Kallweit +Date: Thu, 19 Jul 2018 08:15:16 +0200 +Subject: net: phy: consider PHY_IGNORE_INTERRUPT in phy_start_aneg_priv + +From: Heiner Kallweit + +[ Upstream commit 215d08a85b9acf5e1fe9dbf50f1774cde333efef ] + +The situation described in the comment can occur also with +PHY_IGNORE_INTERRUPT, therefore change the condition to include it. 
+ +Fixes: f555f34fdc58 ("net: phy: fix auto-negotiation stall due to unavailable interrupt") +Signed-off-by: Heiner Kallweit +Reviewed-by: Andrew Lunn +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/phy/phy.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/phy/phy.c ++++ b/drivers/net/phy/phy.c +@@ -514,7 +514,7 @@ static int phy_start_aneg_priv(struct ph + * negotiation may already be done and aneg interrupt may not be + * generated. + */ +- if (phy_interrupt_is_valid(phydev) && (phydev->state == PHY_AN)) { ++ if (phydev->irq != PHY_POLL && phydev->state == PHY_AN) { + err = phy_aneg_done(phydev); + if (err > 0) { + trigger = true; diff --git a/queue-4.17/net-skb_segment-should-not-return-null.patch b/queue-4.17/net-skb_segment-should-not-return-null.patch new file mode 100644 index 00000000000..8f2831a266d --- /dev/null +++ b/queue-4.17/net-skb_segment-should-not-return-null.patch @@ -0,0 +1,139 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Eric Dumazet +Date: Thu, 19 Jul 2018 16:04:38 -0700 +Subject: net: skb_segment() should not return NULL + +From: Eric Dumazet + +[ Upstream commit ff907a11a0d68a749ce1a321f4505c03bf72190c ] + +syzbot caught a NULL deref [1], caused by skb_segment() + +skb_segment() has many "goto err;" that assume the @err variable +contains -ENOMEM. + +A successful call to __skb_linearize() should not clear @err, +otherwise a subsequent memory allocation error could return NULL. + +While we are at it, we might use -EINVAL instead of -ENOMEM when +MAX_SKB_FRAGS limit is reached. 
+ +[1] +kasan: CONFIG_KASAN_INLINE enabled +kasan: GPF could be caused by NULL-ptr deref or user memory access +general protection fault: 0000 [#1] SMP KASAN +CPU: 0 PID: 13285 Comm: syz-executor3 Not tainted 4.18.0-rc4+ #146 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 +RIP: 0010:tcp_gso_segment+0x3dc/0x1780 net/ipv4/tcp_offload.c:106 +Code: f0 ff ff 0f 87 1c fd ff ff e8 00 88 0b fb 48 8b 75 d0 48 b9 00 00 00 00 00 fc ff df 48 8d be 90 00 00 00 48 89 f8 48 c1 e8 03 <0f> b6 14 08 48 8d 86 94 00 00 00 48 89 c6 83 e0 07 48 c1 ee 03 0f +RSP: 0018:ffff88019b7fd060 EFLAGS: 00010206 +RAX: 0000000000000012 RBX: 0000000000000020 RCX: dffffc0000000000 +RDX: 0000000000040000 RSI: 0000000000000000 RDI: 0000000000000090 +RBP: ffff88019b7fd0f0 R08: ffff88019510e0c0 R09: ffffed003b5c46d6 +R10: ffffed003b5c46d6 R11: ffff8801dae236b3 R12: 0000000000000001 +R13: ffff8801d6c581f4 R14: 0000000000000000 R15: ffff8801d6c58128 +FS: 00007fcae64d6700(0000) GS:ffff8801dae00000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00000000004e8664 CR3: 00000001b669b000 CR4: 00000000001406f0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +Call Trace: + tcp4_gso_segment+0x1c3/0x440 net/ipv4/tcp_offload.c:54 + inet_gso_segment+0x64e/0x12d0 net/ipv4/af_inet.c:1342 + inet_gso_segment+0x64e/0x12d0 net/ipv4/af_inet.c:1342 + skb_mac_gso_segment+0x3b5/0x740 net/core/dev.c:2792 + __skb_gso_segment+0x3c3/0x880 net/core/dev.c:2865 + skb_gso_segment include/linux/netdevice.h:4099 [inline] + validate_xmit_skb+0x640/0xf30 net/core/dev.c:3104 + __dev_queue_xmit+0xc14/0x3910 net/core/dev.c:3561 + dev_queue_xmit+0x17/0x20 net/core/dev.c:3602 + neigh_hh_output include/net/neighbour.h:473 [inline] + neigh_output include/net/neighbour.h:481 [inline] + ip_finish_output2+0x1063/0x1860 net/ipv4/ip_output.c:229 + ip_finish_output+0x841/0xfa0 
net/ipv4/ip_output.c:317 + NF_HOOK_COND include/linux/netfilter.h:276 [inline] + ip_output+0x223/0x880 net/ipv4/ip_output.c:405 + dst_output include/net/dst.h:444 [inline] + ip_local_out+0xc5/0x1b0 net/ipv4/ip_output.c:124 + iptunnel_xmit+0x567/0x850 net/ipv4/ip_tunnel_core.c:91 + ip_tunnel_xmit+0x1598/0x3af1 net/ipv4/ip_tunnel.c:778 + ipip_tunnel_xmit+0x264/0x2c0 net/ipv4/ipip.c:308 + __netdev_start_xmit include/linux/netdevice.h:4148 [inline] + netdev_start_xmit include/linux/netdevice.h:4157 [inline] + xmit_one net/core/dev.c:3034 [inline] + dev_hard_start_xmit+0x26c/0xc30 net/core/dev.c:3050 + __dev_queue_xmit+0x29ef/0x3910 net/core/dev.c:3569 + dev_queue_xmit+0x17/0x20 net/core/dev.c:3602 + neigh_direct_output+0x15/0x20 net/core/neighbour.c:1403 + neigh_output include/net/neighbour.h:483 [inline] + ip_finish_output2+0xa67/0x1860 net/ipv4/ip_output.c:229 + ip_finish_output+0x841/0xfa0 net/ipv4/ip_output.c:317 + NF_HOOK_COND include/linux/netfilter.h:276 [inline] + ip_output+0x223/0x880 net/ipv4/ip_output.c:405 + dst_output include/net/dst.h:444 [inline] + ip_local_out+0xc5/0x1b0 net/ipv4/ip_output.c:124 + ip_queue_xmit+0x9df/0x1f80 net/ipv4/ip_output.c:504 + tcp_transmit_skb+0x1bf9/0x3f10 net/ipv4/tcp_output.c:1168 + tcp_write_xmit+0x1641/0x5c20 net/ipv4/tcp_output.c:2363 + __tcp_push_pending_frames+0xb2/0x290 net/ipv4/tcp_output.c:2536 + tcp_push+0x638/0x8c0 net/ipv4/tcp.c:735 + tcp_sendmsg_locked+0x2ec5/0x3f00 net/ipv4/tcp.c:1410 + tcp_sendmsg+0x2f/0x50 net/ipv4/tcp.c:1447 + inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798 + sock_sendmsg_nosec net/socket.c:641 [inline] + sock_sendmsg+0xd5/0x120 net/socket.c:651 + __sys_sendto+0x3d7/0x670 net/socket.c:1797 + __do_sys_sendto net/socket.c:1809 [inline] + __se_sys_sendto net/socket.c:1805 [inline] + __x64_sys_sendto+0xe1/0x1a0 net/socket.c:1805 + do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290 + entry_SYSCALL_64_after_hwframe+0x49/0xbe +RIP: 0033:0x455ab9 +Code: 1d ba fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 
66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb b9 fb ff c3 66 2e 0f 1f 84 00 00 00 00 +RSP: 002b:00007fcae64d5c68 EFLAGS: 00000246 ORIG_RAX: 000000000000002c +RAX: ffffffffffffffda RBX: 00007fcae64d66d4 RCX: 0000000000455ab9 +RDX: 0000000000000001 RSI: 0000000020000200 RDI: 0000000000000013 +RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000014 +R13: 00000000004c1145 R14: 00000000004d1818 R15: 0000000000000006 +Modules linked in: +Dumping ftrace buffer: + (ftrace buffer empty) + +Fixes: ddff00d42043 ("net: Move skb_has_shared_frag check out of GRE code and into segmentation") +Signed-off-by: Eric Dumazet +Cc: Alexander Duyck +Reported-by: syzbot +Acked-by: Alexander Duyck +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/skbuff.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/net/core/skbuff.c ++++ b/net/core/skbuff.c +@@ -3705,6 +3705,7 @@ normal: + net_warn_ratelimited( + "skb_segment: too many frags: %u %u\n", + pos, mss); ++ err = -EINVAL; + goto err; + } + +@@ -3738,11 +3739,10 @@ skip_fraglist: + + perform_csum_check: + if (!csum) { +- if (skb_has_shared_frag(nskb)) { +- err = __skb_linearize(nskb); +- if (err) +- goto err; +- } ++ if (skb_has_shared_frag(nskb) && ++ __skb_linearize(nskb)) ++ goto err; ++ + if (!nskb->remcsum_offload) + nskb->ip_summed = CHECKSUM_NONE; + SKB_GSO_CB(nskb)->csum = diff --git a/queue-4.17/nfp-flower-ensure-dead-neighbour-entries-are-not-offloaded.patch b/queue-4.17/nfp-flower-ensure-dead-neighbour-entries-are-not-offloaded.patch new file mode 100644 index 00000000000..829cf36041e --- /dev/null +++ b/queue-4.17/nfp-flower-ensure-dead-neighbour-entries-are-not-offloaded.patch @@ -0,0 +1,37 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: John Hurley +Date: Fri, 20 Jul 2018 21:07:54 -0700 +Subject: nfp: flower: ensure dead 
neighbour entries are not offloaded + +From: John Hurley + +[ Upstream commit b809ec869b2cf2af053ffd99e5a46ab600e94aa2 ] + +Previously only the neighbour state was checked to decide if an offloaded +entry should be removed. However, there can be situations when the entry +is dead but still marked as valid. This can lead to dead entries not +being removed from fw tables or even incorrect data being added. + +Check the entry dead bit before deciding if it should be added to or +removed from fw neighbour tables. + +Fixes: 8e6a9046b66a ("nfp: flower vxlan neighbour offload") +Signed-off-by: John Hurley +Reviewed-by: Jakub Kicinski +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c ++++ b/drivers/net/ethernet/netronome/nfp/flower/tunnel_conf.c +@@ -317,7 +317,7 @@ nfp_tun_write_neigh(struct net_device *n + payload.dst_ipv4 = flow->daddr; + + /* If entry has expired send dst IP with all other fields 0. */ +- if (!(neigh->nud_state & NUD_VALID)) { ++ if (!(neigh->nud_state & NUD_VALID) || neigh->dead) { + nfp_tun_del_route_from_cache(app, payload.dst_ipv4); + /* Trigger ARP to verify invalid neighbour state. 
*/ + neigh_event_send(neigh, NULL); diff --git a/queue-4.17/r8169-restore-previous-behavior-to-accept-bios-wol-settings.patch b/queue-4.17/r8169-restore-previous-behavior-to-accept-bios-wol-settings.patch new file mode 100644 index 00000000000..b38990a50e5 --- /dev/null +++ b/queue-4.17/r8169-restore-previous-behavior-to-accept-bios-wol-settings.patch @@ -0,0 +1,40 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Heiner Kallweit +Date: Tue, 24 Jul 2018 22:21:04 +0200 +Subject: r8169: restore previous behavior to accept BIOS WoL settings + +From: Heiner Kallweit + +[ Upstream commit 18041b523692038d41751fd8046638c356d77a36 ] + +Commit 7edf6d314cd0 tried to resolve an inconsistency (BIOS WoL +settings are accepted, but device isn't wakeup-enabled) resulting +from a previous broken-BIOS workaround by making disabled WoL the +default. +This however had some side effects, most likely due to a broken BIOS +some systems don't properly resume from suspend when the MagicPacket +WoL bit isn't set in the chip, see +https://bugzilla.kernel.org/show_bug.cgi?id=200195 +Therefore restore the WoL behavior from 4.16. + +Reported-by: Albert Astals Cid +Fixes: 7edf6d314cd0 ("r8169: disable WOL per default") +Signed-off-by: Heiner Kallweit +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/realtek/r8169.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/drivers/net/ethernet/realtek/r8169.c ++++ b/drivers/net/ethernet/realtek/r8169.c +@@ -8272,8 +8272,7 @@ static int rtl_init_one(struct pci_dev * + return rc; + } + +- /* override BIOS settings, use userspace tools to enable WOL */ +- __rtl8169_set_wol(tp, 0); ++ tp->saved_wolopts = __rtl8169_get_wol(tp); + + if (rtl_tbi_enabled(tp)) { + tp->set_speed = rtl8169_set_speed_tbi; diff --git a/queue-4.17/rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch b/queue-4.17/rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch new file mode 100644 index 00000000000..fb62fd83ff8 --- /dev/null +++ b/queue-4.17/rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch @@ -0,0 +1,65 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Roopa Prabhu +Date: Fri, 20 Jul 2018 13:21:01 -0700 +Subject: rtnetlink: add rtnl_link_state check in rtnl_configure_link + +From: Roopa Prabhu + +[ Upstream commit 5025f7f7d506fba9b39e7fe8ca10f6f34cb9bc2d ] + +rtnl_configure_link sets dev->rtnl_link_state to +RTNL_LINK_INITIALIZED and unconditionally calls +__dev_notify_flags to notify user-space of dev flags. + +current call sequence for rtnl_configure_link +rtnetlink_newlink + rtnl_link_ops->newlink + rtnl_configure_link (unconditionally notifies userspace of + default and new dev flags) + +If a newlink handler wants to call rtnl_configure_link +early, we will end up with duplicate notifications to +user-space. + +This patch fixes rtnl_configure_link to check rtnl_link_state +and call __dev_notify_flags with gchanges = 0 if already +RTNL_LINK_INITIALIZED. + +Later in the series, this patch will help the following sequence +where a driver implementing newlink can call rtnl_configure_link +to initialize the link early. 
+ +makes the following call sequence work: +rtnetlink_newlink + rtnl_link_ops->newlink (vxlan) -> rtnl_configure_link (initializes + link and notifies + user-space of default + dev flags) + rtnl_configure_link (updates dev flags if requested by user ifm + and notifies user-space of new dev flags) + +Signed-off-by: Roopa Prabhu +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/rtnetlink.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +--- a/net/core/rtnetlink.c ++++ b/net/core/rtnetlink.c +@@ -2749,9 +2749,12 @@ int rtnl_configure_link(struct net_devic + return err; + } + +- dev->rtnl_link_state = RTNL_LINK_INITIALIZED; +- +- __dev_notify_flags(dev, old_flags, ~0U); ++ if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { ++ __dev_notify_flags(dev, old_flags, 0U); ++ } else { ++ dev->rtnl_link_state = RTNL_LINK_INITIALIZED; ++ __dev_notify_flags(dev, old_flags, ~0U); ++ } + return 0; + } + EXPORT_SYMBOL(rtnl_configure_link); diff --git a/queue-4.17/series b/queue-4.17/series index ae69565c2d7..5bf12943b4f 100644 --- a/queue-4.17/series +++ b/queue-4.17/series @@ -9,3 +9,38 @@ xen-pvh-set-up-gs-segment-for-stack-canary.patch kvm-ppc-check-if-iommu-page-is-contained-in-the-pinned-physical-page.patch drm-nouveau-drm-nouveau-fix-runtime-pm-leak-in-nv50_disp_atomic_commit.patch drm-nouveau-set-driver_atomic-cap-earlier-to-fix-debugfs.patch +clk-meson-gxbb-set-fclk_div2-as-clk_is_critical.patch +bonding-set-default-miimon-value-for-non-arp-modes-if-not-set.patch +ip-hash-fragments-consistently.patch +ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch +net-dsa-mv88e6xxx-fix-races-between-lock-and-irq-freeing.patch +net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch +net-next-hinic-fix-a-problem-in-hinic_xmit_frame.patch +net-skb_segment-should-not-return-null.patch +tcp-fix-dctcp-delayed-ack-schedule.patch +tcp-helpers-to-send-special-dctcp-ack.patch 
+tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch +tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch +net-mlx5-e-switch-ubsan-fix-undefined-behavior-in-mlx5_eswitch_mode.patch +r8169-restore-previous-behavior-to-accept-bios-wol-settings.patch +tls-check-rcv_shutdown-in-tls_wait_data.patch +net-mlx5e-add-ingress-egress-indication-for-offloaded-tc-flows.patch +net-mlx5e-only-allow-offloading-decap-egress-egdev-flows.patch +net-mlx5e-refine-ets-validation-function.patch +nfp-flower-ensure-dead-neighbour-entries-are-not-offloaded.patch +sock-fix-sg-page-frag-coalescing-in-sk_alloc_sg.patch +net-phy-consider-phy_ignore_interrupt-in-phy_start_aneg_priv.patch +multicast-do-not-restore-deleted-record-source-filter-mode-to-new-one.patch +net-ipv6-fix-linklocal-to-global-address-with-vrf.patch +net-mlx5e-don-t-allow-arfs-for-encapsulated-packets.patch +net-mlx5e-fix-quota-counting-in-arfs-expire-flow.patch +net-mlx5-adjust-clock-overflow-work-period.patch +rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch +vxlan-add-new-fdb-alloc-and-create-helpers.patch +vxlan-make-netlink-notify-in-vxlan_fdb_destroy-optional.patch +vxlan-fix-default-fdb-entry-netlink-notify-ordering-during-netdev-create.patch +tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch +tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch +tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch +tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch +tcp-add-tcp_ooo_try_coalesce-helper.patch diff --git a/queue-4.17/sock-fix-sg-page-frag-coalescing-in-sk_alloc_sg.patch b/queue-4.17/sock-fix-sg-page-frag-coalescing-in-sk_alloc_sg.patch new file mode 100644 index 00000000000..47df2668f92 --- /dev/null +++ b/queue-4.17/sock-fix-sg-page-frag-coalescing-in-sk_alloc_sg.patch @@ -0,0 +1,40 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Daniel Borkmann +Date: Mon, 23 Jul 2018 22:37:54 +0200 +Subject: sock: fix sg page frag coalescing in sk_alloc_sg + +From: Daniel Borkmann + +[ 
Upstream commit 144fe2bfd236dc814eae587aea7e2af03dbdd755 ] + +Current sg coalescing logic in sk_alloc_sg() (latter is used by tls and +sockmap) is not quite correct in that we do fetch the previous sg entry, +however the subsequent check whether the refilled page frag from the +socket is still the same as from the last entry with prior offset and +length matching the start of the current buffer is comparing always the +first sg list entry instead of the prior one. + +Fixes: 3c4d7559159b ("tls: kernel TLS support") +Signed-off-by: Daniel Borkmann +Acked-by: Dave Watson +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/sock.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -2270,9 +2270,9 @@ int sk_alloc_sg(struct sock *sk, int len + pfrag->offset += use; + + sge = sg + sg_curr - 1; +- if (sg_curr > first_coalesce && sg_page(sg) == pfrag->page && +- sg->offset + sg->length == orig_offset) { +- sg->length += use; ++ if (sg_curr > first_coalesce && sg_page(sge) == pfrag->page && ++ sge->offset + sge->length == orig_offset) { ++ sge->length += use; + } else { + sge = sg + sg_curr; + sg_unmark_end(sge); diff --git a/queue-4.17/tcp-add-tcp_ooo_try_coalesce-helper.patch b/queue-4.17/tcp-add-tcp_ooo_try_coalesce-helper.patch new file mode 100644 index 00000000000..a8ec886f27d --- /dev/null +++ b/queue-4.17/tcp-add-tcp_ooo_try_coalesce-helper.patch @@ -0,0 +1,74 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Eric Dumazet +Date: Mon, 23 Jul 2018 09:28:21 -0700 +Subject: tcp: add tcp_ooo_try_coalesce() helper + +From: Eric Dumazet + +[ Upstream commit 58152ecbbcc6a0ce7fddd5bf5f6ee535834ece0c ] + +In case skb in out_or_order_queue is the result of +multiple skbs coalescing, we would like to get a proper gso_segs +counter tracking, so that future tcp_drop() can report an accurate +number. 
+ +I chose to not implement this tracking for skbs in receive queue, +since they are not dropped, unless socket is disconnected. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Acked-by: Yuchung Cheng +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 25 +++++++++++++++++++++---- + 1 file changed, 21 insertions(+), 4 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4299,6 +4299,23 @@ static bool tcp_try_coalesce(struct sock + return true; + } + ++static bool tcp_ooo_try_coalesce(struct sock *sk, ++ struct sk_buff *to, ++ struct sk_buff *from, ++ bool *fragstolen) ++{ ++ bool res = tcp_try_coalesce(sk, to, from, fragstolen); ++ ++ /* In case tcp_drop() is called later, update to->gso_segs */ ++ if (res) { ++ u32 gso_segs = max_t(u16, 1, skb_shinfo(to)->gso_segs) + ++ max_t(u16, 1, skb_shinfo(from)->gso_segs); ++ ++ skb_shinfo(to)->gso_segs = min_t(u32, gso_segs, 0xFFFF); ++ } ++ return res; ++} ++ + static void tcp_drop(struct sock *sk, struct sk_buff *skb) + { + sk_drops_add(sk, skb); +@@ -4422,8 +4439,8 @@ static void tcp_data_queue_ofo(struct so + /* In the typical case, we are adding an skb to the end of the list. + * Use of ooo_last_skb avoids the O(Log(N)) rbtree lookup. 
+ */ +- if (tcp_try_coalesce(sk, tp->ooo_last_skb, +- skb, &fragstolen)) { ++ if (tcp_ooo_try_coalesce(sk, tp->ooo_last_skb, ++ skb, &fragstolen)) { + coalesce_done: + tcp_grow_window(sk, skb); + kfree_skb_partial(skb, fragstolen); +@@ -4473,8 +4490,8 @@ coalesce_done: + tcp_drop(sk, skb1); + goto merge_right; + } +- } else if (tcp_try_coalesce(sk, skb1, +- skb, &fragstolen)) { ++ } else if (tcp_ooo_try_coalesce(sk, skb1, ++ skb, &fragstolen)) { + goto coalesce_done; + } + p = &parent->rb_right; diff --git a/queue-4.17/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch b/queue-4.17/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch new file mode 100644 index 00000000000..56b9c8a9b41 --- /dev/null +++ b/queue-4.17/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch @@ -0,0 +1,46 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Eric Dumazet +Date: Mon, 23 Jul 2018 09:28:18 -0700 +Subject: tcp: avoid collapses in tcp_prune_queue() if possible + +From: Eric Dumazet + +[ Upstream commit f4a3313d8e2ca9fd8d8f45e40a2903ba782607e7 ] + +Right after a TCP flow is created, receiving tiny out of order +packets always hit the condition : + +if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + tcp_clamp_window(sk); + +tcp_clamp_window() increases sk_rcvbuf to match sk_rmem_alloc +(guarded by tcp_rmem[2]) + +Calling tcp_collapse_ofo_queue() in this case is not useful, +and offers an O(N^2) attack surface to malicious peers. + +Better not attempt anything before full queue capacity is reached, +forcing attacker to spend lots of resource and allow us to more +easily detect the abuse. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Acked-by: Yuchung Cheng +Signed-off-by: David S.
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4936,6 +4936,9 @@ static int tcp_prune_queue(struct sock * + else if (tcp_under_memory_pressure(sk)) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + ++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) ++ return 0; ++ + tcp_collapse_ofo_queue(sk); + if (!skb_queue_empty(&sk->sk_receive_queue)) + tcp_collapse(sk, &sk->sk_receive_queue, NULL, diff --git a/queue-4.17/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch b/queue-4.17/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch new file mode 100644 index 00000000000..02821ebff1c --- /dev/null +++ b/queue-4.17/tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch @@ -0,0 +1,42 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Eric Dumazet +Date: Mon, 23 Jul 2018 09:28:20 -0700 +Subject: tcp: call tcp_drop() from tcp_data_queue_ofo() + +From: Eric Dumazet + +[ Upstream commit 8541b21e781a22dce52a74fef0b9bed00404a1cd ] + +In order to be able to give better diagnostics and detect +malicious traffic, we need to have better sk->sk_drops tracking. + +Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue") +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Acked-by: Yuchung Cheng +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4451,7 +4451,7 @@ coalesce_done: + /* All the bits are present. Drop. 
*/ + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); +- __kfree_skb(skb); ++ tcp_drop(sk, skb); + skb = NULL; + tcp_dsack_set(sk, seq, end_seq); + goto add_sack; +@@ -4470,7 +4470,7 @@ coalesce_done: + TCP_SKB_CB(skb1)->end_seq); + NET_INC_STATS(sock_net(sk), + LINUX_MIB_TCPOFOMERGE); +- __kfree_skb(skb1); ++ tcp_drop(sk, skb1); + goto merge_right; + } + } else if (tcp_try_coalesce(sk, skb1, diff --git a/queue-4.17/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch b/queue-4.17/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch new file mode 100644 index 00000000000..0993a8ba1eb --- /dev/null +++ b/queue-4.17/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch @@ -0,0 +1,72 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Eric Dumazet +Date: Mon, 23 Jul 2018 09:28:19 -0700 +Subject: tcp: detect malicious patterns in tcp_collapse_ofo_queue() + +From: Eric Dumazet + +[ Upstream commit 3d4bf93ac12003f9b8e1e2de37fe27983deebdcf ] + +In case an attacker feeds tiny packets completely out of order, +tcp_collapse_ofo_queue() might scan the whole rb-tree, performing +expensive copies, but not changing socket memory usage at all. + +1) Do not attempt to collapse tiny skbs. +2) Add logic to exit early when too many tiny skbs are detected. + +We prefer not doing aggressive collapsing (which copies packets) +for pathological flows, and revert to tcp_prune_ofo_queue() which +will be less expensive. + +In the future, we might add the possibility of terminating flows +that are proven to be malicious. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 15 +++++++++++++-- + 1 file changed, 13 insertions(+), 2 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4834,6 +4834,7 @@ end: + static void tcp_collapse_ofo_queue(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); ++ u32 range_truesize, sum_tiny = 0; + struct sk_buff *skb, *head; + u32 start, end; + +@@ -4845,6 +4846,7 @@ new_range: + } + start = TCP_SKB_CB(skb)->seq; + end = TCP_SKB_CB(skb)->end_seq; ++ range_truesize = skb->truesize; + + for (head = skb;;) { + skb = skb_rb_next(skb); +@@ -4855,11 +4857,20 @@ new_range: + if (!skb || + after(TCP_SKB_CB(skb)->seq, end) || + before(TCP_SKB_CB(skb)->end_seq, start)) { +- tcp_collapse(sk, NULL, &tp->out_of_order_queue, +- head, skb, start, end); ++ /* Do not attempt collapsing tiny skbs */ ++ if (range_truesize != head->truesize || ++ end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { ++ tcp_collapse(sk, NULL, &tp->out_of_order_queue, ++ head, skb, start, end); ++ } else { ++ sum_tiny += range_truesize; ++ if (sum_tiny > sk->sk_rcvbuf >> 3) ++ return; ++ } + goto new_range; + } + ++ range_truesize += skb->truesize; + if (unlikely(before(TCP_SKB_CB(skb)->seq, start))) + start = TCP_SKB_CB(skb)->seq; + if (after(TCP_SKB_CB(skb)->end_seq, end)) diff --git a/queue-4.17/tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch b/queue-4.17/tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch new file mode 100644 index 00000000000..0bf51684038 --- /dev/null +++ b/queue-4.17/tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch @@ -0,0 +1,138 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Yuchung Cheng +Date: Wed, 18 Jul 2018 13:56:35 -0700 +Subject: tcp: do not cancel delay-AcK on DCTCP special ACK + +From: Yuchung Cheng + +[ Upstream commit 27cde44a259c380a3c09066fc4b42de7dde9b1ad ] + +Currently when a DCTCP receiver delays an ACK and receive a +data packet with a different CE mark from the previous one's, it 
+sends two immediate ACKs acking previous and latest sequences +respectively (for ECN accounting). + +Previously sending the first ACK may mark off the delayed ACK timer +(tcp_event_ack_sent). This may subsequently prevent sending the +second ACK to acknowledge the latest sequence (tcp_ack_snd_check). +The culprit is that tcp_send_ack() assumes it always acknowledges +the latest sequence, which is not true for the first special ACK. + +The fix is to not make the assumption in tcp_send_ack and check the +actual ack sequence before cancelling the delayed ACK. Further it's +safer to pass the ack sequence number as a local variable into +tcp_send_ack routine, instead of intercepting tp->rcv_nxt to avoid +future bugs like this. + +Reported-by: Neal Cardwell +Signed-off-by: Yuchung Cheng +Acked-by: Neal Cardwell +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/tcp.h | 1 + + net/ipv4/tcp_dctcp.c | 34 ++++------------------------------ + net/ipv4/tcp_output.c | 11 ++++++++--- + 3 files changed, 13 insertions(+), 33 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -535,6 +535,7 @@ void tcp_send_fin(struct sock *sk); + void tcp_send_active_reset(struct sock *sk, gfp_t priority); + int tcp_send_synack(struct sock *); + void tcp_push_one(struct sock *, unsigned int mss_now); ++void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); + void tcp_send_ack(struct sock *sk); + void tcp_send_delayed_ack(struct sock *sk); + void tcp_send_loss_probe(struct sock *sk); +--- a/net/ipv4/tcp_dctcp.c ++++ b/net/ipv4/tcp_dctcp.c +@@ -135,21 +135,8 @@ static void dctcp_ce_state_0_to_1(struct + * ACK has not sent yet. + */ + if (!ca->ce_state && +- inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { +- u32 tmp_rcv_nxt; +- +- /* Save current rcv_nxt. */ +- tmp_rcv_nxt = tp->rcv_nxt; +- +- /* Generate previous ack with CE=0.
*/ +- tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +- tp->rcv_nxt = ca->prior_rcv_nxt; +- +- tcp_send_ack(sk); +- +- /* Recover current rcv_nxt. */ +- tp->rcv_nxt = tmp_rcv_nxt; +- } ++ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) ++ __tcp_send_ack(sk, ca->prior_rcv_nxt); + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 1; +@@ -166,21 +153,8 @@ static void dctcp_ce_state_1_to_0(struct + * ACK has not sent yet. + */ + if (ca->ce_state && +- inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { +- u32 tmp_rcv_nxt; +- +- /* Save current rcv_nxt. */ +- tmp_rcv_nxt = tp->rcv_nxt; +- +- /* Generate previous ack with CE=1. */ +- tp->ecn_flags |= TCP_ECN_DEMAND_CWR; +- tp->rcv_nxt = ca->prior_rcv_nxt; +- +- tcp_send_ack(sk); +- +- /* Recover current rcv_nxt. */ +- tp->rcv_nxt = tmp_rcv_nxt; +- } ++ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) ++ __tcp_send_ack(sk, ca->prior_rcv_nxt); + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 0; +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -160,8 +160,13 @@ static void tcp_event_data_sent(struct t + } + + /* Account for an ACK we sent. */ +-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) ++static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, ++ u32 rcv_nxt) + { ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (unlikely(rcv_nxt != tp->rcv_nxt)) ++ return; /* Special ACK sent by DCTCP to reflect ECN */ + tcp_dec_quickack_mode(sk, pkts); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); + } +@@ -1149,7 +1154,7 @@ static int __tcp_transmit_skb(struct soc + icsk->icsk_af_ops->send_check(sk, skb); + + if (likely(tcb->tcp_flags & TCPHDR_ACK)) +- tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); ++ tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); + + if (skb->len != tcp_header_size) { + tcp_event_data_sent(tp, sk); +@@ -3627,12 +3632,12 @@ void __tcp_send_ack(struct sock *sk, u32 + /* Send it off, this clears delayed acks for us. 
*/ + __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt); + } ++EXPORT_SYMBOL_GPL(__tcp_send_ack); + + void tcp_send_ack(struct sock *sk) + { + __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); + } +-EXPORT_SYMBOL_GPL(tcp_send_ack); + + /* This routine sends a packet with an out of date sequence + * number. It assumes the other end will try to ack it. diff --git a/queue-4.17/tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch b/queue-4.17/tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch new file mode 100644 index 00000000000..2619d45fda0 --- /dev/null +++ b/queue-4.17/tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch @@ -0,0 +1,138 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Yuchung Cheng +Date: Wed, 18 Jul 2018 13:56:36 -0700 +Subject: tcp: do not delay ACK in DCTCP upon CE status change + +From: Yuchung Cheng + +[ Upstream commit a0496ef2c23b3b180902dd185d0d63ccbc624cf8 ] + +Per DCTCP RFC8257 (Section 3.2) the ACK reflecting the CE status change +has to be sent immediately so the sender can respond quickly: + +""" When receiving packets, the CE codepoint MUST be processed as follows: + + 1. If the CE codepoint is set and DCTCP.CE is false, set DCTCP.CE to + true and send an immediate ACK. + + 2. If the CE codepoint is not set and DCTCP.CE is true, set DCTCP.CE + to false and send an immediate ACK. +""" + +Previously DCTCP implementation may continue to delay the ACK. This +patch fixes that to implement the RFC by forcing an immediate ACK. + +Tested with this packetdrill script provided by Larry Brakmo + +0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0 +0.000 bind(3, ..., ...) = 0 +0.000 listen(3, 1) = 0 + +0.100 < [ect0] SEW 0:0(0) win 32792 +0.100 > SE. 0:0(0) ack 1 +0.110 < [ect0] . 1:1(0) ack 1 win 257 +0.200 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_DEBUG, [1], 4) = 0 + +0.200 < [ect0] . 
1:1001(1000) ack 1 win 257 +0.200 > [ect01] . 1:1(0) ack 1001 + +0.200 write(4, ..., 1) = 1 +0.200 > [ect01] P. 1:2(1) ack 1001 + +0.200 < [ect0] . 1001:2001(1000) ack 2 win 257 ++0.005 < [ce] . 2001:3001(1000) ack 2 win 257 + ++0.000 > [ect01] . 2:2(0) ack 2001 +// Previously the ACK below would be delayed by 40ms ++0.000 > [ect01] E. 2:2(0) ack 3001 + ++0.500 < F. 9501:9501(0) ack 4 win 257 + +Signed-off-by: Yuchung Cheng +Acked-by: Neal Cardwell +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/tcp.h | 1 + + net/ipv4/tcp_dctcp.c | 30 ++++++++++++++++++------------ + net/ipv4/tcp_input.c | 3 ++- + 3 files changed, 21 insertions(+), 13 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -342,6 +342,7 @@ ssize_t tcp_splice_read(struct socket *s + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); + ++void tcp_enter_quickack_mode(struct sock *sk); + static inline void tcp_dec_quickack_mode(struct sock *sk, + const unsigned int pkts) + { +--- a/net/ipv4/tcp_dctcp.c ++++ b/net/ipv4/tcp_dctcp.c +@@ -131,12 +131,15 @@ static void dctcp_ce_state_0_to_1(struct + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + +- /* State has changed from CE=0 to CE=1 and delayed +- * ACK has not sent yet. +- */ +- if (!ca->ce_state && +- inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) +- __tcp_send_ack(sk, ca->prior_rcv_nxt); ++ if (!ca->ce_state) { ++ /* State has changed from CE=0 to CE=1, force an immediate ++ * ACK to reflect the new CE state. If an ACK was delayed, ++ * send that first to reflect the prior CE state. 
++ */ ++ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) ++ __tcp_send_ack(sk, ca->prior_rcv_nxt); ++ tcp_enter_quickack_mode(sk); ++ } + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 1; +@@ -149,12 +152,15 @@ static void dctcp_ce_state_1_to_0(struct + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + +- /* State has changed from CE=1 to CE=0 and delayed +- * ACK has not sent yet. +- */ +- if (ca->ce_state && +- inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) +- __tcp_send_ack(sk, ca->prior_rcv_nxt); ++ if (ca->ce_state) { ++ /* State has changed from CE=1 to CE=0, force an immediate ++ * ACK to reflect the new CE state. If an ACK was delayed, ++ * send that first to reflect the prior CE state. ++ */ ++ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) ++ __tcp_send_ack(sk, ca->prior_rcv_nxt); ++ tcp_enter_quickack_mode(sk); ++ } + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 0; +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -195,13 +195,14 @@ static void tcp_incr_quickack(struct soc + icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); + } + +-static void tcp_enter_quickack_mode(struct sock *sk) ++void tcp_enter_quickack_mode(struct sock *sk) + { + struct inet_connection_sock *icsk = inet_csk(sk); + tcp_incr_quickack(sk); + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; + } ++EXPORT_SYMBOL(tcp_enter_quickack_mode); + + /* Send ACKs quickly, if "quick" count is not exhausted + * and the session is not interactive. 
diff --git a/queue-4.17/tcp-fix-dctcp-delayed-ack-schedule.patch b/queue-4.17/tcp-fix-dctcp-delayed-ack-schedule.patch new file mode 100644 index 00000000000..bea63523725 --- /dev/null +++ b/queue-4.17/tcp-fix-dctcp-delayed-ack-schedule.patch @@ -0,0 +1,98 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Yuchung Cheng +Date: Thu, 12 Jul 2018 06:04:52 -0700 +Subject: tcp: fix dctcp delayed ACK schedule + +From: Yuchung Cheng + +[ Upstream commit b0c05d0e99d98d7f0cd41efc1eeec94efdc3325d ] + +Previously, when a data segment was sent an ACK was piggybacked +on the data segment without generating a CA_EVENT_NON_DELAYED_ACK +event to notify congestion control modules. So the DCTCP +ca->delayed_ack_reserved flag could incorrectly stay set when +in fact there were no delayed ACKs being reserved. This could result +in sending a special ECN notification ACK that carries an older +ACK sequence, when in fact there was no need for such an ACK. +DCTCP keeps track of the delayed ACK status with its own separate +state ca->delayed_ack_reserved. Previously it may accidentally cancel +the delayed ACK without updating this field upon sending a special +ACK that carries an older ACK sequence. This inconsistency would +lead to DCTCP receiver never acknowledging the latest data until the +sender times out and retries in some cases. + +Packetdrill script (provided by Larry Brakmo) + +0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0 +0.000 bind(3, ..., ...) = 0 +0.000 listen(3, 1) = 0 + +0.100 < [ect0] SEW 0:0(0) win 32792 +0.100 > SE. 0:0(0) ack 1 +0.110 < [ect0] . 1:1(0) ack 1 win 257 +0.200 accept(3, ..., ...) = 4 + +0.200 < [ect0] . 1:1001(1000) ack 1 win 257 +0.200 > [ect01] . 1:1(0) ack 1001 + +0.200 write(4, ..., 1) = 1 +0.200 > [ect01] P. 1:2(1) ack 1001 + +0.200 < [ect0] . 1001:2001(1000) ack 2 win 257 +0.200 write(4, ..., 1) = 1 +0.200 > [ect01] P.
2:3(1) ack 2001 + +0.200 < [ect0] . 2001:3001(1000) ack 3 win 257 +0.200 < [ect0] . 3001:4001(1000) ack 3 win 257 +0.200 > [ect01] . 3:3(0) ack 4001 + +0.210 < [ce] P. 4001:4501(500) ack 3 win 257 + ++0.001 read(4, ..., 4500) = 4500 ++0 write(4, ..., 1) = 1 ++0 > [ect01] PE. 3:4(1) ack 4501 + ++0.010 < [ect0] W. 4501:5501(1000) ack 4 win 257 +// Previously the ACK sequence below would be 4501, causing a long RTO ++0.040~+0.045 > [ect01] . 4:4(0) ack 5501 // delayed ack + ++0.311 < [ect0] . 5501:6501(1000) ack 4 win 257 // More data ++0 > [ect01] . 4:4(0) ack 6501 // now acks everything + ++0.500 < F. 9501:9501(0) ack 4 win 257 + +Reported-by: Larry Brakmo +Signed-off-by: Yuchung Cheng +Signed-off-by: Eric Dumazet +Acked-by: Neal Cardwell +Acked-by: Lawrence Brakmo +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_dctcp.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/net/ipv4/tcp_dctcp.c ++++ b/net/ipv4/tcp_dctcp.c +@@ -134,7 +134,8 @@ static void dctcp_ce_state_0_to_1(struct + /* State has changed from CE=0 to CE=1 and delayed + * ACK has not sent yet. + */ +- if (!ca->ce_state && ca->delayed_ack_reserved) { ++ if (!ca->ce_state && ++ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { + u32 tmp_rcv_nxt; + + /* Save current rcv_nxt. */ +@@ -164,7 +165,8 @@ static void dctcp_ce_state_1_to_0(struct + /* State has changed from CE=1 to CE=0 and delayed + * ACK has not sent yet. + */ +- if (ca->ce_state && ca->delayed_ack_reserved) { ++ if (ca->ce_state && ++ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { + u32 tmp_rcv_nxt; + + /* Save current rcv_nxt. 
*/ diff --git a/queue-4.17/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch b/queue-4.17/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch new file mode 100644 index 00000000000..efdd47d21b4 --- /dev/null +++ b/queue-4.17/tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch @@ -0,0 +1,76 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Eric Dumazet +Date: Mon, 23 Jul 2018 09:28:17 -0700 +Subject: tcp: free batches of packets in tcp_prune_ofo_queue() + +From: Eric Dumazet + +[ Upstream commit 72cd43ba64fc172a443410ce01645895850844c8 ] + +Juha-Matti Tilli reported that malicious peers could inject tiny +packets in out_of_order_queue, forcing very expensive calls +to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for +every incoming packet. out_of_order_queue rb-tree can contain +thousands of nodes, iterating over all of them is not nice. + +Before linux-4.9, we would have pruned all packets in ofo_queue +in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skbs +truesize, but is about 7000 packets with tcp_rmem[2] default of 6 MB. + +Since we plan to increase tcp_rmem[2] in the future to cope with +modern BDP, can not revert to the old behavior, without great pain. + +Strategy taken in this patch is to purge ~12.5 % of the queue capacity. + +Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets") +Signed-off-by: Eric Dumazet +Reported-by: Juha-Matti Tilli +Acked-by: Yuchung Cheng +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 15 +++++++++++---- + 1 file changed, 11 insertions(+), 4 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4874,6 +4874,7 @@ new_range: + * 2) not add too big latencies if thousands of packets sit there. + * (But if application shrinks SO_RCVBUF, we could still end up + * freeing whole queue here) ++ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks. 
+ * + * Return true if queue has shrunk. + */ +@@ -4881,20 +4882,26 @@ static bool tcp_prune_ofo_queue(struct s + { + struct tcp_sock *tp = tcp_sk(sk); + struct rb_node *node, *prev; ++ int goal; + + if (RB_EMPTY_ROOT(&tp->out_of_order_queue)) + return false; + + NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED); ++ goal = sk->sk_rcvbuf >> 3; + node = &tp->ooo_last_skb->rbnode; + do { + prev = rb_prev(node); + rb_erase(node, &tp->out_of_order_queue); ++ goal -= rb_to_skb(node)->truesize; + tcp_drop(sk, rb_to_skb(node)); +- sk_mem_reclaim(sk); +- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && +- !tcp_under_memory_pressure(sk)) +- break; ++ if (!prev || goal <= 0) { ++ sk_mem_reclaim(sk); ++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && ++ !tcp_under_memory_pressure(sk)) ++ break; ++ goal = sk->sk_rcvbuf >> 3; ++ } + node = prev; + } while (node); + tp->ooo_last_skb = rb_to_skb(prev); diff --git a/queue-4.17/tcp-helpers-to-send-special-dctcp-ack.patch b/queue-4.17/tcp-helpers-to-send-special-dctcp-ack.patch new file mode 100644 index 00000000000..6c565417bea --- /dev/null +++ b/queue-4.17/tcp-helpers-to-send-special-dctcp-ack.patch @@ -0,0 +1,79 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Yuchung Cheng +Date: Wed, 18 Jul 2018 13:56:34 -0700 +Subject: tcp: helpers to send special DCTCP ack + +From: Yuchung Cheng + +[ Upstream commit 2987babb6982306509380fc11b450227a844493b ] + +Refactor and create helpers to send the special ACK in DCTCP. + +Signed-off-by: Yuchung Cheng +Acked-by: Neal Cardwell +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_output.c | 22 +++++++++++++++++----- + 1 file changed, 17 insertions(+), 5 deletions(-) + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -1031,8 +1031,8 @@ static void tcp_update_skb_after_send(st + * We are working here with either a clone of the original + * SKB, or a fresh unique copy made by the retransmit engine. 
+ */ +-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, +- gfp_t gfp_mask) ++static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, ++ int clone_it, gfp_t gfp_mask, u32 rcv_nxt) + { + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet; +@@ -1108,7 +1108,7 @@ static int tcp_transmit_skb(struct sock + th->source = inet->inet_sport; + th->dest = inet->inet_dport; + th->seq = htonl(tcb->seq); +- th->ack_seq = htonl(tp->rcv_nxt); ++ th->ack_seq = htonl(rcv_nxt); + *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | + tcb->tcp_flags); + +@@ -1186,6 +1186,13 @@ static int tcp_transmit_skb(struct sock + return err; + } + ++static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, ++ gfp_t gfp_mask) ++{ ++ return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, ++ tcp_sk(sk)->rcv_nxt); ++} ++ + /* This routine just queues the buffer for sending. + * + * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, +@@ -3583,7 +3590,7 @@ void tcp_send_delayed_ack(struct sock *s + } + + /* This routine sends an ack and also updates the window. */ +-void tcp_send_ack(struct sock *sk) ++void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) + { + struct sk_buff *buff; + +@@ -3618,7 +3625,12 @@ void tcp_send_ack(struct sock *sk) + skb_set_tcp_pure_ack(buff); + + /* Send it off, this clears delayed acks for us. 
*/ +- tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0); ++ __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt); ++} ++ ++void tcp_send_ack(struct sock *sk) ++{ ++ __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); + } + EXPORT_SYMBOL_GPL(tcp_send_ack); + diff --git a/queue-4.17/tls-check-rcv_shutdown-in-tls_wait_data.patch b/queue-4.17/tls-check-rcv_shutdown-in-tls_wait_data.patch new file mode 100644 index 00000000000..75a1f9c74b8 --- /dev/null +++ b/queue-4.17/tls-check-rcv_shutdown-in-tls_wait_data.patch @@ -0,0 +1,37 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Doron Roberts-Kedes +Date: Wed, 18 Jul 2018 16:22:27 -0700 +Subject: tls: check RCV_SHUTDOWN in tls_wait_data + +From: Doron Roberts-Kedes + +[ Upstream commit fcf4793e278edede8fcd748198d12128037e526c ] + +The current code does not check sk->sk_shutdown & RCV_SHUTDOWN. +tls_sw_recvmsg may return a positive value in the case where bytes have +already been copied when the socket is shutdown. sk->sk_err has been +cleared, causing the tls_wait_data to hang forever on a subsequent +invocation. Checking sk->sk_shutdown & RCV_SHUTDOWN, as in tcp_recvmsg, +fixes this problem. + +Fixes: c46234ebb4d1 ("tls: RX path for ktls") +Acked-by: Dave Watson +Signed-off-by: Doron Roberts-Kedes +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/tls/tls_sw.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/net/tls/tls_sw.c ++++ b/net/tls/tls_sw.c +@@ -646,6 +646,9 @@ static struct sk_buff *tls_wait_data(str + return NULL; + } + ++ if (sk->sk_shutdown & RCV_SHUTDOWN) ++ return NULL; ++ + if (sock_flag(sk, SOCK_DONE)) + return NULL; + diff --git a/queue-4.17/vxlan-add-new-fdb-alloc-and-create-helpers.patch b/queue-4.17/vxlan-add-new-fdb-alloc-and-create-helpers.patch new file mode 100644 index 00000000000..4e7e7165be8 --- /dev/null +++ b/queue-4.17/vxlan-add-new-fdb-alloc-and-create-helpers.patch @@ -0,0 +1,169 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Roopa Prabhu +Date: Fri, 20 Jul 2018 13:21:02 -0700 +Subject: vxlan: add new fdb alloc and create helpers + +From: Roopa Prabhu + +[ Upstream commit 7431016b107c95cb5b2014aa1901fcb115f746bc ] + +- Add new vxlan_fdb_alloc helper +- rename existing vxlan_fdb_create into vxlan_fdb_update: + because it really creates or updates an existing + fdb entry +- move new fdb creation into a separate vxlan_fdb_create + +Main motivation for this change is to introduce the ability +to decouple vxlan fdb creation and notify, used in a later patch. + +Signed-off-by: Roopa Prabhu +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/vxlan.c | 91 +++++++++++++++++++++++++++++++++++----------------- + 1 file changed, 62 insertions(+), 29 deletions(-) + +--- a/drivers/net/vxlan.c ++++ b/drivers/net/vxlan.c +@@ -636,9 +636,62 @@ static int vxlan_gro_complete(struct soc + return eth_gro_complete(skb, nhoff + sizeof(struct vxlanhdr)); + } + +-/* Add new entry to forwarding table -- assumes lock held */ ++static struct vxlan_fdb *vxlan_fdb_alloc(struct vxlan_dev *vxlan, ++ const u8 *mac, __u16 state, ++ __be32 src_vni, __u8 ndm_flags) ++{ ++ struct vxlan_fdb *f; ++ ++ f = kmalloc(sizeof(*f), GFP_ATOMIC); ++ if (!f) ++ return NULL; ++ f->state = state; ++ f->flags = ndm_flags; ++ f->updated = f->used = jiffies; ++ f->vni = src_vni; ++ INIT_LIST_HEAD(&f->remotes); ++ memcpy(f->eth_addr, mac, ETH_ALEN); ++ ++ return f; ++} ++ + static int vxlan_fdb_create(struct vxlan_dev *vxlan, + const u8 *mac, union vxlan_addr *ip, ++ __u16 state, __be16 port, __be32 src_vni, ++ __be32 vni, __u32 ifindex, __u8 ndm_flags, ++ struct vxlan_fdb **fdb) ++{ ++ struct vxlan_rdst *rd = NULL; ++ struct vxlan_fdb *f; ++ int rc; ++ ++ if (vxlan->cfg.addrmax && ++ vxlan->addrcnt >= vxlan->cfg.addrmax) ++ return -ENOSPC; ++ ++ netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); ++ f = vxlan_fdb_alloc(vxlan, mac, state, src_vni, ndm_flags); ++ if (!f) ++ return -ENOMEM; ++ ++ rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); ++ if (rc < 0) { ++ kfree(f); ++ return rc; ++ } ++ ++ ++vxlan->addrcnt; ++ hlist_add_head_rcu(&f->hlist, ++ vxlan_fdb_head(vxlan, mac, src_vni)); ++ ++ *fdb = f; ++ ++ return 0; ++} ++ ++/* Add new entry to forwarding table -- assumes lock held */ ++static int vxlan_fdb_update(struct vxlan_dev *vxlan, ++ const u8 *mac, union vxlan_addr *ip, + __u16 state, __u16 flags, + __be16 port, __be32 src_vni, __be32 vni, + __u32 ifindex, __u8 ndm_flags) +@@ -687,37 +740,17 @@ static int vxlan_fdb_create(struct vxlan + if (!(flags & NLM_F_CREATE)) + 
return -ENOENT; + +- if (vxlan->cfg.addrmax && +- vxlan->addrcnt >= vxlan->cfg.addrmax) +- return -ENOSPC; +- + /* Disallow replace to add a multicast entry */ + if ((flags & NLM_F_REPLACE) && + (is_multicast_ether_addr(mac) || is_zero_ether_addr(mac))) + return -EOPNOTSUPP; + + netdev_dbg(vxlan->dev, "add %pM -> %pIS\n", mac, ip); +- f = kmalloc(sizeof(*f), GFP_ATOMIC); +- if (!f) +- return -ENOMEM; +- +- notify = 1; +- f->state = state; +- f->flags = ndm_flags; +- f->updated = f->used = jiffies; +- f->vni = src_vni; +- INIT_LIST_HEAD(&f->remotes); +- memcpy(f->eth_addr, mac, ETH_ALEN); +- +- rc = vxlan_fdb_append(f, ip, port, vni, ifindex, &rd); +- if (rc < 0) { +- kfree(f); ++ rc = vxlan_fdb_create(vxlan, mac, ip, state, port, src_vni, ++ vni, ifindex, ndm_flags, &f); ++ if (rc < 0) + return rc; +- } +- +- ++vxlan->addrcnt; +- hlist_add_head_rcu(&f->hlist, +- vxlan_fdb_head(vxlan, mac, src_vni)); ++ notify = 1; + } + + if (notify) { +@@ -863,7 +896,7 @@ static int vxlan_fdb_add(struct ndmsg *n + return -EAFNOSUPPORT; + + spin_lock_bh(&vxlan->hash_lock); +- err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags, ++ err = vxlan_fdb_update(vxlan, addr, &ip, ndm->ndm_state, flags, + port, src_vni, vni, ifindex, ndm->ndm_flags); + spin_unlock_bh(&vxlan->hash_lock); + +@@ -1006,7 +1039,7 @@ static bool vxlan_snoop(struct net_devic + + /* close off race between vxlan_flush and incoming packets */ + if (netif_running(dev)) +- vxlan_fdb_create(vxlan, src_mac, src_ip, ++ vxlan_fdb_update(vxlan, src_mac, src_ip, + NUD_REACHABLE, + NLM_F_EXCL|NLM_F_CREATE, + vxlan->cfg.dst_port, +@@ -3165,7 +3198,7 @@ static int __vxlan_dev_create(struct net + + /* create an fdb entry for a valid default destination */ + if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) { +- err = vxlan_fdb_create(vxlan, all_zeros_mac, ++ err = vxlan_fdb_update(vxlan, all_zeros_mac, + &vxlan->default_dst.remote_ip, + NUD_REACHABLE | NUD_PERMANENT, + NLM_F_EXCL | NLM_F_CREATE, +@@ -3439,7 +3472,7 
@@ static int vxlan_changelink(struct net_d + old_dst.remote_ifindex, 0); + + if (!vxlan_addr_any(&dst->remote_ip)) { +- err = vxlan_fdb_create(vxlan, all_zeros_mac, ++ err = vxlan_fdb_update(vxlan, all_zeros_mac, + &dst->remote_ip, + NUD_REACHABLE | NUD_PERMANENT, + NLM_F_CREATE | NLM_F_APPEND, diff --git a/queue-4.17/vxlan-fix-default-fdb-entry-netlink-notify-ordering-during-netdev-create.patch b/queue-4.17/vxlan-fix-default-fdb-entry-netlink-notify-ordering-during-netdev-create.patch new file mode 100644 index 00000000000..344f0fc69e7 --- /dev/null +++ b/queue-4.17/vxlan-fix-default-fdb-entry-netlink-notify-ordering-during-netdev-create.patch @@ -0,0 +1,116 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Roopa Prabhu +Date: Fri, 20 Jul 2018 13:21:04 -0700 +Subject: vxlan: fix default fdb entry netlink notify ordering during netdev create + +From: Roopa Prabhu + +[ Upstream commit e99465b952861533d9ba748fdbecc96d9a36da3e ] + +Problem: +In vxlan_newlink, a default fdb entry is added before register_netdev. +The default fdb creation function also notifies user-space of the +fdb entry on the vxlan device which user-space does not know about yet. +(RTM_NEWNEIGH goes before RTM_NEWLINK for the same ifindex). + +This patch fixes the user-space netlink notification ordering issue +with the following changes: +- decouple fdb notify from fdb create. +- Move fdb notify after register_netdev. +- Call rtnl_configure_link in vxlan newlink handler to notify +userspace about the newlink before fdb notify and +hence avoiding the user-space race. + +Fixes: afbd8bae9c79 ("vxlan: add implicit fdb entry for default destination") +Signed-off-by: Roopa Prabhu +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/vxlan.c | 29 +++++++++++++++++++++-------- + 1 file changed, 21 insertions(+), 8 deletions(-) + +--- a/drivers/net/vxlan.c ++++ b/drivers/net/vxlan.c +@@ -3190,6 +3190,7 @@ static int __vxlan_dev_create(struct net + { + struct vxlan_net *vn = net_generic(net, vxlan_net_id); + struct vxlan_dev *vxlan = netdev_priv(dev); ++ struct vxlan_fdb *f = NULL; + int err; + + err = vxlan_dev_configure(net, dev, conf, false, extack); +@@ -3200,27 +3201,38 @@ static int __vxlan_dev_create(struct net + + /* create an fdb entry for a valid default destination */ + if (!vxlan_addr_any(&vxlan->default_dst.remote_ip)) { +- err = vxlan_fdb_update(vxlan, all_zeros_mac, ++ err = vxlan_fdb_create(vxlan, all_zeros_mac, + &vxlan->default_dst.remote_ip, + NUD_REACHABLE | NUD_PERMANENT, +- NLM_F_EXCL | NLM_F_CREATE, + vxlan->cfg.dst_port, + vxlan->default_dst.remote_vni, + vxlan->default_dst.remote_vni, + vxlan->default_dst.remote_ifindex, +- NTF_SELF); ++ NTF_SELF, &f); + if (err) + return err; + } + + err = register_netdevice(dev); ++ if (err) ++ goto errout; ++ ++ err = rtnl_configure_link(dev, NULL); + if (err) { +- vxlan_fdb_delete_default(vxlan, vxlan->default_dst.remote_vni); +- return err; ++ unregister_netdevice(dev); ++ goto errout; + } + ++ /* notify default fdb entry */ ++ if (f) ++ vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH); ++ + list_add(&vxlan->next, &vn->vxlan_list); + return 0; ++errout: ++ if (f) ++ vxlan_fdb_destroy(vxlan, f, false); ++ return err; + } + + static int vxlan_nl2conf(struct nlattr *tb[], struct nlattr *data[], +@@ -3449,6 +3461,7 @@ static int vxlan_changelink(struct net_d + struct vxlan_rdst *dst = &vxlan->default_dst; + struct vxlan_rdst old_dst; + struct vxlan_config conf; ++ struct vxlan_fdb *f = NULL; + int err; + + err = vxlan_nl2conf(tb, data, +@@ -3474,19 +3487,19 @@ static int vxlan_changelink(struct net_d + old_dst.remote_ifindex, 0); + + if 
(!vxlan_addr_any(&dst->remote_ip)) { +- err = vxlan_fdb_update(vxlan, all_zeros_mac, ++ err = vxlan_fdb_create(vxlan, all_zeros_mac, + &dst->remote_ip, + NUD_REACHABLE | NUD_PERMANENT, +- NLM_F_CREATE | NLM_F_APPEND, + vxlan->cfg.dst_port, + dst->remote_vni, + dst->remote_vni, + dst->remote_ifindex, +- NTF_SELF); ++ NTF_SELF, &f); + if (err) { + spin_unlock_bh(&vxlan->hash_lock); + return err; + } ++ vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_NEWNEIGH); + } + spin_unlock_bh(&vxlan->hash_lock); + } diff --git a/queue-4.17/vxlan-make-netlink-notify-in-vxlan_fdb_destroy-optional.patch b/queue-4.17/vxlan-make-netlink-notify-in-vxlan_fdb_destroy-optional.patch new file mode 100644 index 00000000000..1b1fcc50a1c --- /dev/null +++ b/queue-4.17/vxlan-make-netlink-notify-in-vxlan_fdb_destroy-optional.patch @@ -0,0 +1,75 @@ +From foo@baz Fri Jul 27 08:31:26 CEST 2018 +From: Roopa Prabhu +Date: Fri, 20 Jul 2018 13:21:03 -0700 +Subject: vxlan: make netlink notify in vxlan_fdb_destroy optional + +From: Roopa Prabhu + +[ Upstream commit f6e053858671bb156b6e44ad66418acc8c7f4e77 ] + +Add a new option do_notify to vxlan_fdb_destroy to make +sending netlink notify optional. Used by a later patch. + +Signed-off-by: Roopa Prabhu +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/vxlan.c | 14 ++++++++------ + 1 file changed, 8 insertions(+), 6 deletions(-) + +--- a/drivers/net/vxlan.c ++++ b/drivers/net/vxlan.c +@@ -774,13 +774,15 @@ static void vxlan_fdb_free(struct rcu_he + kfree(f); + } + +-static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f) ++static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f, ++ bool do_notify) + { + netdev_dbg(vxlan->dev, + "delete %pM\n", f->eth_addr); + + --vxlan->addrcnt; +- vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_DELNEIGH); ++ if (do_notify) ++ vxlan_fdb_notify(vxlan, f, first_remote_rtnl(f), RTM_DELNEIGH); + + hlist_del_rcu(&f->hlist); + call_rcu(&f->rcu, vxlan_fdb_free); +@@ -930,7 +932,7 @@ static int __vxlan_fdb_delete(struct vxl + goto out; + } + +- vxlan_fdb_destroy(vxlan, f); ++ vxlan_fdb_destroy(vxlan, f, true); + + out: + return 0; +@@ -2393,7 +2395,7 @@ static void vxlan_cleanup(struct timer_l + "garbage collect %pM\n", + f->eth_addr); + f->state = NUD_STALE; +- vxlan_fdb_destroy(vxlan, f); ++ vxlan_fdb_destroy(vxlan, f, true); + } else if (time_before(timeout, next_timer)) + next_timer = timeout; + } +@@ -2444,7 +2446,7 @@ static void vxlan_fdb_delete_default(str + spin_lock_bh(&vxlan->hash_lock); + f = __vxlan_find_mac(vxlan, all_zeros_mac, vni); + if (f) +- vxlan_fdb_destroy(vxlan, f); ++ vxlan_fdb_destroy(vxlan, f, true); + spin_unlock_bh(&vxlan->hash_lock); + } + +@@ -2498,7 +2500,7 @@ static void vxlan_flush(struct vxlan_dev + continue; + /* the all_zeros_mac entry is deleted at vxlan_uninit */ + if (!is_zero_ether_addr(f->eth_addr)) +- vxlan_fdb_destroy(vxlan, f); ++ vxlan_fdb_destroy(vxlan, f, true); + } + } + spin_unlock_bh(&vxlan->hash_lock);