--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Mon, 23 Jul 2018 16:50:48 +0200
+Subject: ip: hash fragments consistently
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+[ Upstream commit 3dd1c9a1270736029ffca670e9bd0265f4120600 ]
+
+The skb hash for locally generated ip[v6] fragments belonging
+to the same datagram can vary in several circumstances:
+* for connected UDP[v6] sockets, the first fragment gets its hash
+ via set_owner_w()/skb_set_hash_from_sk()
+* for unconnected IPv6 UDPv6 sockets, the first fragment can get
+ its hash via ip6_make_flowlabel()/skb_get_hash_flowi6(), if
+ auto_flowlabel is enabled
+
+For the following frags the hash is usually computed via
+skb_get_hash().
+The above can cause OoO for unconnected IPv6 UDPv6 sockets: in that
+scenario the egress tx queue can be selected on a per-packet basis
+via the skb hash.
+It may also fool flow-oriented schedulers to place fragments belonging
+to the same datagram in different flows.
+
+Fix the issue by copying the skb hash from the head frag into
+the others at fragmentation time.
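+
+For reference, skb_copy_hash() in include/linux/skbuff.h roughly amounts to
+copying the three hash-related fields of the skb:
+
+  to->l4_hash = from->l4_hash;
+  to->sw_hash = from->sw_hash;
+  to->hash    = from->hash;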
+
+Before this commit:
+perf probe -a "dev_queue_xmit skb skb->hash skb->l4_hash:b1@0/8 skb->sw_hash:b1@1/8"
+netperf -H $IPV4 -t UDP_STREAM -l 5 -- -m 2000 -n &
+perf record -e probe:dev_queue_xmit -e probe:skb_set_owner_w -a sleep 0.1
+perf script
+probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=3713014309 l4_hash=1 sw_hash=0
+probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=0 l4_hash=0 sw_hash=0
+
+After this commit:
+probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=2171763177 l4_hash=1 sw_hash=0
+probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=2171763177 l4_hash=1 sw_hash=0
+
+Fixes: b73c3d0e4f0e ("net: Save TX flow hash in sock and set in skbuf on xmit")
+Fixes: 67800f9b1f4e ("ipv6: Call skb_get_hash_flowi6 to get skb->hash in ip6_make_flowlabel")
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/ip_output.c | 2 ++
+ net/ipv6/ip6_output.c | 2 ++
+ 2 files changed, 4 insertions(+)
+
+--- a/net/ipv4/ip_output.c
++++ b/net/ipv4/ip_output.c
+@@ -497,6 +497,8 @@ static void ip_copy_metadata(struct sk_b
+ to->dev = from->dev;
+ to->mark = from->mark;
+
++ skb_copy_hash(to, from);
++
+ /* Copy the flags to each fragment. */
+ IPCB(to)->flags = IPCB(from)->flags;
+
+--- a/net/ipv6/ip6_output.c
++++ b/net/ipv6/ip6_output.c
+@@ -576,6 +576,8 @@ static void ip6_copy_metadata(struct sk_
+ to->dev = from->dev;
+ to->mark = from->mark;
+
++ skb_copy_hash(to, from);
++
+ #ifdef CONFIG_NET_SCHED
+ to->tc_index = from->tc_index;
+ #endif
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Willem de Bruijn <willemb@google.com>
+Date: Mon, 23 Jul 2018 19:36:48 -0400
+Subject: ip: in cmsg IP(V6)_ORIGDSTADDR call pskb_may_pull
+
+From: Willem de Bruijn <willemb@google.com>
+
+[ Upstream commit 2efd4fca703a6707cad16ab486eaab8fc7f0fd49 ]
+
+Syzbot reported a read beyond the end of the skb head when returning
+IPV6_ORIGDSTADDR:
+
+ BUG: KMSAN: kernel-infoleak in put_cmsg+0x5ef/0x860 net/core/scm.c:242
+ CPU: 0 PID: 4501 Comm: syz-executor128 Not tainted 4.17.0+ #9
+ Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS
+ Google 01/01/2011
+ Call Trace:
+ __dump_stack lib/dump_stack.c:77 [inline]
+ dump_stack+0x185/0x1d0 lib/dump_stack.c:113
+ kmsan_report+0x188/0x2a0 mm/kmsan/kmsan.c:1125
+ kmsan_internal_check_memory+0x138/0x1f0 mm/kmsan/kmsan.c:1219
+ kmsan_copy_to_user+0x7a/0x160 mm/kmsan/kmsan.c:1261
+ copy_to_user include/linux/uaccess.h:184 [inline]
+ put_cmsg+0x5ef/0x860 net/core/scm.c:242
+ ip6_datagram_recv_specific_ctl+0x1cf3/0x1eb0 net/ipv6/datagram.c:719
+ ip6_datagram_recv_ctl+0x41c/0x450 net/ipv6/datagram.c:733
+ rawv6_recvmsg+0x10fb/0x1460 net/ipv6/raw.c:521
+ [..]
+
+This logic and its ipv4 counterpart read the destination port from
+the packet at skb_transport_offset(skb) + 4.
+
+With MSG_MORE and a local SOCK_RAW sender, syzbot was able to cook a
+packet that stores headers exactly up to skb_transport_offset(skb) in
+the head and the remainder in a frag.
+
+Call pskb_may_pull before accessing the pointer to ensure that it lies
+within the skb head.
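+
+In sketch form (the exact hunks are below), the guarded access becomes:
+
+  int end = skb_transport_offset(skb) + 4;
+
+  if (end > 0 && !pskb_may_pull(skb, end))
+          return;   /* the ports are not within the skb head */
+  ports = (__be16 *)skb_transport_header(skb);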
+
+Link: http://lkml.kernel.org/r/CAF=yD-LEJwZj5a1-bAAj2Oy_hKmGygV6rsJ_WOrAYnv-fnayiQ@mail.gmail.com
+Reported-by: syzbot+9adb4b567003cac781f0@syzkaller.appspotmail.com
+Signed-off-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/ip_sockglue.c | 7 +++++--
+ net/ipv6/datagram.c | 7 +++++--
+ 2 files changed, 10 insertions(+), 4 deletions(-)
+
+--- a/net/ipv4/ip_sockglue.c
++++ b/net/ipv4/ip_sockglue.c
+@@ -135,15 +135,18 @@ static void ip_cmsg_recv_dstaddr(struct
+ {
+ struct sockaddr_in sin;
+ const struct iphdr *iph = ip_hdr(skb);
+- __be16 *ports = (__be16 *)skb_transport_header(skb);
++ __be16 *ports;
++ int end;
+
+- if (skb_transport_offset(skb) + 4 > (int)skb->len)
++ end = skb_transport_offset(skb) + 4;
++ if (end > 0 && !pskb_may_pull(skb, end))
+ return;
+
+ /* All current transport protocols have the port numbers in the
+ * first four bytes of the transport header and this function is
+ * written with this assumption in mind.
+ */
++ ports = (__be16 *)skb_transport_header(skb);
+
+ sin.sin_family = AF_INET;
+ sin.sin_addr.s_addr = iph->daddr;
+--- a/net/ipv6/datagram.c
++++ b/net/ipv6/datagram.c
+@@ -694,13 +694,16 @@ void ip6_datagram_recv_specific_ctl(stru
+ }
+ if (np->rxopt.bits.rxorigdstaddr) {
+ struct sockaddr_in6 sin6;
+- __be16 *ports = (__be16 *) skb_transport_header(skb);
++ __be16 *ports;
++ int end;
+
+- if (skb_transport_offset(skb) + 4 <= (int)skb->len) {
++ end = skb_transport_offset(skb) + 4;
++ if (end <= 0 || pskb_may_pull(skb, end)) {
+ /* All current transport protocols have the port numbers in the
+ * first four bytes of the transport header and this function is
+ * written with this assumption in mind.
+ */
++ ports = (__be16 *)skb_transport_header(skb);
+
+ sin6.sin6_family = AF_INET6;
+ sin6.sin6_addr = ipv6_hdr(skb)->daddr;
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Hangbin Liu <liuhangbin@gmail.com>
+Date: Fri, 20 Jul 2018 14:04:27 +0800
+Subject: multicast: do not restore deleted record source filter mode to new one
+
+From: Hangbin Liu <liuhangbin@gmail.com>
+
+There are two scenarios in which we restore deleted records. The first is
+when a device goes down and up again (or is unmapped/remapped). In this
+scenario the new filter mode is the same as the previous one, because we take
+it from in_dev->mc_list and do not touch it while the device is down.
+
+The other scenario is when a new socket joins a group that was just deleted
+and has not finished sending its status reports. In this scenario we should
+use the current filter mode instead of restoring the old one. Here are the 4
+cases in total:
+
+old_socket   new_socket   before_fix   after_fix
+  IN(A)        IN(A)       ALLOW(A)     ALLOW(A)
+  IN(A)        EX( )       TO_IN( )     TO_EX( )
+  EX( )        IN(A)       TO_EX( )     ALLOW(A)
+  EX( )        EX( )       TO_EX( )     TO_EX( )
+
+Fixes: 24803f38a5c0b (igmp: do not remove igmp souce list info when set link down)
+Fixes: 1666d49e1d416 (mld: do not remove mld souce list info when set link down)
+Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/igmp.c | 3 +--
+ net/ipv6/mcast.c | 3 +--
+ 2 files changed, 2 insertions(+), 4 deletions(-)
+
+--- a/net/ipv4/igmp.c
++++ b/net/ipv4/igmp.c
+@@ -1193,8 +1193,7 @@ static void igmpv3_del_delrec(struct in_
+ if (pmc) {
+ im->interface = pmc->interface;
+ im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
+- im->sfmode = pmc->sfmode;
+- if (pmc->sfmode == MCAST_INCLUDE) {
++ if (im->sfmode == MCAST_INCLUDE) {
+ im->tomb = pmc->tomb;
+ im->sources = pmc->sources;
+ for (psf = im->sources; psf; psf = psf->sf_next)
+--- a/net/ipv6/mcast.c
++++ b/net/ipv6/mcast.c
+@@ -771,8 +771,7 @@ static void mld_del_delrec(struct inet6_
+ if (pmc) {
+ im->idev = pmc->idev;
+ im->mca_crcount = idev->mc_qrv;
+- im->mca_sfmode = pmc->mca_sfmode;
+- if (pmc->mca_sfmode == MCAST_INCLUDE) {
++ if (im->mca_sfmode == MCAST_INCLUDE) {
+ im->mca_tomb = pmc->mca_tomb;
+ im->mca_sources = pmc->mca_sources;
+ for (psf = im->mca_sources; psf; psf = psf->sf_next)
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Jack Morgenstein <jackm@dev.mellanox.co.il>
+Date: Tue, 24 Jul 2018 14:27:55 +0300
+Subject: net/mlx4_core: Save the qpn from the input modifier in RST2INIT wrapper
+
+From: Jack Morgenstein <jackm@dev.mellanox.co.il>
+
+[ Upstream commit 958c696f5a7274d9447a458ad7aa70719b29a50a ]
+
+Function mlx4_RST2INIT_QP_wrapper saved the qp number passed in the qp
+context, rather than the one passed in the input modifier.
+
+However, the qp number in the qp context is not defined as a
+required parameter by the FW. Therefore, drivers may choose to not
+specify the qp number in the qp context for the reset-to-init transition.
+
+Thus, we must save the qp number passed in the command input modifier --
+which is always present. (This saved qp number is used as the input
+modifier for command 2RST_QP when a slave's qps are destroyed).
+
+Fixes: c82e9aa0a8bc ("mlx4_core: resource tracking for HCA resources used by guests")
+Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
+Signed-off-by: Tariq Toukan <tariqt@mellanox.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx4/resource_tracker.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
++++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+@@ -2916,7 +2916,7 @@ int mlx4_RST2INIT_QP_wrapper(struct mlx4
+ u32 srqn = qp_get_srqn(qpc) & 0xffffff;
+ int use_srq = (qp_get_srqn(qpc) >> 24) & 1;
+ struct res_srq *srq;
+- int local_qpn = be32_to_cpu(qpc->local_qpn) & 0xffffff;
++ int local_qpn = vhcr->in_modifier & 0xffffff;
+
+ err = adjust_qp_sched_queue(dev, slave, qpc, inbox);
+ if (err)
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Ariel Levkovich <lariel@mellanox.com>
+Date: Mon, 25 Jun 2018 19:12:02 +0300
+Subject: net/mlx5: Adjust clock overflow work period
+
+From: Ariel Levkovich <lariel@mellanox.com>
+
+[ Upstream commit 33180bee86a8940a84950edca46315cd9dd6deb5 ]
+
+When the driver converts a HW timestamp to wall clock time it subtracts
+the last saved cycle counter from the HW timestamp and converts the
+difference to nanoseconds.
+The conversion is done by first multiplying the cycle difference by the
+clock multiplier value, therefore the cycle difference must be small
+enough that the multiplication product does not exceed 64 bits.
+
+The overflow handling routine is in charge of updating the last saved
+cycle counter in the driver and it is called periodically using a kernel
+delayed workqueue.
+
+The delay period for this work is calculated using the max HW cycle
+counter value (a 41-bit mask) as a base, which does not take the 64-bit
+limit into account, so the delay period may be incorrect and too long to
+prevent a large difference between the HW counter and the last counter
+saved in SW.
+
+This change adjusts the period of the HW clock overflow work by taking
+the minimum between the previous value and the quotient of the max u64
+value and the clock multiplier value.
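+
+In sketch form (see the hunk below for the actual code), the bound on the
+cycle delta becomes:
+
+  /* cyc2ns multiplies the delta by cycles.mult: keep the product below 2^63 */
+  overflow_cycles = div64_u64(~0ULL >> 1, tstamp->cycles.mult);
+  /* and never let the delta exceed half of the HW counter range */
+  overflow_cycles = min(overflow_cycles, tstamp->cycles.mask >> 1);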
+
+Fixes: ef9814deafd0 ("net/mlx5e: Add HW timestamping (TS) support")
+Signed-off-by: Ariel Levkovich <lariel@mellanox.com>
+Reviewed-by: Eran Ben Elisha <eranbe@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_clock.c | 12 ++++++++++--
+ 1 file changed, 10 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_clock.c
+@@ -233,6 +233,7 @@ static void mlx5e_timestamp_init_config(
+ void mlx5e_timestamp_init(struct mlx5e_priv *priv)
+ {
+ struct mlx5e_tstamp *tstamp = &priv->tstamp;
++ u64 overflow_cycles;
+ u64 ns;
+ u64 frac = 0;
+ u32 dev_freq;
+@@ -257,10 +258,17 @@ void mlx5e_timestamp_init(struct mlx5e_p
+
+ /* Calculate period in seconds to call the overflow watchdog - to make
+ * sure counter is checked at least once every wrap around.
++ * The period is calculated as the minimum between max HW cycles count
++ * (The clock source mask) and max amount of cycles that can be
++ * multiplied by clock multiplier where the result doesn't exceed
++ * 64bits.
+ */
+- ns = cyclecounter_cyc2ns(&tstamp->cycles, tstamp->cycles.mask,
++ overflow_cycles = div64_u64(~0ULL >> 1, tstamp->cycles.mult);
++ overflow_cycles = min(overflow_cycles, tstamp->cycles.mask >> 1);
++
++ ns = cyclecounter_cyc2ns(&tstamp->cycles, overflow_cycles,
+ frac, &frac);
+- do_div(ns, NSEC_PER_SEC / 2 / HZ);
++ do_div(ns, NSEC_PER_SEC / HZ);
+ tstamp->overflow_period = ns;
+
+ INIT_DELAYED_WORK(&tstamp->overflow_work, mlx5e_timestamp_overflow);
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Eran Ben Elisha <eranbe@mellanox.com>
+Date: Sun, 8 Jul 2018 14:52:12 +0300
+Subject: net/mlx5e: Don't allow aRFS for encapsulated packets
+
+From: Eran Ben Elisha <eranbe@mellanox.com>
+
+[ Upstream commit d2e1c57bcf9a07cbb67f30ecf238f298799bce1c ]
+
+The driver does not yet support aRFS for encapsulated packets; return an
+error early in that case.
+
+Fixes: 18c908e477dc ("net/mlx5e: Add accelerated RFS support")
+Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
+Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+@@ -715,6 +715,9 @@ int mlx5e_rx_flow_steer(struct net_devic
+ skb->protocol != htons(ETH_P_IPV6))
+ return -EPROTONOSUPPORT;
+
++ if (skb->encapsulation)
++ return -EPROTONOSUPPORT;
++
+ arfs_t = arfs_get_table(arfs, arfs_get_ip_proto(skb), skb->protocol);
+ if (!arfs_t)
+ return -EPROTONOSUPPORT;
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Eran Ben Elisha <eranbe@mellanox.com>
+Date: Sun, 8 Jul 2018 13:08:55 +0300
+Subject: net/mlx5e: Fix quota counting in aRFS expire flow
+
+From: Eran Ben Elisha <eranbe@mellanox.com>
+
+[ Upstream commit 2630bae8018823c3b88788b69fb9f16ea3b4a11e ]
+
+The quota should track the number of rules that actually expire, not the
+number of rules that were examined; fix that.
+
+Fixes: 18c908e477dc ("net/mlx5e: Add accelerated RFS support")
+Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
+Reviewed-by: Maor Gottlieb <maorg@mellanox.com>
+Reviewed-by: Tariq Toukan <tariqt@mellanox.com>
+Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+@@ -383,14 +383,14 @@ static void arfs_may_expire_flow(struct
+ HLIST_HEAD(del_list);
+ spin_lock_bh(&priv->fs.arfs.arfs_lock);
+ mlx5e_for_each_arfs_rule(arfs_rule, htmp, priv->fs.arfs.arfs_tables, i, j) {
+- if (quota++ > MLX5E_ARFS_EXPIRY_QUOTA)
+- break;
+ if (!work_pending(&arfs_rule->arfs_work) &&
+ rps_may_expire_flow(priv->netdev,
+ arfs_rule->rxq, arfs_rule->flow_id,
+ arfs_rule->filter_id)) {
+ hlist_del_init(&arfs_rule->hlist);
+ hlist_add_head(&arfs_rule->hlist, &del_list);
++ if (quota++ > MLX5E_ARFS_EXPIRY_QUOTA)
++ break;
+ }
+ }
+ spin_unlock_bh(&priv->fs.arfs.arfs_lock);
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Heiner Kallweit <hkallweit1@gmail.com>
+Date: Thu, 19 Jul 2018 08:15:16 +0200
+Subject: net: phy: consider PHY_IGNORE_INTERRUPT in phy_start_aneg_priv
+
+From: Heiner Kallweit <hkallweit1@gmail.com>
+
+[ Upstream commit 215d08a85b9acf5e1fe9dbf50f1774cde333efef ]
+
+The situation described in the comment can also occur with
+PHY_IGNORE_INTERRUPT, therefore change the condition to include it.
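+
+For context (paraphrasing include/linux/phy.h, this is not part of the patch):
+phydev->irq holds either a real IRQ number, PHY_POLL (-1), or
+PHY_IGNORE_INTERRUPT (-2), and phy_interrupt_is_valid() only accepts a real
+IRQ number. Checking against PHY_POLL instead covers both remaining cases:
+
+  if (phydev->irq != PHY_POLL && phydev->state == PHY_AN)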
+
+Fixes: f555f34fdc58 ("net: phy: fix auto-negotiation stall due to unavailable interrupt")
+Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/phy/phy.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/net/phy/phy.c
++++ b/drivers/net/phy/phy.c
+@@ -598,7 +598,7 @@ static int phy_start_aneg_priv(struct ph
+ * negotiation may already be done and aneg interrupt may not be
+ * generated.
+ */
+- if (phy_interrupt_is_valid(phydev) && (phydev->state == PHY_AN)) {
++ if (phydev->irq != PHY_POLL && phydev->state == PHY_AN) {
+ err = phy_aneg_done(phydev);
+ if (err > 0) {
+ trigger = true;
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Eric Dumazet <edumazet@google.com>
+Date: Thu, 19 Jul 2018 16:04:38 -0700
+Subject: net: skb_segment() should not return NULL
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit ff907a11a0d68a749ce1a321f4505c03bf72190c ]
+
+syzbot caught a NULL deref [1], caused by skb_segment()
+
+skb_segment() has many "goto err;" that assume the @err variable
+contains -ENOMEM.
+
+A successful call to __skb_linearize() should not clear @err, otherwise a
+subsequent memory allocation failure would make skb_segment() return
+ERR_PTR(0), i.e. NULL.
+
+While we are at it, we might use -EINVAL instead of -ENOMEM when the
+MAX_SKB_FRAGS limit is reached.
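+
+A simplified sketch of the problem (not the exact kernel code):
+
+  int err = -ENOMEM;             /* assumed by every "goto err" below */
+  ...
+  err = __skb_linearize(nskb);   /* 0 on success: clobbers the -ENOMEM */
+  if (err)
+          goto err;
+  ...
+  nskb = skb_clone(...);
+  if (!nskb)
+          goto err;              /* would return ERR_PTR(0), i.e. NULL */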
+
+[1]
+kasan: CONFIG_KASAN_INLINE enabled
+kasan: GPF could be caused by NULL-ptr deref or user memory access
+general protection fault: 0000 [#1] SMP KASAN
+CPU: 0 PID: 13285 Comm: syz-executor3 Not tainted 4.18.0-rc4+ #146
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011
+RIP: 0010:tcp_gso_segment+0x3dc/0x1780 net/ipv4/tcp_offload.c:106
+Code: f0 ff ff 0f 87 1c fd ff ff e8 00 88 0b fb 48 8b 75 d0 48 b9 00 00 00 00 00 fc ff df 48 8d be 90 00 00 00 48 89 f8 48 c1 e8 03 <0f> b6 14 08 48 8d 86 94 00 00 00 48 89 c6 83 e0 07 48 c1 ee 03 0f
+RSP: 0018:ffff88019b7fd060 EFLAGS: 00010206
+RAX: 0000000000000012 RBX: 0000000000000020 RCX: dffffc0000000000
+RDX: 0000000000040000 RSI: 0000000000000000 RDI: 0000000000000090
+RBP: ffff88019b7fd0f0 R08: ffff88019510e0c0 R09: ffffed003b5c46d6
+R10: ffffed003b5c46d6 R11: ffff8801dae236b3 R12: 0000000000000001
+R13: ffff8801d6c581f4 R14: 0000000000000000 R15: ffff8801d6c58128
+FS: 00007fcae64d6700(0000) GS:ffff8801dae00000(0000) knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00000000004e8664 CR3: 00000001b669b000 CR4: 00000000001406f0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+Call Trace:
+ tcp4_gso_segment+0x1c3/0x440 net/ipv4/tcp_offload.c:54
+ inet_gso_segment+0x64e/0x12d0 net/ipv4/af_inet.c:1342
+ inet_gso_segment+0x64e/0x12d0 net/ipv4/af_inet.c:1342
+ skb_mac_gso_segment+0x3b5/0x740 net/core/dev.c:2792
+ __skb_gso_segment+0x3c3/0x880 net/core/dev.c:2865
+ skb_gso_segment include/linux/netdevice.h:4099 [inline]
+ validate_xmit_skb+0x640/0xf30 net/core/dev.c:3104
+ __dev_queue_xmit+0xc14/0x3910 net/core/dev.c:3561
+ dev_queue_xmit+0x17/0x20 net/core/dev.c:3602
+ neigh_hh_output include/net/neighbour.h:473 [inline]
+ neigh_output include/net/neighbour.h:481 [inline]
+ ip_finish_output2+0x1063/0x1860 net/ipv4/ip_output.c:229
+ ip_finish_output+0x841/0xfa0 net/ipv4/ip_output.c:317
+ NF_HOOK_COND include/linux/netfilter.h:276 [inline]
+ ip_output+0x223/0x880 net/ipv4/ip_output.c:405
+ dst_output include/net/dst.h:444 [inline]
+ ip_local_out+0xc5/0x1b0 net/ipv4/ip_output.c:124
+ iptunnel_xmit+0x567/0x850 net/ipv4/ip_tunnel_core.c:91
+ ip_tunnel_xmit+0x1598/0x3af1 net/ipv4/ip_tunnel.c:778
+ ipip_tunnel_xmit+0x264/0x2c0 net/ipv4/ipip.c:308
+ __netdev_start_xmit include/linux/netdevice.h:4148 [inline]
+ netdev_start_xmit include/linux/netdevice.h:4157 [inline]
+ xmit_one net/core/dev.c:3034 [inline]
+ dev_hard_start_xmit+0x26c/0xc30 net/core/dev.c:3050
+ __dev_queue_xmit+0x29ef/0x3910 net/core/dev.c:3569
+ dev_queue_xmit+0x17/0x20 net/core/dev.c:3602
+ neigh_direct_output+0x15/0x20 net/core/neighbour.c:1403
+ neigh_output include/net/neighbour.h:483 [inline]
+ ip_finish_output2+0xa67/0x1860 net/ipv4/ip_output.c:229
+ ip_finish_output+0x841/0xfa0 net/ipv4/ip_output.c:317
+ NF_HOOK_COND include/linux/netfilter.h:276 [inline]
+ ip_output+0x223/0x880 net/ipv4/ip_output.c:405
+ dst_output include/net/dst.h:444 [inline]
+ ip_local_out+0xc5/0x1b0 net/ipv4/ip_output.c:124
+ ip_queue_xmit+0x9df/0x1f80 net/ipv4/ip_output.c:504
+ tcp_transmit_skb+0x1bf9/0x3f10 net/ipv4/tcp_output.c:1168
+ tcp_write_xmit+0x1641/0x5c20 net/ipv4/tcp_output.c:2363
+ __tcp_push_pending_frames+0xb2/0x290 net/ipv4/tcp_output.c:2536
+ tcp_push+0x638/0x8c0 net/ipv4/tcp.c:735
+ tcp_sendmsg_locked+0x2ec5/0x3f00 net/ipv4/tcp.c:1410
+ tcp_sendmsg+0x2f/0x50 net/ipv4/tcp.c:1447
+ inet_sendmsg+0x1a1/0x690 net/ipv4/af_inet.c:798
+ sock_sendmsg_nosec net/socket.c:641 [inline]
+ sock_sendmsg+0xd5/0x120 net/socket.c:651
+ __sys_sendto+0x3d7/0x670 net/socket.c:1797
+ __do_sys_sendto net/socket.c:1809 [inline]
+ __se_sys_sendto net/socket.c:1805 [inline]
+ __x64_sys_sendto+0xe1/0x1a0 net/socket.c:1805
+ do_syscall_64+0x1b9/0x820 arch/x86/entry/common.c:290
+ entry_SYSCALL_64_after_hwframe+0x49/0xbe
+RIP: 0033:0x455ab9
+Code: 1d ba fb ff c3 66 2e 0f 1f 84 00 00 00 00 00 66 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0f 83 eb b9 fb ff c3 66 2e 0f 1f 84 00 00 00 00
+RSP: 002b:00007fcae64d5c68 EFLAGS: 00000246 ORIG_RAX: 000000000000002c
+RAX: ffffffffffffffda RBX: 00007fcae64d66d4 RCX: 0000000000455ab9
+RDX: 0000000000000001 RSI: 0000000020000200 RDI: 0000000000000013
+RBP: 000000000072bea0 R08: 0000000000000000 R09: 0000000000000000
+R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000014
+R13: 00000000004c1145 R14: 00000000004d1818 R15: 0000000000000006
+Modules linked in:
+Dumping ftrace buffer:
+ (ftrace buffer empty)
+
+Fixes: ddff00d42043 ("net: Move skb_has_shared_frag check out of GRE code and into segmentation")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Alexander Duyck <alexander.h.duyck@intel.com>
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Acked-by: Alexander Duyck <alexander.h.duyck@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/skbuff.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -3253,6 +3253,7 @@ normal:
+ net_warn_ratelimited(
+ "skb_segment: too many frags: %u %u\n",
+ pos, mss);
++ err = -EINVAL;
+ goto err;
+ }
+
+@@ -3289,11 +3290,10 @@ skip_fraglist:
+
+ perform_csum_check:
+ if (!csum) {
+- if (skb_has_shared_frag(nskb)) {
+- err = __skb_linearize(nskb);
+- if (err)
+- goto err;
+- }
++ if (skb_has_shared_frag(nskb) &&
++ __skb_linearize(nskb))
++ goto err;
++
+ if (!nskb->remcsum_offload)
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum =
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Roopa Prabhu <roopa@cumulusnetworks.com>
+Date: Fri, 20 Jul 2018 13:21:01 -0700
+Subject: rtnetlink: add rtnl_link_state check in rtnl_configure_link
+
+From: Roopa Prabhu <roopa@cumulusnetworks.com>
+
+[ Upstream commit 5025f7f7d506fba9b39e7fe8ca10f6f34cb9bc2d ]
+
+rtnl_configure_link sets dev->rtnl_link_state to
+RTNL_LINK_INITIALIZED and unconditionally calls
+__dev_notify_flags to notify user-space of dev flags.
+
+Current call sequence for rtnl_configure_link:
+rtnetlink_newlink
+ rtnl_link_ops->newlink
+ rtnl_configure_link (unconditionally notifies userspace of
+ default and new dev flags)
+
+If a newlink handler wants to call rtnl_configure_link
+early, we will end up with duplicate notifications to
+user-space.
+
+This patch fixes rtnl_configure_link to check rtnl_link_state
+and call __dev_notify_flags with gchanges = 0 if already
+RTNL_LINK_INITIALIZED.
+
+Later in the series, this patch will help the following sequence
+where a driver implementing newlink can call rtnl_configure_link
+to initialize the link early.
+
+This makes the following call sequence work:
+rtnetlink_newlink
+ rtnl_link_ops->newlink (vxlan) -> rtnl_configure_link (initializes
+ link and notifies
+ user-space of default
+ dev flags)
+ rtnl_configure_link (updates dev flags if requested by user ifm
+ and notifies user-space of new dev flags)
+
+Signed-off-by: Roopa Prabhu <roopa@cumulusnetworks.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/core/rtnetlink.c | 9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+--- a/net/core/rtnetlink.c
++++ b/net/core/rtnetlink.c
+@@ -2339,9 +2339,12 @@ int rtnl_configure_link(struct net_devic
+ return err;
+ }
+
+- dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
+-
+- __dev_notify_flags(dev, old_flags, ~0U);
++ if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) {
++ __dev_notify_flags(dev, old_flags, 0U);
++ } else {
++ dev->rtnl_link_state = RTNL_LINK_INITIALIZED;
++ __dev_notify_flags(dev, old_flags, ~0U);
++ }
+ return 0;
+ }
+ EXPORT_SYMBOL(rtnl_configure_link);
mips-ath79-fix-register-address-in-ath79_ddr_wb_flush.patch
mips-fix-off-by-one-in-pci_resource_to_user.patch
+ip-hash-fragments-consistently.patch
+ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch
+net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch
+net-skb_segment-should-not-return-null.patch
+net-mlx5-adjust-clock-overflow-work-period.patch
+net-mlx5e-don-t-allow-arfs-for-encapsulated-packets.patch
+net-mlx5e-fix-quota-counting-in-arfs-expire-flow.patch
+multicast-do-not-restore-deleted-record-source-filter-mode-to-new-one.patch
+net-phy-consider-phy_ignore_interrupt-in-phy_start_aneg_priv.patch
+rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch
+tcp-fix-dctcp-delayed-ack-schedule.patch
+tcp-helpers-to-send-special-dctcp-ack.patch
+tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch
+tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch
+tcp-free-batches-of-packets-in-tcp_prune_ofo_queue.patch
+tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch
+tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch
+tcp-call-tcp_drop-from-tcp_data_queue_ofo.patch
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 23 Jul 2018 09:28:18 -0700
+Subject: tcp: avoid collapses in tcp_prune_queue() if possible
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit f4a3313d8e2ca9fd8d8f45e40a2903ba782607e7 ]
+
+Right after a TCP flow is created, receiving tiny out-of-order
+packets always hits the condition:
+
+if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ tcp_clamp_window(sk);
+
+tcp_clamp_window() increases sk_rcvbuf to match sk_rmem_alloc
+(guarded by tcp_rmem[2])
+
+Calling tcp_collapse_ofo_queue() in this case is not useful,
+and offers an O(N^2) attack surface to malicious peers.
+
+Better not to attempt anything before full queue capacity is reached,
+forcing the attacker to spend lots of resources and allowing us to
+detect the abuse more easily.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Acked-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_input.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -5027,6 +5027,9 @@ static int tcp_prune_queue(struct sock *
+ else if (tcp_under_memory_pressure(sk))
+ tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss);
+
++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf)
++ return 0;
++
+ tcp_collapse_ofo_queue(sk);
+ if (!skb_queue_empty(&sk->sk_receive_queue))
+ tcp_collapse(sk, &sk->sk_receive_queue, NULL,
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 23 Jul 2018 09:28:20 -0700
+Subject: tcp: call tcp_drop() from tcp_data_queue_ofo()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 8541b21e781a22dce52a74fef0b9bed00404a1cd ]
+
+In order to be able to give better diagnostics and detect
+malicious traffic, we need to have better sk->sk_drops tracking.
+
+Fixes: 9f5afeae5152 ("tcp: use an RB tree for ooo receive queue")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Acked-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_input.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4517,7 +4517,7 @@ coalesce_done:
+ /* All the bits are present. Drop. */
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPOFOMERGE);
+- __kfree_skb(skb);
++ tcp_drop(sk, skb);
+ skb = NULL;
+ tcp_dsack_set(sk, seq, end_seq);
+ goto add_sack;
+@@ -4536,7 +4536,7 @@ coalesce_done:
+ TCP_SKB_CB(skb1)->end_seq);
+ NET_INC_STATS(sock_net(sk),
+ LINUX_MIB_TCPOFOMERGE);
+- __kfree_skb(skb1);
++ tcp_drop(sk, skb1);
+ goto merge_right;
+ }
+ } else if (tcp_try_coalesce(sk, skb1, skb, &fragstolen)) {
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 23 Jul 2018 09:28:19 -0700
+Subject: tcp: detect malicious patterns in tcp_collapse_ofo_queue()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 3d4bf93ac12003f9b8e1e2de37fe27983deebdcf ]
+
+In case an attacker feeds tiny packets completely out of order,
+tcp_collapse_ofo_queue() might scan the whole rb-tree, performing
+expensive copies, but not changing socket memory usage at all.
+
+1) Do not attempt to collapse tiny skbs.
+2) Add logic to exit early when too many tiny skbs are detected.
+
+We prefer not doing aggressive collapsing (which copies packets)
+for pathological flows, and revert to tcp_prune_ofo_queue() which
+will be less expensive.
+
+In the future, we might add the possibility of terminating flows
+that are proven to be malicious.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_input.c | 15 +++++++++++++--
+ 1 file changed, 13 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4918,6 +4918,7 @@ end:
+ static void tcp_collapse_ofo_queue(struct sock *sk)
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
++ u32 range_truesize, sum_tiny = 0;
+ struct sk_buff *skb, *head;
+ struct rb_node *p;
+ u32 start, end;
+@@ -4936,6 +4937,7 @@ new_range:
+ }
+ start = TCP_SKB_CB(skb)->seq;
+ end = TCP_SKB_CB(skb)->end_seq;
++ range_truesize = skb->truesize;
+
+ for (head = skb;;) {
+ skb = tcp_skb_next(skb, NULL);
+@@ -4946,11 +4948,20 @@ new_range:
+ if (!skb ||
+ after(TCP_SKB_CB(skb)->seq, end) ||
+ before(TCP_SKB_CB(skb)->end_seq, start)) {
+- tcp_collapse(sk, NULL, &tp->out_of_order_queue,
+- head, skb, start, end);
++ /* Do not attempt collapsing tiny skbs */
++ if (range_truesize != head->truesize ||
++ end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) {
++ tcp_collapse(sk, NULL, &tp->out_of_order_queue,
++ head, skb, start, end);
++ } else {
++ sum_tiny += range_truesize;
++ if (sum_tiny > sk->sk_rcvbuf >> 3)
++ return;
++ }
+ goto new_range;
+ }
+
++ range_truesize += skb->truesize;
+ if (unlikely(before(TCP_SKB_CB(skb)->seq, start)))
+ start = TCP_SKB_CB(skb)->seq;
+ if (after(TCP_SKB_CB(skb)->end_seq, end))
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Yuchung Cheng <ycheng@google.com>
+Date: Wed, 18 Jul 2018 13:56:35 -0700
+Subject: tcp: do not cancel delay-AcK on DCTCP special ACK
+
+From: Yuchung Cheng <ycheng@google.com>
+
+[ Upstream commit 27cde44a259c380a3c09066fc4b42de7dde9b1ad ]
+
+Currently, when a DCTCP receiver delays an ACK and receives a
+data packet with a different CE mark from the previous one's, it
+sends two immediate ACKs acking the previous and latest sequences
+respectively (for ECN accounting).
+
+Previously, sending the first ACK could clear the delayed ACK timer
+(tcp_event_ack_sent). This may subsequently prevent sending the
+second ACK to acknowledge the latest sequence (tcp_ack_snd_check).
+The culprit is that tcp_send_ack() assumes it always acknowledges
+the latest sequence, which is not true for the first special ACK.
+
+The fix is to not make that assumption in tcp_send_ack and to check the
+actual ack sequence before cancelling the delayed ACK. Further, it is
+safer to pass the ack sequence number into the tcp_send_ack routine as a
+parameter, instead of intercepting tp->rcv_nxt, to avoid future bugs
+like this.
+
+Reported-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/tcp.h | 1 +
+ net/ipv4/tcp_dctcp.c | 34 ++++------------------------------
+ net/ipv4/tcp_output.c | 11 ++++++++---
+ 3 files changed, 13 insertions(+), 33 deletions(-)
+
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -553,6 +553,7 @@ void tcp_send_fin(struct sock *sk);
+ void tcp_send_active_reset(struct sock *sk, gfp_t priority);
+ int tcp_send_synack(struct sock *);
+ void tcp_push_one(struct sock *, unsigned int mss_now);
++void __tcp_send_ack(struct sock *sk, u32 rcv_nxt);
+ void tcp_send_ack(struct sock *sk);
+ void tcp_send_delayed_ack(struct sock *sk);
+ void tcp_send_loss_probe(struct sock *sk);
+--- a/net/ipv4/tcp_dctcp.c
++++ b/net/ipv4/tcp_dctcp.c
+@@ -135,21 +135,8 @@ static void dctcp_ce_state_0_to_1(struct
+ * ACK has not sent yet.
+ */
+ if (!ca->ce_state &&
+- inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
+- u32 tmp_rcv_nxt;
+-
+- /* Save current rcv_nxt. */
+- tmp_rcv_nxt = tp->rcv_nxt;
+-
+- /* Generate previous ack with CE=0. */
+- tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR;
+- tp->rcv_nxt = ca->prior_rcv_nxt;
+-
+- tcp_send_ack(sk);
+-
+- /* Recover current rcv_nxt. */
+- tp->rcv_nxt = tmp_rcv_nxt;
+- }
++ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
++ __tcp_send_ack(sk, ca->prior_rcv_nxt);
+
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+ ca->ce_state = 1;
+@@ -166,21 +153,8 @@ static void dctcp_ce_state_1_to_0(struct
+ * ACK has not sent yet.
+ */
+ if (ca->ce_state &&
+- inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
+- u32 tmp_rcv_nxt;
+-
+- /* Save current rcv_nxt. */
+- tmp_rcv_nxt = tp->rcv_nxt;
+-
+- /* Generate previous ack with CE=1. */
+- tp->ecn_flags |= TCP_ECN_DEMAND_CWR;
+- tp->rcv_nxt = ca->prior_rcv_nxt;
+-
+- tcp_send_ack(sk);
+-
+- /* Recover current rcv_nxt. */
+- tp->rcv_nxt = tmp_rcv_nxt;
+- }
++ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
++ __tcp_send_ack(sk, ca->prior_rcv_nxt);
+
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+ ca->ce_state = 0;
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -174,8 +174,13 @@ static void tcp_event_data_sent(struct t
+ }
+
+ /* Account for an ACK we sent. */
+-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts)
++static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts,
++ u32 rcv_nxt)
+ {
++ struct tcp_sock *tp = tcp_sk(sk);
++
++ if (unlikely(rcv_nxt != tp->rcv_nxt))
++ return; /* Special ACK sent by DCTCP to reflect ECN */
+ tcp_dec_quickack_mode(sk, pkts);
+ inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK);
+ }
+@@ -1010,7 +1015,7 @@ static int __tcp_transmit_skb(struct soc
+ icsk->icsk_af_ops->send_check(sk, skb);
+
+ if (likely(tcb->tcp_flags & TCPHDR_ACK))
+- tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
++ tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);
+
+ if (skb->len != tcp_header_size) {
+ tcp_event_data_sent(tp, sk);
+@@ -3529,12 +3534,12 @@ void __tcp_send_ack(struct sock *sk, u32
+ skb_mstamp_get(&buff->skb_mstamp);
+ __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
+ }
++EXPORT_SYMBOL_GPL(__tcp_send_ack);
+
+ void tcp_send_ack(struct sock *sk)
+ {
+ __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
+ }
+-EXPORT_SYMBOL_GPL(tcp_send_ack);
+
+ /* This routine sends a packet with an out of date sequence
+ * number. It assumes the other end will try to ack it.
--- /dev/null
+From foo@baz Fri Jul 27 08:53:18 CEST 2018
+From: Yuchung Cheng <ycheng@google.com>
+Date: Wed, 18 Jul 2018 13:56:36 -0700
+Subject: tcp: do not delay ACK in DCTCP upon CE status change
+
+From: Yuchung Cheng <ycheng@google.com>
+
+[ Upstream commit a0496ef2c23b3b180902dd185d0d63ccbc624cf8 ]
+
+Per DCTCP RFC8257 (Section 3.2) the ACK reflecting the CE status change
+has to be sent immediately so the sender can respond quickly:
+
+""" When receiving packets, the CE codepoint MUST be processed as follows:
+
+ 1. If the CE codepoint is set and DCTCP.CE is false, set DCTCP.CE to
+ true and send an immediate ACK.
+
+ 2. If the CE codepoint is not set and DCTCP.CE is true, set DCTCP.CE
+ to false and send an immediate ACK.
+"""
+
+Previously the DCTCP implementation might continue to delay the ACK. This
+patch fixes that to implement the RFC by forcing an immediate ACK.
+
+Tested with this packetdrill script provided by Larry Brakmo
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < [ect0] SEW 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.100 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+0.110 < [ect0] . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+ +0 setsockopt(4, SOL_SOCKET, SO_DEBUG, [1], 4) = 0
+
+0.200 < [ect0] . 1:1001(1000) ack 1 win 257
+0.200 > [ect01] . 1:1(0) ack 1001
+
+0.200 write(4, ..., 1) = 1
+0.200 > [ect01] P. 1:2(1) ack 1001
+
+0.200 < [ect0] . 1001:2001(1000) ack 2 win 257
++0.005 < [ce] . 2001:3001(1000) ack 2 win 257
+
++0.000 > [ect01] . 2:2(0) ack 2001
+// Previously the ACK below would be delayed by 40ms
++0.000 > [ect01] E. 2:2(0) ack 3001
+
++0.500 < F. 9501:9501(0) ack 4 win 257
+
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/tcp.h | 1 +
+ net/ipv4/tcp_dctcp.c | 30 ++++++++++++++++++------------
+ net/ipv4/tcp_input.c | 3 ++-
+ 3 files changed, 21 insertions(+), 13 deletions(-)
+
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -363,6 +363,7 @@ ssize_t tcp_splice_read(struct socket *s
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags);
+
++void tcp_enter_quickack_mode(struct sock *sk);
+ static inline void tcp_dec_quickack_mode(struct sock *sk,
+ const unsigned int pkts)
+ {
+--- a/net/ipv4/tcp_dctcp.c
++++ b/net/ipv4/tcp_dctcp.c
+@@ -131,12 +131,15 @@ static void dctcp_ce_state_0_to_1(struct
+ struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+- /* State has changed from CE=0 to CE=1 and delayed
+- * ACK has not sent yet.
+- */
+- if (!ca->ce_state &&
+- inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+- __tcp_send_ack(sk, ca->prior_rcv_nxt);
++ if (!ca->ce_state) {
++ /* State has changed from CE=0 to CE=1, force an immediate
++ * ACK to reflect the new CE state. If an ACK was delayed,
++ * send that first to reflect the prior CE state.
++ */
++ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
++ __tcp_send_ack(sk, ca->prior_rcv_nxt);
++ tcp_enter_quickack_mode(sk);
++ }
+
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+ ca->ce_state = 1;
+@@ -149,12 +152,15 @@ static void dctcp_ce_state_1_to_0(struct
+ struct dctcp *ca = inet_csk_ca(sk);
+ struct tcp_sock *tp = tcp_sk(sk);
+
+- /* State has changed from CE=1 to CE=0 and delayed
+- * ACK has not sent yet.
+- */
+- if (ca->ce_state &&
+- inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
+- __tcp_send_ack(sk, ca->prior_rcv_nxt);
++ if (ca->ce_state) {
++ /* State has changed from CE=1 to CE=0, force an immediate
++ * ACK to reflect the new CE state. If an ACK was delayed,
++ * send that first to reflect the prior CE state.
++ */
++ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER)
++ __tcp_send_ack(sk, ca->prior_rcv_nxt);
++ tcp_enter_quickack_mode(sk);
++ }
+
+ ca->prior_rcv_nxt = tp->rcv_nxt;
+ ca->ce_state = 0;
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -209,13 +209,14 @@ static void tcp_incr_quickack(struct soc
+ icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS);
+ }
+
+-static void tcp_enter_quickack_mode(struct sock *sk)
++void tcp_enter_quickack_mode(struct sock *sk)
+ {
+ struct inet_connection_sock *icsk = inet_csk(sk);
+ tcp_incr_quickack(sk);
+ icsk->icsk_ack.pingpong = 0;
+ icsk->icsk_ack.ato = TCP_ATO_MIN;
+ }
++EXPORT_SYMBOL(tcp_enter_quickack_mode);
+
+ /* Send ACKs quickly, if "quick" count is not exhausted
+ * and the session is not interactive.
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Yuchung Cheng <ycheng@google.com>
+Date: Thu, 12 Jul 2018 06:04:52 -0700
+Subject: tcp: fix dctcp delayed ACK schedule
+
+From: Yuchung Cheng <ycheng@google.com>
+
+[ Upstream commit b0c05d0e99d98d7f0cd41efc1eeec94efdc3325d ]
+
+Previously, when a data segment was sent, an ACK was piggybacked
+on the data segment without generating a CA_EVENT_NON_DELAYED_ACK
+event to notify congestion control modules. So the DCTCP
+ca->delayed_ack_reserved flag could incorrectly stay set when
+in fact there were no delayed ACKs being reserved. This could result
+in sending a special ECN notification ACK that carries an older
+ACK sequence, when in fact there was no need for such an ACK.
+DCTCP keeps track of the delayed ACK status with its own separate
+state ca->delayed_ack_reserved. Previously it could accidentally cancel
+the delayed ACK without updating this field upon sending a special
+ACK that carries an older ACK sequence. This inconsistency would
+lead to the DCTCP receiver never acknowledging the latest data until
+the sender times out and retries in some cases.
+
+Packetdrill script (provided by Larry Brakmo)
+
+0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3
+0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0
+0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0
+0.000 bind(3, ..., ...) = 0
+0.000 listen(3, 1) = 0
+
+0.100 < [ect0] SEW 0:0(0) win 32792 <mss 1000,sackOK,nop,nop,nop,wscale 7>
+0.100 > SE. 0:0(0) ack 1 <mss 1460,nop,nop,sackOK,nop,wscale 8>
+0.110 < [ect0] . 1:1(0) ack 1 win 257
+0.200 accept(3, ..., ...) = 4
+
+0.200 < [ect0] . 1:1001(1000) ack 1 win 257
+0.200 > [ect01] . 1:1(0) ack 1001
+
+0.200 write(4, ..., 1) = 1
+0.200 > [ect01] P. 1:2(1) ack 1001
+
+0.200 < [ect0] . 1001:2001(1000) ack 2 win 257
+0.200 write(4, ..., 1) = 1
+0.200 > [ect01] P. 2:3(1) ack 2001
+
+0.200 < [ect0] . 2001:3001(1000) ack 3 win 257
+0.200 < [ect0] . 3001:4001(1000) ack 3 win 257
+0.200 > [ect01] . 3:3(0) ack 4001
+
+0.210 < [ce] P. 4001:4501(500) ack 3 win 257
+
++0.001 read(4, ..., 4500) = 4500
++0 write(4, ..., 1) = 1
++0 > [ect01] PE. 3:4(1) ack 4501
+
++0.010 < [ect0] W. 4501:5501(1000) ack 4 win 257
+// Previously the ACK sequence below would be 4501, causing a long RTO
++0.040~+0.045 > [ect01] . 4:4(0) ack 5501 // delayed ack
+
++0.311 < [ect0] . 5501:6501(1000) ack 4 win 257 // More data
++0 > [ect01] . 4:4(0) ack 6501 // now acks everything
+
++0.500 < F. 9501:9501(0) ack 4 win 257
+
+Reported-by: Larry Brakmo <brakmo@fb.com>
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Acked-by: Lawrence Brakmo <brakmo@fb.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_dctcp.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/net/ipv4/tcp_dctcp.c
++++ b/net/ipv4/tcp_dctcp.c
+@@ -134,7 +134,8 @@ static void dctcp_ce_state_0_to_1(struct
+ /* State has changed from CE=0 to CE=1 and delayed
+ * ACK has not sent yet.
+ */
+- if (!ca->ce_state && ca->delayed_ack_reserved) {
++ if (!ca->ce_state &&
++ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
+ u32 tmp_rcv_nxt;
+
+ /* Save current rcv_nxt. */
+@@ -164,7 +165,8 @@ static void dctcp_ce_state_1_to_0(struct
+ /* State has changed from CE=1 to CE=0 and delayed
+ * ACK has not sent yet.
+ */
+- if (ca->ce_state && ca->delayed_ack_reserved) {
++ if (ca->ce_state &&
++ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) {
+ u32 tmp_rcv_nxt;
+
+ /* Save current rcv_nxt. */
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Eric Dumazet <edumazet@google.com>
+Date: Mon, 23 Jul 2018 09:28:17 -0700
+Subject: tcp: free batches of packets in tcp_prune_ofo_queue()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 72cd43ba64fc172a443410ce01645895850844c8 ]
+
+Juha-Matti Tilli reported that malicious peers could inject tiny
+packets in out_of_order_queue, forcing very expensive calls
+to tcp_collapse_ofo_queue() and tcp_prune_ofo_queue() for
+every incoming packet. The out_of_order_queue rb-tree can contain
+thousands of nodes; iterating over all of them is not nice.
+
+Before linux-4.9, we would have pruned all packets in ofo_queue
+in one go, every XXXX packets. XXXX depends on sk_rcvbuf and skbs
+truesize, but is about 7000 packets with tcp_rmem[2] default of 6 MB.
+
+Since we plan to increase tcp_rmem[2] in the future to cope with
+modern BDPs, we cannot revert to the old behavior without great pain.
+
+The strategy taken in this patch is to purge ~12.5 % of the queue capacity.
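+
+As a rough illustration (using the 6 MB tcp_rmem[2] default mentioned above):
+
+  goal = sk->sk_rcvbuf >> 3;   /* ~768 KB of truesize freed per pruning pass */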
+
+Fixes: 36a6503fedda ("tcp: refine tcp_prune_ofo_queue() to not drop all packets")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: Juha-Matti Tilli <juha-matti.tilli@iki.fi>
+Acked-by: Yuchung Cheng <ycheng@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/skbuff.h | 2 ++
+ net/ipv4/tcp_input.c | 15 +++++++++++----
+ 2 files changed, 13 insertions(+), 4 deletions(-)
+
+--- a/include/linux/skbuff.h
++++ b/include/linux/skbuff.h
+@@ -2982,6 +2982,8 @@ static inline int __skb_grow_rcsum(struc
+ return __skb_grow(skb, len);
+ }
+
++#define rb_to_skb(rb) rb_entry_safe(rb, struct sk_buff, rbnode)
++
+ #define skb_queue_walk(queue, skb) \
+ for (skb = (queue)->next; \
+ skb != (struct sk_buff *)(queue); \
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -4965,6 +4965,7 @@ new_range:
+ * 2) not add too big latencies if thousands of packets sit there.
+ * (But if application shrinks SO_RCVBUF, we could still end up
+ * freeing whole queue here)
++ * 3) Drop at least 12.5 % of sk_rcvbuf to avoid malicious attacks.
+ *
+ * Return true if queue has shrunk.
+ */
+@@ -4972,20 +4973,26 @@ static bool tcp_prune_ofo_queue(struct s
+ {
+ struct tcp_sock *tp = tcp_sk(sk);
+ struct rb_node *node, *prev;
++ int goal;
+
+ if (RB_EMPTY_ROOT(&tp->out_of_order_queue))
+ return false;
+
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_OFOPRUNED);
++ goal = sk->sk_rcvbuf >> 3;
+ node = &tp->ooo_last_skb->rbnode;
+ do {
+ prev = rb_prev(node);
+ rb_erase(node, &tp->out_of_order_queue);
++ goal -= rb_to_skb(node)->truesize;
+ tcp_drop(sk, rb_entry(node, struct sk_buff, rbnode));
+- sk_mem_reclaim(sk);
+- if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
+- !tcp_under_memory_pressure(sk))
+- break;
++ if (!prev || goal <= 0) {
++ sk_mem_reclaim(sk);
++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf &&
++ !tcp_under_memory_pressure(sk))
++ break;
++ goal = sk->sk_rcvbuf >> 3;
++ }
+ node = prev;
+ } while (node);
+ tp->ooo_last_skb = rb_entry(prev, struct sk_buff, rbnode);
--- /dev/null
+From foo@baz Fri Jul 27 08:45:05 CEST 2018
+From: Yuchung Cheng <ycheng@google.com>
+Date: Wed, 18 Jul 2018 13:56:34 -0700
+Subject: tcp: helpers to send special DCTCP ack
+
+From: Yuchung Cheng <ycheng@google.com>
+
+[ Upstream commit 2987babb6982306509380fc11b450227a844493b ]
+
+Refactor and create helpers to send the special ACK in DCTCP.
+
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/ipv4/tcp_output.c | 22 +++++++++++++++++-----
+ 1 file changed, 17 insertions(+), 5 deletions(-)
+
+--- a/net/ipv4/tcp_output.c
++++ b/net/ipv4/tcp_output.c
+@@ -905,8 +905,8 @@ out:
+ * We are working here with either a clone of the original
+ * SKB, or a fresh unique copy made by the retransmit engine.
+ */
+-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
+- gfp_t gfp_mask)
++static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
++ int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
+ {
+ const struct inet_connection_sock *icsk = inet_csk(sk);
+ struct inet_sock *inet;
+@@ -969,7 +969,7 @@ static int tcp_transmit_skb(struct sock
+ th->source = inet->inet_sport;
+ th->dest = inet->inet_dport;
+ th->seq = htonl(tcb->seq);
+- th->ack_seq = htonl(tp->rcv_nxt);
++ th->ack_seq = htonl(rcv_nxt);
+ *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) |
+ tcb->tcp_flags);
+
+@@ -1046,6 +1046,13 @@ static int tcp_transmit_skb(struct sock
+ return err;
+ }
+
++static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
++ gfp_t gfp_mask)
++{
++ return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask,
++ tcp_sk(sk)->rcv_nxt);
++}
++
+ /* This routine just queues the buffer for sending.
+ *
+ * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames,
+@@ -3482,7 +3489,7 @@ void tcp_send_delayed_ack(struct sock *s
+ }
+
+ /* This routine sends an ack and also updates the window. */
+-void tcp_send_ack(struct sock *sk)
++void __tcp_send_ack(struct sock *sk, u32 rcv_nxt)
+ {
+ struct sk_buff *buff;
+
+@@ -3520,7 +3527,12 @@ void tcp_send_ack(struct sock *sk)
+
+ /* Send it off, this clears delayed acks for us. */
+ skb_mstamp_get(&buff->skb_mstamp);
+- tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
++ __tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0, rcv_nxt);
++}
++
++void tcp_send_ack(struct sock *sk)
++{
++ __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt);
+ }
+ EXPORT_SYMBOL_GPL(tcp_send_ack);
+