From 11a68d5845b0de64420a772e47ab1047f646a8aa Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 27 Jul 2018 10:24:03 +0200 Subject: [PATCH] 4.4-stable patches added patches: ip-hash-fragments-consistently.patch ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch tcp-fix-dctcp-delayed-ack-schedule.patch tcp-helpers-to-send-special-dctcp-ack.patch --- .../ip-hash-fragments-consistently.patch | 73 +++++++++ ...p-v6-_origdstaddr-call-pskb_may_pull.patch | 93 ++++++++++++ ...e-input-modifier-in-rst2init-wrapper.patch | 40 +++++ ...k_state-check-in-rtnl_configure_link.patch | 65 +++++++++ queue-4.4/series | 10 ++ ...apses-in-tcp_prune_queue-if-possible.patch | 46 ++++++ ...s-patterns-in-tcp_collapse_ofo_queue.patch | 69 +++++++++ ...ancel-delay-ack-on-dctcp-special-ack.patch | 138 ++++++++++++++++++ ...y-ack-in-dctcp-upon-ce-status-change.patch | 138 ++++++++++++++++++ .../tcp-fix-dctcp-delayed-ack-schedule.patch | 98 +++++++++++++ ...cp-helpers-to-send-special-dctcp-ack.patch | 79 ++++++++++ 11 files changed, 849 insertions(+) create mode 100644 queue-4.4/ip-hash-fragments-consistently.patch create mode 100644 queue-4.4/ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch create mode 100644 queue-4.4/net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch create mode 100644 queue-4.4/rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch create mode 100644 queue-4.4/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch create mode 100644 queue-4.4/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch create mode 100644 
queue-4.4/tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch create mode 100644 queue-4.4/tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch create mode 100644 queue-4.4/tcp-fix-dctcp-delayed-ack-schedule.patch create mode 100644 queue-4.4/tcp-helpers-to-send-special-dctcp-ack.patch diff --git a/queue-4.4/ip-hash-fragments-consistently.patch b/queue-4.4/ip-hash-fragments-consistently.patch new file mode 100644 index 00000000000..df19d623782 --- /dev/null +++ b/queue-4.4/ip-hash-fragments-consistently.patch @@ -0,0 +1,73 @@ +From foo@baz Fri Jul 27 09:17:52 CEST 2018 +From: Paolo Abeni +Date: Mon, 23 Jul 2018 16:50:48 +0200 +Subject: ip: hash fragments consistently + +From: Paolo Abeni + +[ Upstream commit 3dd1c9a1270736029ffca670e9bd0265f4120600 ] + +The skb hash for locally generated ip[v6] fragments belonging +to the same datagram can vary in several circumstances: +* for connected UDP[v6] sockets, the first fragment get its hash + via set_owner_w()/skb_set_hash_from_sk() +* for unconnected IPv6 UDPv6 sockets, the first fragment can get + its hash via ip6_make_flowlabel()/skb_get_hash_flowi6(), if + auto_flowlabel is enabled + +For the following frags the hash is usually computed via +skb_get_hash(). +The above can cause OoO for unconnected IPv6 UDPv6 socket: in that +scenario the egress tx queue can be selected on a per packet basis +via the skb hash. +It may also fool flow-oriented schedulers to place fragments belonging +to the same datagram in different flows. + +Fix the issue by copying the skb hash from the head frag into +the others at fragmentation time. 
+ +Before this commit: +perf probe -a "dev_queue_xmit skb skb->hash skb->l4_hash:b1@0/8 skb->sw_hash:b1@1/8" +netperf -H $IPV4 -t UDP_STREAM -l 5 -- -m 2000 -n & +perf record -e probe:dev_queue_xmit -e probe:skb_set_owner_w -a sleep 0.1 +perf script +probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=3713014309 l4_hash=1 sw_hash=0 +probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=0 l4_hash=0 sw_hash=0 + +After this commit: +probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=2171763177 l4_hash=1 sw_hash=0 +probe:dev_queue_xmit: (ffffffff8c6b1b20) hash=2171763177 l4_hash=1 sw_hash=0 + +Fixes: b73c3d0e4f0e ("net: Save TX flow hash in sock and set in skbuf on xmit") +Fixes: 67800f9b1f4e ("ipv6: Call skb_get_hash_flowi6 to get skb->hash in ip6_make_flowlabel") +Signed-off-by: Paolo Abeni +Reviewed-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ip_output.c | 2 ++ + net/ipv6/ip6_output.c | 2 ++ + 2 files changed, 4 insertions(+) + +--- a/net/ipv4/ip_output.c ++++ b/net/ipv4/ip_output.c +@@ -480,6 +480,8 @@ static void ip_copy_metadata(struct sk_b + to->dev = from->dev; + to->mark = from->mark; + ++ skb_copy_hash(to, from); ++ + /* Copy the flags to each fragment. 
*/ + IPCB(to)->flags = IPCB(from)->flags; + +--- a/net/ipv6/ip6_output.c ++++ b/net/ipv6/ip6_output.c +@@ -559,6 +559,8 @@ static void ip6_copy_metadata(struct sk_ + to->dev = from->dev; + to->mark = from->mark; + ++ skb_copy_hash(to, from); ++ + #ifdef CONFIG_NET_SCHED + to->tc_index = from->tc_index; + #endif diff --git a/queue-4.4/ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch b/queue-4.4/ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch new file mode 100644 index 00000000000..aa36759c73b --- /dev/null +++ b/queue-4.4/ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch @@ -0,0 +1,93 @@ +From foo@baz Fri Jul 27 09:17:52 CEST 2018 +From: Willem de Bruijn +Date: Mon, 23 Jul 2018 19:36:48 -0400 +Subject: ip: in cmsg IP(V6)_ORIGDSTADDR call pskb_may_pull + +From: Willem de Bruijn + +[ Upstream commit 2efd4fca703a6707cad16ab486eaab8fc7f0fd49 ] + +Syzbot reported a read beyond the end of the skb head when returning +IPV6_ORIGDSTADDR: + + BUG: KMSAN: kernel-infoleak in put_cmsg+0x5ef/0x860 net/core/scm.c:242 + CPU: 0 PID: 4501 Comm: syz-executor128 Not tainted 4.17.0+ #9 + Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS + Google 01/01/2011 + Call Trace: + __dump_stack lib/dump_stack.c:77 [inline] + dump_stack+0x185/0x1d0 lib/dump_stack.c:113 + kmsan_report+0x188/0x2a0 mm/kmsan/kmsan.c:1125 + kmsan_internal_check_memory+0x138/0x1f0 mm/kmsan/kmsan.c:1219 + kmsan_copy_to_user+0x7a/0x160 mm/kmsan/kmsan.c:1261 + copy_to_user include/linux/uaccess.h:184 [inline] + put_cmsg+0x5ef/0x860 net/core/scm.c:242 + ip6_datagram_recv_specific_ctl+0x1cf3/0x1eb0 net/ipv6/datagram.c:719 + ip6_datagram_recv_ctl+0x41c/0x450 net/ipv6/datagram.c:733 + rawv6_recvmsg+0x10fb/0x1460 net/ipv6/raw.c:521 + [..] + +This logic and its ipv4 counterpart read the destination port from +the packet at skb_transport_offset(skb) + 4. 
+ +With MSG_MORE and a local SOCK_RAW sender, syzbot was able to cook a +packet that stores headers exactly up to skb_transport_offset(skb) in +the head and the remainder in a frag. + +Call pskb_may_pull before accessing the pointer to ensure that it lies +in skb head. + +Link: http://lkml.kernel.org/r/CAF=yD-LEJwZj5a1-bAAj2Oy_hKmGygV6rsJ_WOrAYnv-fnayiQ@mail.gmail.com +Reported-by: syzbot+9adb4b567003cac781f0@syzkaller.appspotmail.com +Signed-off-by: Willem de Bruijn +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/ip_sockglue.c | 7 +++++-- + net/ipv6/datagram.c | 7 +++++-- + 2 files changed, 10 insertions(+), 4 deletions(-) + +--- a/net/ipv4/ip_sockglue.c ++++ b/net/ipv4/ip_sockglue.c +@@ -135,15 +135,18 @@ static void ip_cmsg_recv_dstaddr(struct + { + struct sockaddr_in sin; + const struct iphdr *iph = ip_hdr(skb); +- __be16 *ports = (__be16 *)skb_transport_header(skb); ++ __be16 *ports; ++ int end; + +- if (skb_transport_offset(skb) + 4 > skb->len) ++ end = skb_transport_offset(skb) + 4; ++ if (end > 0 && !pskb_may_pull(skb, end)) + return; + + /* All current transport protocols have the port numbers in the + * first four bytes of the transport header and this function is + * written with this assumption in mind. + */ ++ ports = (__be16 *)skb_transport_header(skb); + + sin.sin_family = AF_INET; + sin.sin_addr.s_addr = iph->daddr; +--- a/net/ipv6/datagram.c ++++ b/net/ipv6/datagram.c +@@ -657,13 +657,16 @@ void ip6_datagram_recv_specific_ctl(stru + } + if (np->rxopt.bits.rxorigdstaddr) { + struct sockaddr_in6 sin6; +- __be16 *ports = (__be16 *) skb_transport_header(skb); ++ __be16 *ports; ++ int end; + +- if (skb_transport_offset(skb) + 4 <= skb->len) { ++ end = skb_transport_offset(skb) + 4; ++ if (end <= 0 || pskb_may_pull(skb, end)) { + /* All current transport protocols have the port numbers in the + * first four bytes of the transport header and this function is + * written with this assumption in mind. 
+ */ ++ ports = (__be16 *)skb_transport_header(skb); + + sin6.sin6_family = AF_INET6; + sin6.sin6_addr = ipv6_hdr(skb)->daddr; diff --git a/queue-4.4/net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch b/queue-4.4/net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch new file mode 100644 index 00000000000..f85c58a7b90 --- /dev/null +++ b/queue-4.4/net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch @@ -0,0 +1,40 @@ +From foo@baz Fri Jul 27 09:17:52 CEST 2018 +From: Jack Morgenstein +Date: Tue, 24 Jul 2018 14:27:55 +0300 +Subject: net/mlx4_core: Save the qpn from the input modifier in RST2INIT wrapper + +From: Jack Morgenstein + +[ Upstream commit 958c696f5a7274d9447a458ad7aa70719b29a50a ] + +Function mlx4_RST2INIT_QP_wrapper saved the qp number passed in the qp +context, rather than the one passed in the input modifier. + +However, the qp number in the qp context is not defined as a +required parameter by the FW. Therefore, drivers may choose to not +specify the qp number in the qp context for the reset-to-init transition. + +Thus, we must save the qp number passed in the command input modifier -- +which is always present. (This saved qp number is used as the input +modifier for command 2RST_QP when a slave's qp's are destroyed). + +Fixes: c82e9aa0a8bc ("mlx4_core: resource tracking for HCA resources used by guests") +Signed-off-by: Jack Morgenstein +Signed-off-by: Tariq Toukan +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/mellanox/mlx4/resource_tracker.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c ++++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c +@@ -2891,7 +2891,7 @@ int mlx4_RST2INIT_QP_wrapper(struct mlx4 + u32 srqn = qp_get_srqn(qpc) & 0xffffff; + int use_srq = (qp_get_srqn(qpc) >> 24) & 1; + struct res_srq *srq; +- int local_qpn = be32_to_cpu(qpc->local_qpn) & 0xffffff; ++ int local_qpn = vhcr->in_modifier & 0xffffff; + + err = adjust_qp_sched_queue(dev, slave, qpc, inbox); + if (err) diff --git a/queue-4.4/rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch b/queue-4.4/rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch new file mode 100644 index 00000000000..7329180d68a --- /dev/null +++ b/queue-4.4/rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch @@ -0,0 +1,65 @@ +From foo@baz Fri Jul 27 09:17:52 CEST 2018 +From: Roopa Prabhu +Date: Fri, 20 Jul 2018 13:21:01 -0700 +Subject: rtnetlink: add rtnl_link_state check in rtnl_configure_link + +From: Roopa Prabhu + +[ Upstream commit 5025f7f7d506fba9b39e7fe8ca10f6f34cb9bc2d ] + +rtnl_configure_link sets dev->rtnl_link_state to +RTNL_LINK_INITIALIZED and unconditionally calls +__dev_notify_flags to notify user-space of dev flags. + +current call sequence for rtnl_configure_link +rtnetlink_newlink + rtnl_link_ops->newlink + rtnl_configure_link (unconditionally notifies userspace of + default and new dev flags) + +If a newlink handler wants to call rtnl_configure_link +early, we will end up with duplicate notifications to +user-space. + +This patch fixes rtnl_configure_link to check rtnl_link_state +and call __dev_notify_flags with gchanges = 0 if already +RTNL_LINK_INITIALIZED. + +Later in the series, this patch will help the following sequence +where a driver implementing newlink can call rtnl_configure_link +to initialize the link early. 
+ +makes the following call sequence work: +rtnetlink_newlink + rtnl_link_ops->newlink (vxlan) -> rtnl_configure_link (initializes + link and notifies + user-space of default + dev flags) + rtnl_configure_link (updates dev flags if requested by user ifm + and notifies user-space of new dev flags) + +Signed-off-by: Roopa Prabhu +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/rtnetlink.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +--- a/net/core/rtnetlink.c ++++ b/net/core/rtnetlink.c +@@ -2087,9 +2087,12 @@ int rtnl_configure_link(struct net_devic + return err; + } + +- dev->rtnl_link_state = RTNL_LINK_INITIALIZED; +- +- __dev_notify_flags(dev, old_flags, ~0U); ++ if (dev->rtnl_link_state == RTNL_LINK_INITIALIZED) { ++ __dev_notify_flags(dev, old_flags, 0U); ++ } else { ++ dev->rtnl_link_state = RTNL_LINK_INITIALIZED; ++ __dev_notify_flags(dev, old_flags, ~0U); ++ } + return 0; + } + EXPORT_SYMBOL(rtnl_configure_link); diff --git a/queue-4.4/series b/queue-4.4/series index e4d320cdad1..8ea777392db 100644 --- a/queue-4.4/series +++ b/queue-4.4/series @@ -1 +1,11 @@ mips-ath79-fix-register-address-in-ath79_ddr_wb_flush.patch +ip-hash-fragments-consistently.patch +net-mlx4_core-save-the-qpn-from-the-input-modifier-in-rst2init-wrapper.patch +rtnetlink-add-rtnl_link_state-check-in-rtnl_configure_link.patch +tcp-fix-dctcp-delayed-ack-schedule.patch +tcp-helpers-to-send-special-dctcp-ack.patch +tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch +tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch +tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch +tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch +ip-in-cmsg-ip-v6-_origdstaddr-call-pskb_may_pull.patch diff --git a/queue-4.4/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch b/queue-4.4/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch new file mode 100644 index 00000000000..bab37cc1f76 --- /dev/null +++ 
b/queue-4.4/tcp-avoid-collapses-in-tcp_prune_queue-if-possible.patch @@ -0,0 +1,46 @@ +From foo@baz Fri Jul 27 09:17:52 CEST 2018 +From: Eric Dumazet +Date: Mon, 23 Jul 2018 09:28:18 -0700 +Subject: tcp: avoid collapses in tcp_prune_queue() if possible + +From: Eric Dumazet + +[ Upstream commit f4a3313d8e2ca9fd8d8f45e40a2903ba782607e7 ] + +Right after a TCP flow is created, receiving tiny out of order +packets always hit the condition: + +if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) + tcp_clamp_window(sk); + +tcp_clamp_window() increases sk_rcvbuf to match sk_rmem_alloc +(guarded by tcp_rmem[2]) + +Calling tcp_collapse_ofo_queue() in this case is not useful, +and offers an O(N^2) attack surface to malicious peers. + +Better not attempt anything before full queue capacity is reached, +forcing attacker to spend lots of resource and allow us to more +easily detect the abuse. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Acked-by: Yuchung Cheng +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4875,6 +4875,9 @@ static int tcp_prune_queue(struct sock * + else if (tcp_under_memory_pressure(sk)) + tp->rcv_ssthresh = min(tp->rcv_ssthresh, 4U * tp->advmss); + ++ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf) ++ return 0; ++ + tcp_collapse_ofo_queue(sk); + if (!skb_queue_empty(&sk->sk_receive_queue)) + tcp_collapse(sk, &sk->sk_receive_queue, diff --git a/queue-4.4/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch b/queue-4.4/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch new file mode 100644 index 00000000000..f669daa88ce --- /dev/null +++ b/queue-4.4/tcp-detect-malicious-patterns-in-tcp_collapse_ofo_queue.patch @@ -0,0 +1,69 @@ +From foo@baz Fri Jul 27 09:17:52 CEST 2018 +From: Eric Dumazet +Date: Mon, 23 Jul 2018 09:28:19 -0700 +Subject: tcp: detect malicious patterns in tcp_collapse_ofo_queue() + +From: Eric Dumazet + +[ Upstream commit 3d4bf93ac12003f9b8e1e2de37fe27983deebdcf ] + +In case an attacker feeds tiny packets completely out of order, +tcp_collapse_ofo_queue() might scan the whole rb-tree, performing +expensive copies, but not changing socket memory usage at all. + +1) Do not attempt to collapse tiny skbs. +2) Add logic to exit early when too many tiny skbs are detected. + +We prefer not doing aggressive collapsing (which copies packets) +for pathological flows, and revert to tcp_prune_ofo_queue() which +will be less expensive. + +In the future, we might add the possibility of terminating flows +that are proven to be malicious. + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_input.c | 15 +++++++++++++-- + 1 file changed, 13 insertions(+), 2 deletions(-) + +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4789,6 +4789,7 @@ restart: + static void tcp_collapse_ofo_queue(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); ++ u32 range_truesize, sum_tiny = 0; + struct sk_buff *skb = skb_peek(&tp->out_of_order_queue); + struct sk_buff *head; + u32 start, end; +@@ -4798,6 +4799,7 @@ static void tcp_collapse_ofo_queue(struc + + start = TCP_SKB_CB(skb)->seq; + end = TCP_SKB_CB(skb)->end_seq; ++ range_truesize = skb->truesize; + head = skb; + + for (;;) { +@@ -4812,8 +4814,17 @@ static void tcp_collapse_ofo_queue(struc + if (!skb || + after(TCP_SKB_CB(skb)->seq, end) || + before(TCP_SKB_CB(skb)->end_seq, start)) { +- tcp_collapse(sk, &tp->out_of_order_queue, +- head, skb, start, end); ++ /* Do not attempt collapsing tiny skbs */ ++ if (range_truesize != head->truesize || ++ end - start >= SKB_WITH_OVERHEAD(SK_MEM_QUANTUM)) { ++ tcp_collapse(sk, &tp->out_of_order_queue, ++ head, skb, start, end); ++ } else { ++ sum_tiny += range_truesize; ++ if (sum_tiny > sk->sk_rcvbuf >> 3) ++ return; ++ } ++ + head = skb; + if (!skb) + break; diff --git a/queue-4.4/tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch b/queue-4.4/tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch new file mode 100644 index 00000000000..cf5eb0fef8a --- /dev/null +++ b/queue-4.4/tcp-do-not-cancel-delay-ack-on-dctcp-special-ack.patch @@ -0,0 +1,138 @@ +From foo@baz Fri Jul 27 09:17:52 CEST 2018 +From: Yuchung Cheng +Date: Wed, 18 Jul 2018 13:56:35 -0700 +Subject: tcp: do not cancel delay-AcK on DCTCP special ACK + +From: Yuchung Cheng + +[ Upstream commit 27cde44a259c380a3c09066fc4b42de7dde9b1ad ] + +Currently when a DCTCP receiver delays an ACK and receive a +data packet with a different CE mark from the previous one's, it +sends two immediate ACKs acking previous and latest sequences +respectly 
(for ECN accounting). + +Previously sending the first ACK may mark off the delayed ACK timer +(tcp_event_ack_sent). This may subsequently prevent sending the +second ACK to acknowledge the latest sequence (tcp_ack_snd_check). +The culprit is that tcp_send_ack() assumes it always acknowleges +the latest sequence, which is not true for the first special ACK. + +The fix is to not make the assumption in tcp_send_ack and check the +actual ack sequence before cancelling the delayed ACK. Further it's +safer to pass the ack sequence number as a local variable into +tcp_send_ack routine, instead of intercepting tp->rcv_nxt to avoid +future bugs like this. + +Reported-by: Neal Cardwell +Signed-off-by: Yuchung Cheng +Acked-by: Neal Cardwell +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/tcp.h | 1 + + net/ipv4/tcp_dctcp.c | 34 ++++------------------------------ + net/ipv4/tcp_output.c | 11 ++++++++--- + 3 files changed, 13 insertions(+), 33 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -559,6 +559,7 @@ void tcp_send_fin(struct sock *sk); + void tcp_send_active_reset(struct sock *sk, gfp_t priority); + int tcp_send_synack(struct sock *); + void tcp_push_one(struct sock *, unsigned int mss_now); ++void __tcp_send_ack(struct sock *sk, u32 rcv_nxt); + void tcp_send_ack(struct sock *sk); + void tcp_send_delayed_ack(struct sock *sk); + void tcp_send_loss_probe(struct sock *sk); +--- a/net/ipv4/tcp_dctcp.c ++++ b/net/ipv4/tcp_dctcp.c +@@ -135,21 +135,8 @@ static void dctcp_ce_state_0_to_1(struct + * ACK has not sent yet. + */ + if (!ca->ce_state && +- inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { +- u32 tmp_rcv_nxt; +- +- /* Save current rcv_nxt. */ +- tmp_rcv_nxt = tp->rcv_nxt; +- +- /* Generate previous ack with CE=0. */ +- tp->ecn_flags &= ~TCP_ECN_DEMAND_CWR; +- tp->rcv_nxt = ca->prior_rcv_nxt; +- +- tcp_send_ack(sk); +- +- /* Recover current rcv_nxt. 
*/ +- tp->rcv_nxt = tmp_rcv_nxt; +- } ++ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) ++ __tcp_send_ack(sk, ca->prior_rcv_nxt); + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 1; +@@ -166,21 +153,8 @@ static void dctcp_ce_state_1_to_0(struct + * ACK has not sent yet. + */ + if (ca->ce_state && +- inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { +- u32 tmp_rcv_nxt; +- +- /* Save current rcv_nxt. */ +- tmp_rcv_nxt = tp->rcv_nxt; +- +- /* Generate previous ack with CE=1. */ +- tp->ecn_flags |= TCP_ECN_DEMAND_CWR; +- tp->rcv_nxt = ca->prior_rcv_nxt; +- +- tcp_send_ack(sk); +- +- /* Recover current rcv_nxt. */ +- tp->rcv_nxt = tmp_rcv_nxt; +- } ++ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) ++ __tcp_send_ack(sk, ca->prior_rcv_nxt); + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 0; +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -177,8 +177,13 @@ static void tcp_event_data_sent(struct t + } + + /* Account for an ACK we sent. */ +-static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) ++static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts, ++ u32 rcv_nxt) + { ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (unlikely(rcv_nxt != tp->rcv_nxt)) ++ return; /* Special ACK sent by DCTCP to reflect ECN */ + tcp_dec_quickack_mode(sk, pkts); + inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); + } +@@ -1005,7 +1010,7 @@ static int __tcp_transmit_skb(struct soc + icsk->icsk_af_ops->send_check(sk, skb); + + if (likely(tcb->tcp_flags & TCPHDR_ACK)) +- tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); ++ tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt); + + if (skb->len != tcp_header_size) + tcp_event_data_sent(tp, sk); +@@ -3400,12 +3405,12 @@ void __tcp_send_ack(struct sock *sk, u32 + skb_mstamp_get(&buff->skb_mstamp); + __tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC), rcv_nxt); + } ++EXPORT_SYMBOL_GPL(__tcp_send_ack); + + void tcp_send_ack(struct sock *sk) + { + __tcp_send_ack(sk, 
tcp_sk(sk)->rcv_nxt); + } +-EXPORT_SYMBOL_GPL(tcp_send_ack); + + /* This routine sends a packet with an out of date sequence + * number. It assumes the other end will try to ack it. diff --git a/queue-4.4/tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch b/queue-4.4/tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch new file mode 100644 index 00000000000..73d4a437cfc --- /dev/null +++ b/queue-4.4/tcp-do-not-delay-ack-in-dctcp-upon-ce-status-change.patch @@ -0,0 +1,138 @@ +From foo@baz Fri Jul 27 09:17:52 CEST 2018 +From: Yuchung Cheng +Date: Wed, 18 Jul 2018 13:56:36 -0700 +Subject: tcp: do not delay ACK in DCTCP upon CE status change + +From: Yuchung Cheng + +[ Upstream commit a0496ef2c23b3b180902dd185d0d63ccbc624cf8 ] + +Per DCTCP RFC8257 (Section 3.2) the ACK reflecting the CE status change +has to be sent immediately so the sender can respond quickly: + +""" When receiving packets, the CE codepoint MUST be processed as follows: + + 1. If the CE codepoint is set and DCTCP.CE is false, set DCTCP.CE to + true and send an immediate ACK. + + 2. If the CE codepoint is not set and DCTCP.CE is true, set DCTCP.CE + to false and send an immediate ACK. +""" + +Previously DCTCP implementation may continue to delay the ACK. This +patch fixes that to implement the RFC by forcing an immediate ACK. + +Tested with this packetdrill script provided by Larry Brakmo + +0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0 +0.000 bind(3, ..., ...) = 0 +0.000 listen(3, 1) = 0 + +0.100 < [ect0] SEW 0:0(0) win 32792 +0.100 > SE. 0:0(0) ack 1 +0.110 < [ect0] . 1:1(0) ack 1 win 257 +0.200 accept(3, ..., ...) = 4 + +0 setsockopt(4, SOL_SOCKET, SO_DEBUG, [1], 4) = 0 + +0.200 < [ect0] . 1:1001(1000) ack 1 win 257 +0.200 > [ect01] . 1:1(0) ack 1001 + +0.200 write(4, ..., 1) = 1 +0.200 > [ect01] P. 1:2(1) ack 1001 + +0.200 < [ect0] . 
1001:2001(1000) ack 2 win 257 ++0.005 < [ce] . 2001:3001(1000) ack 2 win 257 + ++0.000 > [ect01] . 2:2(0) ack 2001 +// Previously the ACK below would be delayed by 40ms ++0.000 > [ect01] E. 2:2(0) ack 3001 + ++0.500 < F. 9501:9501(0) ack 4 win 257 + +Signed-off-by: Yuchung Cheng +Acked-by: Neal Cardwell +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + include/net/tcp.h | 1 + + net/ipv4/tcp_dctcp.c | 30 ++++++++++++++++++------------ + net/ipv4/tcp_input.c | 3 ++- + 3 files changed, 21 insertions(+), 13 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -376,6 +376,7 @@ ssize_t tcp_splice_read(struct socket *s + struct pipe_inode_info *pipe, size_t len, + unsigned int flags); + ++void tcp_enter_quickack_mode(struct sock *sk); + static inline void tcp_dec_quickack_mode(struct sock *sk, + const unsigned int pkts) + { +--- a/net/ipv4/tcp_dctcp.c ++++ b/net/ipv4/tcp_dctcp.c +@@ -131,12 +131,15 @@ static void dctcp_ce_state_0_to_1(struct + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + +- /* State has changed from CE=0 to CE=1 and delayed +- * ACK has not sent yet. +- */ +- if (!ca->ce_state && +- inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) +- __tcp_send_ack(sk, ca->prior_rcv_nxt); ++ if (!ca->ce_state) { ++ /* State has changed from CE=0 to CE=1, force an immediate ++ * ACK to reflect the new CE state. If an ACK was delayed, ++ * send that first to reflect the prior CE state. ++ */ ++ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) ++ __tcp_send_ack(sk, ca->prior_rcv_nxt); ++ tcp_enter_quickack_mode(sk); ++ } + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 1; +@@ -149,12 +152,15 @@ static void dctcp_ce_state_1_to_0(struct + struct dctcp *ca = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); + +- /* State has changed from CE=1 to CE=0 and delayed +- * ACK has not sent yet. 
+- */ +- if (ca->ce_state && +- inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) +- __tcp_send_ack(sk, ca->prior_rcv_nxt); ++ if (ca->ce_state) { ++ /* State has changed from CE=1 to CE=0, force an immediate ++ * ACK to reflect the new CE state. If an ACK was delayed, ++ * send that first to reflect the prior CE state. ++ */ ++ if (inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) ++ __tcp_send_ack(sk, ca->prior_rcv_nxt); ++ tcp_enter_quickack_mode(sk); ++ } + + ca->prior_rcv_nxt = tp->rcv_nxt; + ca->ce_state = 0; +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -187,13 +187,14 @@ static void tcp_incr_quickack(struct soc + icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); + } + +-static void tcp_enter_quickack_mode(struct sock *sk) ++void tcp_enter_quickack_mode(struct sock *sk) + { + struct inet_connection_sock *icsk = inet_csk(sk); + tcp_incr_quickack(sk); + icsk->icsk_ack.pingpong = 0; + icsk->icsk_ack.ato = TCP_ATO_MIN; + } ++EXPORT_SYMBOL(tcp_enter_quickack_mode); + + /* Send ACKs quickly, if "quick" count is not exhausted + * and the session is not interactive. diff --git a/queue-4.4/tcp-fix-dctcp-delayed-ack-schedule.patch b/queue-4.4/tcp-fix-dctcp-delayed-ack-schedule.patch new file mode 100644 index 00000000000..e0f868dc3d2 --- /dev/null +++ b/queue-4.4/tcp-fix-dctcp-delayed-ack-schedule.patch @@ -0,0 +1,98 @@ +From foo@baz Fri Jul 27 09:17:52 CEST 2018 +From: Yuchung Cheng +Date: Thu, 12 Jul 2018 06:04:52 -0700 +Subject: tcp: fix dctcp delayed ACK schedule + +From: Yuchung Cheng + +[ Upstream commit b0c05d0e99d98d7f0cd41efc1eeec94efdc3325d ] + +Previously, when a data segment was sent an ACK was piggybacked +on the data segment without generating a CA_EVENT_NON_DELAYED_ACK +event to notify congestion control modules. So the DCTCP +ca->delayed_ack_reserved flag could incorrectly stay set when +in fact there were no delayed ACKs being reserved. 
This could result +in sending a special ECN notification ACK that carries an older +ACK sequence, when in fact there was no need for such an ACK. +DCTCP keeps track of the delayed ACK status with its own separate +state ca->delayed_ack_reserved. Previously it may accidentally cancel +the delayed ACK without updating this field upon sending a special +ACK that carries an older ACK sequence. This inconsistency would +lead to DCTCP receiver never acknowledging the latest data until the +sender times out and retries in some cases. + +Packetdrill script (provided by Larry Brakmo) + +0.000 socket(..., SOCK_STREAM, IPPROTO_TCP) = 3 +0.000 setsockopt(3, SOL_SOCKET, SO_REUSEADDR, [1], 4) = 0 +0.000 setsockopt(3, SOL_TCP, TCP_CONGESTION, "dctcp", 5) = 0 +0.000 bind(3, ..., ...) = 0 +0.000 listen(3, 1) = 0 + +0.100 < [ect0] SEW 0:0(0) win 32792 +0.100 > SE. 0:0(0) ack 1 +0.110 < [ect0] . 1:1(0) ack 1 win 257 +0.200 accept(3, ..., ...) = 4 + +0.200 < [ect0] . 1:1001(1000) ack 1 win 257 +0.200 > [ect01] . 1:1(0) ack 1001 + +0.200 write(4, ..., 1) = 1 +0.200 > [ect01] P. 1:2(1) ack 1001 + +0.200 < [ect0] . 1001:2001(1000) ack 2 win 257 +0.200 write(4, ..., 1) = 1 +0.200 > [ect01] P. 2:3(1) ack 2001 + +0.200 < [ect0] . 2001:3001(1000) ack 3 win 257 +0.200 < [ect0] . 3001:4001(1000) ack 3 win 257 +0.200 > [ect01] . 3:3(0) ack 4001 + +0.210 < [ce] P. 4001:4501(500) ack 3 win 257 + ++0.001 read(4, ..., 4500) = 4500 ++0 write(4, ..., 1) = 1 ++0 > [ect01] PE. 3:4(1) ack 4501 + ++0.010 < [ect0] W. 4501:5501(1000) ack 4 win 257 +// Previously the ACK sequence below would be 4501, causing a long RTO ++0.040~+0.045 > [ect01] . 4:4(0) ack 5501 // delayed ack + ++0.311 < [ect0] . 5501:6501(1000) ack 4 win 257 // More data ++0 > [ect01] . 4:4(0) ack 6501 // now acks everything + ++0.500 < F. 9501:9501(0) ack 4 win 257 + +Reported-by: Larry Brakmo +Signed-off-by: Yuchung Cheng +Signed-off-by: Eric Dumazet +Acked-by: Neal Cardwell +Acked-by: Lawrence Brakmo +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_dctcp.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/net/ipv4/tcp_dctcp.c ++++ b/net/ipv4/tcp_dctcp.c +@@ -134,7 +134,8 @@ static void dctcp_ce_state_0_to_1(struct + /* State has changed from CE=0 to CE=1 and delayed + * ACK has not sent yet. + */ +- if (!ca->ce_state && ca->delayed_ack_reserved) { ++ if (!ca->ce_state && ++ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { + u32 tmp_rcv_nxt; + + /* Save current rcv_nxt. */ +@@ -164,7 +165,8 @@ static void dctcp_ce_state_1_to_0(struct + /* State has changed from CE=1 to CE=0 and delayed + * ACK has not sent yet. + */ +- if (ca->ce_state && ca->delayed_ack_reserved) { ++ if (ca->ce_state && ++ inet_csk(sk)->icsk_ack.pending & ICSK_ACK_TIMER) { + u32 tmp_rcv_nxt; + + /* Save current rcv_nxt. */ diff --git a/queue-4.4/tcp-helpers-to-send-special-dctcp-ack.patch b/queue-4.4/tcp-helpers-to-send-special-dctcp-ack.patch new file mode 100644 index 00000000000..598339a2f8f --- /dev/null +++ b/queue-4.4/tcp-helpers-to-send-special-dctcp-ack.patch @@ -0,0 +1,79 @@ +From foo@baz Fri Jul 27 09:17:52 CEST 2018 +From: Yuchung Cheng +Date: Wed, 18 Jul 2018 13:56:34 -0700 +Subject: tcp: helpers to send special DCTCP ack + +From: Yuchung Cheng + +[ Upstream commit 2987babb6982306509380fc11b450227a844493b ] + +Refactor and create helpers to send the special ACK in DCTCP. + +Signed-off-by: Yuchung Cheng +Acked-by: Neal Cardwell +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_output.c | 22 +++++++++++++++++----- + 1 file changed, 17 insertions(+), 5 deletions(-) + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -901,8 +901,8 @@ out: + * We are working here with either a clone of the original + * SKB, or a fresh unique copy made by the retransmit engine. 
+ */ +-static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, +- gfp_t gfp_mask) ++static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, ++ int clone_it, gfp_t gfp_mask, u32 rcv_nxt) + { + const struct inet_connection_sock *icsk = inet_csk(sk); + struct inet_sock *inet; +@@ -962,7 +962,7 @@ static int tcp_transmit_skb(struct sock + th->source = inet->inet_sport; + th->dest = inet->inet_dport; + th->seq = htonl(tcb->seq); +- th->ack_seq = htonl(tp->rcv_nxt); ++ th->ack_seq = htonl(rcv_nxt); + *(((__be16 *)th) + 6) = htons(((tcp_header_size >> 2) << 12) | + tcb->tcp_flags); + +@@ -1036,6 +1036,13 @@ static int tcp_transmit_skb(struct sock + return net_xmit_eval(err); + } + ++static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, ++ gfp_t gfp_mask) ++{ ++ return __tcp_transmit_skb(sk, skb, clone_it, gfp_mask, ++ tcp_sk(sk)->rcv_nxt); ++} ++ + /* This routine just queues the buffer for sending. + * + * NOTE: probe0 timer is not checked, do not forget tcp_push_pending_frames, +@@ -3354,7 +3361,7 @@ void tcp_send_delayed_ack(struct sock *s + } + + /* This routine sends an ack and also updates the window. */ +-void tcp_send_ack(struct sock *sk) ++void __tcp_send_ack(struct sock *sk, u32 rcv_nxt) + { + struct sk_buff *buff; + +@@ -3391,7 +3398,12 @@ void tcp_send_ack(struct sock *sk) + + /* Send it off, this clears delayed acks for us. */ + skb_mstamp_get(&buff->skb_mstamp); +- tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC)); ++ __tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC), rcv_nxt); ++} ++ ++void tcp_send_ack(struct sock *sk) ++{ ++ __tcp_send_ack(sk, tcp_sk(sk)->rcv_nxt); + } + EXPORT_SYMBOL_GPL(tcp_send_ack); + -- 2.47.3