git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 4.19
author Sasha Levin <sashal@kernel.org>
Sun, 7 Jul 2024 14:53:25 +0000 (10:53 -0400)
committer Sasha Levin <sashal@kernel.org>
Sun, 7 Jul 2024 14:53:25 +0000 (10:53 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
queue-4.19/bonding-fix-out-of-bounds-read-in-bond_option_arp_ip.patch [new file with mode: 0644]
queue-4.19/inet_diag-initialize-pad-field-in-struct-inet_diag_r.patch [new file with mode: 0644]
queue-4.19/net-tcp-better-handling-of-reordering-then-loss-case.patch [new file with mode: 0644]
queue-4.19/selftests-fix-oom-in-msg_zerocopy-selftest.patch [new file with mode: 0644]
queue-4.19/selftests-make-order-checking-verbose-in-msg_zerocop.patch [new file with mode: 0644]
queue-4.19/series
queue-4.19/tcp-add-ece_ack-flag-to-reno-sack-functions.patch [new file with mode: 0644]
queue-4.19/tcp-take-care-of-compressed-acks-in-tcp_add_reno_sac.patch [new file with mode: 0644]
queue-4.19/tcp-tcp_mark_head_lost-is-only-valid-for-sack-tcp.patch [new file with mode: 0644]
queue-4.19/tcp_metrics-validate-source-addr-length.patch [new file with mode: 0644]
queue-4.19/upstream-tcp-fix-dsack-undo-in-fast-recovery-to-call.patch [new file with mode: 0644]

diff --git a/queue-4.19/bonding-fix-out-of-bounds-read-in-bond_option_arp_ip.patch b/queue-4.19/bonding-fix-out-of-bounds-read-in-bond_option_arp_ip.patch
new file mode 100644 (file)
index 0000000..61e1ee8
--- /dev/null
@@ -0,0 +1,79 @@
+From f6fca73dc66d83d92c64deb18842e51646064802 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 2 Jul 2024 14:55:55 +0100
+Subject: bonding: Fix out-of-bounds read in bond_option_arp_ip_targets_set()
+
+From: Sam Sun <samsun1006219@gmail.com>
+
+[ Upstream commit e271ff53807e8f2c628758290f0e499dbe51cb3d ]
+
+In function bond_option_arp_ip_targets_set(), if newval->string is an
+empty string, newval->string+1 will point to the byte after the
+string, causing an out-of-bounds read.
+
+BUG: KASAN: slab-out-of-bounds in strlen+0x7d/0xa0 lib/string.c:418
+Read of size 1 at addr ffff8881119c4781 by task syz-executor665/8107
+CPU: 1 PID: 8107 Comm: syz-executor665 Not tainted 6.7.0-rc7 #1
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
+Call Trace:
+ <TASK>
+ __dump_stack lib/dump_stack.c:88 [inline]
+ dump_stack_lvl+0xd9/0x150 lib/dump_stack.c:106
+ print_address_description mm/kasan/report.c:364 [inline]
+ print_report+0xc1/0x5e0 mm/kasan/report.c:475
+ kasan_report+0xbe/0xf0 mm/kasan/report.c:588
+ strlen+0x7d/0xa0 lib/string.c:418
+ __fortify_strlen include/linux/fortify-string.h:210 [inline]
+ in4_pton+0xa3/0x3f0 net/core/utils.c:130
+ bond_option_arp_ip_targets_set+0xc2/0x910
+drivers/net/bonding/bond_options.c:1201
+ __bond_opt_set+0x2a4/0x1030 drivers/net/bonding/bond_options.c:767
+ __bond_opt_set_notify+0x48/0x150 drivers/net/bonding/bond_options.c:792
+ bond_opt_tryset_rtnl+0xda/0x160 drivers/net/bonding/bond_options.c:817
+ bonding_sysfs_store_option+0xa1/0x120 drivers/net/bonding/bond_sysfs.c:156
+ dev_attr_store+0x54/0x80 drivers/base/core.c:2366
+ sysfs_kf_write+0x114/0x170 fs/sysfs/file.c:136
+ kernfs_fop_write_iter+0x337/0x500 fs/kernfs/file.c:334
+ call_write_iter include/linux/fs.h:2020 [inline]
+ new_sync_write fs/read_write.c:491 [inline]
+ vfs_write+0x96a/0xd80 fs/read_write.c:584
+ ksys_write+0x122/0x250 fs/read_write.c:637
+ do_syscall_x64 arch/x86/entry/common.c:52 [inline]
+ do_syscall_64+0x40/0x110 arch/x86/entry/common.c:83
+ entry_SYSCALL_64_after_hwframe+0x63/0x6b
+---[ end trace ]---
+
+Fix it by checking the string length before using it.
+
+Fixes: f9de11a16594 ("bonding: add ip checks when store ip target")
+Signed-off-by: Yue Sun <samsun1006219@gmail.com>
+Signed-off-by: Simon Horman <horms@kernel.org>
+Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
+Reviewed-by: Hangbin Liu <liuhangbin@gmail.com>
+Link: https://patch.msgid.link/20240702-bond-oob-v6-1-2dfdba195c19@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/bonding/bond_options.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
+index c9aa28eee191d..a818720ef0e49 100644
+--- a/drivers/net/bonding/bond_options.c
++++ b/drivers/net/bonding/bond_options.c
+@@ -1074,9 +1074,9 @@ static int bond_option_arp_ip_targets_set(struct bonding *bond,
+       __be32 target;
+       if (newval->string) {
+-              if (!in4_pton(newval->string+1, -1, (u8 *)&target, -1, NULL)) {
+-                      netdev_err(bond->dev, "invalid ARP target %pI4 specified\n",
+-                                 &target);
++              if (strlen(newval->string) < 1 ||
++                  !in4_pton(newval->string + 1, -1, (u8 *)&target, -1, NULL)) {
++                      netdev_err(bond->dev, "invalid ARP target specified\n");
+                       return ret;
+               }
+               if (newval->string[0] == '+')
+-- 
+2.43.0
+
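
The ordering of the fix is the point: validate the length before skipping the
leading '+'/'-' sigil. A minimal userspace sketch of the same guard follows
(illustrative only; parse_target() is a hypothetical stand-in for the bonding
option handler):

#include <stdio.h>
#include <string.h>

/* The first byte selects add ('+') or remove, the rest is the address.
 * strlen() must reject "" before s + 1 is ever dereferenced: for an
 * empty string, s + 1 points one past the terminating NUL, and reading
 * through it is exactly the slab-out-of-bounds read KASAN caught above. */
static int parse_target(const char *s)
{
        if (strlen(s) < 1)
                return -1;                      /* empty: reject early */
        printf("%s target: %s\n", s[0] == '+' ? "add" : "del", s + 1);
        return 0;
}

int main(void)
{
        parse_target("");                       /* rejected, no OOB read */
        parse_target("+10.0.0.1");              /* "add target: 10.0.0.1" */
        return 0;
}
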
diff --git a/queue-4.19/inet_diag-initialize-pad-field-in-struct-inet_diag_r.patch b/queue-4.19/inet_diag-initialize-pad-field-in-struct-inet_diag_r.patch
new file mode 100644 (file)
index 0000000..b3bed20
--- /dev/null
@@ -0,0 +1,117 @@
+From 7938be5c38869a82b74ebfcf9d2c5be4c10c706f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 3 Jul 2024 18:16:49 +0900
+Subject: inet_diag: Initialize pad field in struct inet_diag_req_v2
+
+From: Shigeru Yoshida <syoshida@redhat.com>
+
+[ Upstream commit 61cf1c739f08190a4cbf047b9fbb192a94d87e3f ]
+
+KMSAN reported uninit-value access in raw_lookup() [1]. Diag for raw
+sockets uses the pad field in struct inet_diag_req_v2 for the
+underlying protocol. This field corresponds to the sdiag_raw_protocol
+field in struct inet_diag_req_raw.
+
+inet_diag_get_exact_compat() converts inet_diag_req to
+inet_diag_req_v2, but leaves the pad field uninitialized. So the issue
+occurs when raw_lookup() accesses the sdiag_raw_protocol field.
+
+Fix this by initializing the pad field in
+inet_diag_get_exact_compat(). Also, apply the same fix in
+inet_diag_dump_compat() to avoid a similar issue in the future.
+
+[1]
+BUG: KMSAN: uninit-value in raw_lookup net/ipv4/raw_diag.c:49 [inline]
+BUG: KMSAN: uninit-value in raw_sock_get+0x657/0x800 net/ipv4/raw_diag.c:71
+ raw_lookup net/ipv4/raw_diag.c:49 [inline]
+ raw_sock_get+0x657/0x800 net/ipv4/raw_diag.c:71
+ raw_diag_dump_one+0xa1/0x660 net/ipv4/raw_diag.c:99
+ inet_diag_cmd_exact+0x7d9/0x980
+ inet_diag_get_exact_compat net/ipv4/inet_diag.c:1404 [inline]
+ inet_diag_rcv_msg_compat+0x469/0x530 net/ipv4/inet_diag.c:1426
+ sock_diag_rcv_msg+0x23d/0x740 net/core/sock_diag.c:282
+ netlink_rcv_skb+0x537/0x670 net/netlink/af_netlink.c:2564
+ sock_diag_rcv+0x35/0x40 net/core/sock_diag.c:297
+ netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline]
+ netlink_unicast+0xe74/0x1240 net/netlink/af_netlink.c:1361
+ netlink_sendmsg+0x10c6/0x1260 net/netlink/af_netlink.c:1905
+ sock_sendmsg_nosec net/socket.c:730 [inline]
+ __sock_sendmsg+0x332/0x3d0 net/socket.c:745
+ ____sys_sendmsg+0x7f0/0xb70 net/socket.c:2585
+ ___sys_sendmsg+0x271/0x3b0 net/socket.c:2639
+ __sys_sendmsg net/socket.c:2668 [inline]
+ __do_sys_sendmsg net/socket.c:2677 [inline]
+ __se_sys_sendmsg net/socket.c:2675 [inline]
+ __x64_sys_sendmsg+0x27e/0x4a0 net/socket.c:2675
+ x64_sys_call+0x135e/0x3ce0 arch/x86/include/generated/asm/syscalls_64.h:47
+ do_syscall_x64 arch/x86/entry/common.c:52 [inline]
+ do_syscall_64+0xd9/0x1e0 arch/x86/entry/common.c:83
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+
+Uninit was stored to memory at:
+ raw_sock_get+0x650/0x800 net/ipv4/raw_diag.c:71
+ raw_diag_dump_one+0xa1/0x660 net/ipv4/raw_diag.c:99
+ inet_diag_cmd_exact+0x7d9/0x980
+ inet_diag_get_exact_compat net/ipv4/inet_diag.c:1404 [inline]
+ inet_diag_rcv_msg_compat+0x469/0x530 net/ipv4/inet_diag.c:1426
+ sock_diag_rcv_msg+0x23d/0x740 net/core/sock_diag.c:282
+ netlink_rcv_skb+0x537/0x670 net/netlink/af_netlink.c:2564
+ sock_diag_rcv+0x35/0x40 net/core/sock_diag.c:297
+ netlink_unicast_kernel net/netlink/af_netlink.c:1335 [inline]
+ netlink_unicast+0xe74/0x1240 net/netlink/af_netlink.c:1361
+ netlink_sendmsg+0x10c6/0x1260 net/netlink/af_netlink.c:1905
+ sock_sendmsg_nosec net/socket.c:730 [inline]
+ __sock_sendmsg+0x332/0x3d0 net/socket.c:745
+ ____sys_sendmsg+0x7f0/0xb70 net/socket.c:2585
+ ___sys_sendmsg+0x271/0x3b0 net/socket.c:2639
+ __sys_sendmsg net/socket.c:2668 [inline]
+ __do_sys_sendmsg net/socket.c:2677 [inline]
+ __se_sys_sendmsg net/socket.c:2675 [inline]
+ __x64_sys_sendmsg+0x27e/0x4a0 net/socket.c:2675
+ x64_sys_call+0x135e/0x3ce0 arch/x86/include/generated/asm/syscalls_64.h:47
+ do_syscall_x64 arch/x86/entry/common.c:52 [inline]
+ do_syscall_64+0xd9/0x1e0 arch/x86/entry/common.c:83
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+
+Local variable req.i created at:
+ inet_diag_get_exact_compat net/ipv4/inet_diag.c:1396 [inline]
+ inet_diag_rcv_msg_compat+0x2a6/0x530 net/ipv4/inet_diag.c:1426
+ sock_diag_rcv_msg+0x23d/0x740 net/core/sock_diag.c:282
+
+CPU: 1 PID: 8888 Comm: syz-executor.6 Not tainted 6.10.0-rc4-00217-g35bb670d65fc #32
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 04/01/2014
+
+Fixes: 432490f9d455 ("net: ip, diag -- Add diag interface for raw sockets")
+Reported-by: syzkaller <syzkaller@googlegroups.com>
+Signed-off-by: Shigeru Yoshida <syoshida@redhat.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://patch.msgid.link/20240703091649.111773-1-syoshida@redhat.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/inet_diag.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
+index d07917059d70f..a876991ecb2ca 100644
+--- a/net/ipv4/inet_diag.c
++++ b/net/ipv4/inet_diag.c
+@@ -1097,6 +1097,7 @@ static int inet_diag_dump_compat(struct sk_buff *skb,
+       req.sdiag_family = AF_UNSPEC; /* compatibility */
+       req.sdiag_protocol = inet_diag_type2proto(cb->nlh->nlmsg_type);
+       req.idiag_ext = rc->idiag_ext;
++      req.pad = 0;
+       req.idiag_states = rc->idiag_states;
+       req.id = rc->id;
+@@ -1115,6 +1116,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
+       req.sdiag_family = rc->idiag_family;
+       req.sdiag_protocol = inet_diag_type2proto(nlh->nlmsg_type);
+       req.idiag_ext = rc->idiag_ext;
++      req.pad = 0;
+       req.idiag_states = rc->idiag_states;
+       req.id = rc->id;
+-- 
+2.43.0
+
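
The two added lines assign the one member the compat conversion forgot. The
broader defensive idiom is to zero the converted request outright; a sketch
with a simplified stand-in struct (field layout abbreviated, not the real
struct inet_diag_req_v2):

#include <string.h>

struct req_v2 {
        unsigned char sdiag_family;
        unsigned char sdiag_protocol;
        unsigned char idiag_ext;
        unsigned char pad;      /* doubles as the protocol for raw diag */
        unsigned int  idiag_states;
};

static void build_req(struct req_v2 *req)
{
        /* Zeroing up front leaves no member -- named field or padding --
         * holding stack garbage, whatever later code forgets to assign. */
        memset(req, 0, sizeof(*req));

        req->sdiag_family = 2;          /* AF_INET */
        req->idiag_states = ~0U;
        /* req->pad is 0 even without an explicit assignment */
}
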
diff --git a/queue-4.19/net-tcp-better-handling-of-reordering-then-loss-case.patch b/queue-4.19/net-tcp-better-handling-of-reordering-then-loss-case.patch
new file mode 100644 (file)
index 0000000..907352e
--- /dev/null
@@ -0,0 +1,119 @@
+From 6046225849d8e1ce652e2180c75d3bc298179c0b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 2 Jun 2021 17:51:21 -0700
+Subject: net: tcp better handling of reordering then loss cases
+
+From: Yuchung Cheng <ycheng@google.com>
+
+[ Upstream commit a29cb6914681a55667436a9eb7a42e28da8cf387 ]
+
+This patch aims to improve the situation when reordering and loss are
+occurring in the same flight of packets.
+
+Previously the reordering would first induce a spurious recovery, and
+the subsequent ACK might then undo the cwnd (based e.g. on timestamps).
+However, the current loss recovery does not proceed to invoke
+RACK to install a reordering timer. If some packets are also lost, this
+may lead to a long RTO-based recovery. An example is
+https://groups.google.com/g/bbr-dev/c/OFHADvJbTEI
+
+The solution is, after reverting the recovery, to always invoke RACK
+to either arm the RACK timer to fast retransmit after the reordering
+window, or restart the recovery if new loss is identified. Hence
+it is possible the sender may go from Recovery to Disorder/Open to
+Recovery again in one ACK.
+
+Reported-by: mingkun bian <bianmingkun@gmail.com>
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: a6458ab7fd4f ("UPSTREAM: tcp: fix DSACK undo in fast recovery to call tcp_try_to_open()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/tcp_input.c | 45 +++++++++++++++++++++++++-------------------
+ 1 file changed, 26 insertions(+), 19 deletions(-)
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 88216b87c986f..5503f130cc6dd 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -2727,8 +2727,17 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
+       *rexmit = REXMIT_LOST;
+ }
++static bool tcp_force_fast_retransmit(struct sock *sk)
++{
++      struct tcp_sock *tp = tcp_sk(sk);
++
++      return after(tcp_highest_sack_seq(tp),
++                   tp->snd_una + tp->reordering * tp->mss_cache);
++}
++
+ /* Undo during fast recovery after partial ACK. */
+-static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
++static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una,
++                               bool *do_lost)
+ {
+       struct tcp_sock *tp = tcp_sk(sk);
+@@ -2753,7 +2762,9 @@ static bool tcp_try_undo_partial(struct sock *sk, u32 prior_snd_una)
+               tcp_undo_cwnd_reduction(sk, true);
+               NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPARTIALUNDO);
+               tcp_try_keep_open(sk);
+-              return true;
++      } else {
++              /* Partial ACK arrived. Force fast retransmit. */
++              *do_lost = tcp_force_fast_retransmit(sk);
+       }
+       return false;
+ }
+@@ -2777,14 +2788,6 @@ static void tcp_identify_packet_loss(struct sock *sk, int *ack_flag)
+       }
+ }
+-static bool tcp_force_fast_retransmit(struct sock *sk)
+-{
+-      struct tcp_sock *tp = tcp_sk(sk);
+-
+-      return after(tcp_highest_sack_seq(tp),
+-                   tp->snd_una + tp->reordering * tp->mss_cache);
+-}
+-
+ /* Process an event, which can update packets-in-flight not trivially.
+  * Main goal of this function is to calculate new estimate for left_out,
+  * taking into account both packets sitting in receiver's buffer and
+@@ -2854,17 +2857,21 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
+               if (!(flag & FLAG_SND_UNA_ADVANCED)) {
+                       if (tcp_is_reno(tp))
+                               tcp_add_reno_sack(sk, num_dupack, ece_ack);
+-              } else {
+-                      if (tcp_try_undo_partial(sk, prior_snd_una))
+-                              return;
+-                      /* Partial ACK arrived. Force fast retransmit. */
+-                      do_lost = tcp_force_fast_retransmit(sk);
+-              }
+-              if (tcp_try_undo_dsack(sk)) {
+-                      tcp_try_keep_open(sk);
++              } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
+                       return;
+-              }
++
++              if (tcp_try_undo_dsack(sk))
++                      tcp_try_keep_open(sk);
++
+               tcp_identify_packet_loss(sk, ack_flag);
++              if (icsk->icsk_ca_state != TCP_CA_Recovery) {
++                      if (!tcp_time_to_recover(sk, flag))
++                              return;
++                      /* Undo reverts the recovery state. If loss is evident,
++                       * starts a new recovery (e.g. reordering then loss);
++                       */
++                      tcp_enter_recovery(sk, ece_ack);
++              }
+               break;
+       case TCP_CA_Loss:
+               tcp_process_loss(sk, flag, num_dupack, rexmit);
+-- 
+2.43.0
+
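
Pieced together from the hunks above, the TCP_CA_Recovery arm of
tcp_fastretrans_alert() ends up roughly as follows (a consolidated sketch of
the post-patch flow, not verbatim 4.19 source):

case TCP_CA_Recovery:
        if (!(flag & FLAG_SND_UNA_ADVANCED)) {
                if (tcp_is_reno(tp))
                        tcp_add_reno_sack(sk, num_dupack, ece_ack);
        } else if (tcp_try_undo_partial(sk, prior_snd_una, &do_lost))
                return;         /* full undo: the recovery was spurious */

        if (tcp_try_undo_dsack(sk))
                tcp_try_keep_open(sk);  /* DSACK undo may leave Recovery */

        tcp_identify_packet_loss(sk, ack_flag); /* RACK loss detection */
        if (icsk->icsk_ca_state != TCP_CA_Recovery) {
                /* an undo above reverted the recovery state ... */
                if (!tcp_time_to_recover(sk, flag))
                        return;
                /* ... but RACK found new loss: start a fresh recovery,
                 * i.e. Recovery -> Disorder/Open -> Recovery in one ACK */
                tcp_enter_recovery(sk, ece_ack);
        }
        break;
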
diff --git a/queue-4.19/selftests-fix-oom-in-msg_zerocopy-selftest.patch b/queue-4.19/selftests-fix-oom-in-msg_zerocopy-selftest.patch
new file mode 100644 (file)
index 0000000..cc84ce9
--- /dev/null
@@ -0,0 +1,101 @@
+From c4c1dfbeb7daaed0c87634fbf979eda069d4a844 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 1 Jul 2024 22:53:48 +0000
+Subject: selftests: fix OOM in msg_zerocopy selftest
+
+From: Zijian Zhang <zijianzhang@bytedance.com>
+
+[ Upstream commit af2b7e5b741aaae9ffbba2c660def434e07aa241 ]
+
+In selftests/net/msg_zerocopy.c, a while loop keeps calling sendmsg
+on a socket with the MSG_ZEROCOPY flag, and it only recvs the notifications
+once the socket is no longer writable. Typically, it starts the receiving
+process after around 30+ sendmsgs. However, since the introduction of commit
+dfa2f0483360 ("tcp: get rid of sysctl_tcp_adv_win_scale"), the sender is
+always writable and never gets a chance to run the recv-notifications path.
+The selftest always exits with OUT_OF_MEMORY because the memory used by
+opt_skb exceeds net.core.optmem_max. Meanwhile, net.core.optmem_max could be
+set to a different value to trigger the OOM on older kernels too.
+
+Thus, we introduce "cfg_notification_limit" to force the sender to receive
+notifications after a bounded number of sendmsgs.
+
+Fixes: 07b65c5b31ce ("test: add msg_zerocopy test")
+Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
+Signed-off-by: Xiaochun Lu <xiaochun.lu@bytedance.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Link: https://patch.msgid.link/20240701225349.3395580-2-zijianzhang@bytedance.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/msg_zerocopy.c | 12 +++++++++++-
+ 1 file changed, 11 insertions(+), 1 deletion(-)
+
+diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c
+index c539591937a17..b7dc9f3617572 100644
+--- a/tools/testing/selftests/net/msg_zerocopy.c
++++ b/tools/testing/selftests/net/msg_zerocopy.c
+@@ -85,6 +85,7 @@ static bool cfg_rx;
+ static int  cfg_runtime_ms    = 4200;
+ static int  cfg_verbose;
+ static int  cfg_waittime_ms   = 500;
++static int  cfg_notification_limit = 32;
+ static bool cfg_zerocopy;
+ static socklen_t cfg_alen;
+@@ -95,6 +96,7 @@ static char payload[IP_MAXPACKET];
+ static long packets, bytes, completions, expected_completions;
+ static int  zerocopied = -1;
+ static uint32_t next_completion;
++static uint32_t sends_since_notify;
+ static unsigned long gettimeofday_ms(void)
+ {
+@@ -208,6 +210,7 @@ static bool do_sendmsg(int fd, struct msghdr *msg, bool do_zerocopy, int domain)
+               error(1, errno, "send");
+       if (cfg_verbose && ret != len)
+               fprintf(stderr, "send: ret=%u != %u\n", ret, len);
++      sends_since_notify++;
+       if (len) {
+               packets++;
+@@ -460,6 +463,7 @@ static bool do_recv_completion(int fd, int domain)
+ static void do_recv_completions(int fd, int domain)
+ {
+       while (do_recv_completion(fd, domain)) {}
++      sends_since_notify = 0;
+ }
+ /* Wait for all remaining completions on the errqueue */
+@@ -549,6 +553,9 @@ static void do_tx(int domain, int type, int protocol)
+               else
+                       do_sendmsg(fd, &msg, cfg_zerocopy, domain);
++              if (cfg_zerocopy && sends_since_notify >= cfg_notification_limit)
++                      do_recv_completions(fd, domain);
++
+               while (!do_poll(fd, POLLOUT)) {
+                       if (cfg_zerocopy)
+                               do_recv_completions(fd, domain);
+@@ -707,7 +714,7 @@ static void parse_opts(int argc, char **argv)
+       cfg_payload_len = max_payload_len;
+-      while ((c = getopt(argc, argv, "46c:C:D:i:mp:rs:S:t:vz")) != -1) {
++      while ((c = getopt(argc, argv, "46c:C:D:i:l:mp:rs:S:t:vz")) != -1) {
+               switch (c) {
+               case '4':
+                       if (cfg_family != PF_UNSPEC)
+@@ -735,6 +742,9 @@ static void parse_opts(int argc, char **argv)
+                       if (cfg_ifindex == 0)
+                               error(1, errno, "invalid iface: %s", optarg);
+                       break;
++              case 'l':
++                      cfg_notification_limit = strtoul(optarg, NULL, 0);
++                      break;
+               case 'm':
+                       cfg_cork_mixed = true;
+                       break;
+-- 
+2.43.0
+
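
The resulting send-loop shape, as a standalone sketch (keep_running(),
do_sendmsg_zerocopy() and drain_error_queue() are hypothetical stand-ins for
the selftest's runtime check, do_sendmsg() and do_recv_completions()):

/* Drain MSG_ZEROCOPY completion notifications at least once every
 * `limit` sends, so pending notification skbs can no longer pile up
 * past net.core.optmem_max while the socket stays writable. */
static void tx_loop(int fd, int domain, int limit)
{
        int sends_since_notify = 0;

        while (keep_running()) {
                do_sendmsg_zerocopy(fd);        /* sendmsg(..., MSG_ZEROCOPY) */
                if (++sends_since_notify >= limit) {
                        drain_error_queue(fd, domain);  /* MSG_ERRQUEUE reads */
                        sends_since_notify = 0;
                }
        }
}

With the patch applied, the limit defaults to 32 and can be tuned through the
new '-l' option.
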
diff --git a/queue-4.19/selftests-make-order-checking-verbose-in-msg_zerocop.patch b/queue-4.19/selftests-make-order-checking-verbose-in-msg_zerocop.patch
new file mode 100644 (file)
index 0000000..f2c7d2e
--- /dev/null
@@ -0,0 +1,40 @@
+From 6fc57a0e83fe169ef435f3c8604033b87e68e698 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 1 Jul 2024 22:53:49 +0000
+Subject: selftests: make order checking verbose in msg_zerocopy selftest
+
+From: Zijian Zhang <zijianzhang@bytedance.com>
+
+[ Upstream commit 7d6d8f0c8b700c9493f2839abccb6d29028b4219 ]
+
+We find that when lock debugging is on, notifications may not come in
+order. Thus, gate the order-checking output behind cfg_verbose to
+avoid flooding stderr in this case.
+
+Fixes: 07b65c5b31ce ("test: add msg_zerocopy test")
+Signed-off-by: Zijian Zhang <zijianzhang@bytedance.com>
+Signed-off-by: Xiaochun Lu <xiaochun.lu@bytedance.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Link: https://patch.msgid.link/20240701225349.3395580-3-zijianzhang@bytedance.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/msg_zerocopy.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/testing/selftests/net/msg_zerocopy.c b/tools/testing/selftests/net/msg_zerocopy.c
+index b7dc9f3617572..67fa124c06864 100644
+--- a/tools/testing/selftests/net/msg_zerocopy.c
++++ b/tools/testing/selftests/net/msg_zerocopy.c
+@@ -438,7 +438,7 @@ static bool do_recv_completion(int fd, int domain)
+       /* Detect notification gaps. These should not happen often, if at all.
+        * Gaps can occur due to drops, reordering and retransmissions.
+        */
+-      if (lo != next_completion)
++      if (cfg_verbose && lo != next_completion)
+               fprintf(stderr, "gap: %u..%u does not append to %u\n",
+                       lo, hi, next_completion);
+       next_completion = hi + 1;
+-- 
+2.43.0
+
diff --git a/queue-4.19/series b/queue-4.19/series
index b0dc9f6129aa5df808c339be717e6e685ea53570..26d036b946da50db7421d795196db64c7dfeecef 100644 (file)
@@ -18,3 +18,13 @@ powerpc-xmon-check-cpu-id-in-commands-c-dp-and-dx.patch
 jffs2-fix-potential-illegal-address-access-in-jffs2_.patch
 s390-mark-psw-in-__load_psw_mask-as-__unitialized.patch
 s390-pkey-wipe-sensitive-data-on-failure.patch
+tcp-take-care-of-compressed-acks-in-tcp_add_reno_sac.patch
+tcp-tcp_mark_head_lost-is-only-valid-for-sack-tcp.patch
+tcp-add-ece_ack-flag-to-reno-sack-functions.patch
+net-tcp-better-handling-of-reordering-then-loss-case.patch
+upstream-tcp-fix-dsack-undo-in-fast-recovery-to-call.patch
+tcp_metrics-validate-source-addr-length.patch
+bonding-fix-out-of-bounds-read-in-bond_option_arp_ip.patch
+selftests-fix-oom-in-msg_zerocopy-selftest.patch
+selftests-make-order-checking-verbose-in-msg_zerocop.patch
+inet_diag-initialize-pad-field-in-struct-inet_diag_r.patch
diff --git a/queue-4.19/tcp-add-ece_ack-flag-to-reno-sack-functions.patch b/queue-4.19/tcp-add-ece_ack-flag-to-reno-sack-functions.patch
new file mode 100644 (file)
index 0000000..3e7e3be
--- /dev/null
@@ -0,0 +1,131 @@
+From 16a7188eb09089f5571a1943429ee30062eef1bc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 26 Jun 2020 21:05:33 -0700
+Subject: tcp: add ece_ack flag to reno sack functions
+
+From: Yousuk Seung <ysseung@google.com>
+
+[ Upstream commit c634e34f6ebfb75259e6ce467523fd3adf30d3d2 ]
+
+Pass a boolean flag that tells the ECE state of the current ack to the
+reno sack functions. This is a pure refactor for future patches to
+improve tracking of delivered counts.
+
+Signed-off-by: Yousuk Seung <ysseung@google.com>
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Acked-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: a6458ab7fd4f ("UPSTREAM: tcp: fix DSACK undo in fast recovery to call tcp_try_to_open()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/tcp_input.c | 22 ++++++++++++----------
+ 1 file changed, 12 insertions(+), 10 deletions(-)
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index e51aa5a149c0f..88216b87c986f 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -1897,7 +1897,7 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+ /* Emulate SACKs for SACKless connection: account for a new dupack. */
+-static void tcp_add_reno_sack(struct sock *sk, int num_dupack)
++static void tcp_add_reno_sack(struct sock *sk, int num_dupack, bool ece_ack)
+ {
+       if (num_dupack) {
+               struct tcp_sock *tp = tcp_sk(sk);
+@@ -1915,7 +1915,7 @@ static void tcp_add_reno_sack(struct sock *sk, int num_dupack)
+ /* Account for ACK, ACKing some data in Reno Recovery phase. */
+-static void tcp_remove_reno_sacks(struct sock *sk, int acked)
++static void tcp_remove_reno_sacks(struct sock *sk, int acked, bool ece_ack)
+ {
+       struct tcp_sock *tp = tcp_sk(sk);
+@@ -2720,7 +2720,7 @@ static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
+                * delivered. Lower inflight to clock out (re)tranmissions.
+                */
+               if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
+-                      tcp_add_reno_sack(sk, num_dupack);
++                      tcp_add_reno_sack(sk, num_dupack, flag & FLAG_ECE);
+               else if (flag & FLAG_SND_UNA_ADVANCED)
+                       tcp_reset_reno_sack(tp);
+       }
+@@ -2803,6 +2803,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+       int fast_rexmit = 0, flag = *ack_flag;
++      bool ece_ack = flag & FLAG_ECE;
+       bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
+                                     tcp_force_fast_retransmit(sk));
+@@ -2811,7 +2812,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
+       /* Now state machine starts.
+        * A. ECE, hence prohibit cwnd undoing, the reduction is required. */
+-      if (flag & FLAG_ECE)
++      if (ece_ack)
+               tp->prior_ssthresh = 0;
+       /* B. In all the states check for reneging SACKs. */
+@@ -2852,7 +2853,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
+       case TCP_CA_Recovery:
+               if (!(flag & FLAG_SND_UNA_ADVANCED)) {
+                       if (tcp_is_reno(tp))
+-                              tcp_add_reno_sack(sk, num_dupack);
++                              tcp_add_reno_sack(sk, num_dupack, ece_ack);
+               } else {
+                       if (tcp_try_undo_partial(sk, prior_snd_una))
+                               return;
+@@ -2877,7 +2878,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
+               if (tcp_is_reno(tp)) {
+                       if (flag & FLAG_SND_UNA_ADVANCED)
+                               tcp_reset_reno_sack(tp);
+-                      tcp_add_reno_sack(sk, num_dupack);
++                      tcp_add_reno_sack(sk, num_dupack, ece_ack);
+               }
+               if (icsk->icsk_ca_state <= TCP_CA_Disorder)
+@@ -2901,7 +2902,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
+               }
+               /* Otherwise enter Recovery state */
+-              tcp_enter_recovery(sk, (flag & FLAG_ECE));
++              tcp_enter_recovery(sk, ece_ack);
+               fast_rexmit = 1;
+       }
+@@ -3077,7 +3078,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
+  */
+ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
+                              u32 prior_snd_una,
+-                             struct tcp_sacktag_state *sack)
++                             struct tcp_sacktag_state *sack, bool ece_ack)
+ {
+       const struct inet_connection_sock *icsk = inet_csk(sk);
+       u64 first_ackt, last_ackt;
+@@ -3215,7 +3216,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, u32 prior_fack,
+               }
+               if (tcp_is_reno(tp)) {
+-                      tcp_remove_reno_sacks(sk, pkts_acked);
++                      tcp_remove_reno_sacks(sk, pkts_acked, ece_ack);
+                       /* If any of the cumulatively ACKed segments was
+                        * retransmitted, non-SACK case cannot confirm that
+@@ -3720,7 +3721,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
+               goto no_queue;
+       /* See if we can take anything off of the retransmit queue. */
+-      flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state);
++      flag |= tcp_clean_rtx_queue(sk, prior_fack, prior_snd_una, &sack_state,
++                                  flag & FLAG_ECE);
+       tcp_rack_update_reo_wnd(sk, &rs);
+-- 
+2.43.0
+
diff --git a/queue-4.19/tcp-take-care-of-compressed-acks-in-tcp_add_reno_sac.patch b/queue-4.19/tcp-take-care-of-compressed-acks-in-tcp_add_reno_sac.patch
new file mode 100644 (file)
index 0000000..390c97c
--- /dev/null
@@ -0,0 +1,180 @@
+From bda730fdb80a9e6c557112bf3ca75eacfe3a79ea Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 27 Nov 2018 14:42:01 -0800
+Subject: tcp: take care of compressed acks in tcp_add_reno_sack()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 19119f298bb1f2af3bb1093f5f2a1fed8da94e37 ]
+
+Neal pointed out that non-SACK flows might suffer from the ACK compression
+added in the following patch ("tcp: implement coalescing on backlog queue").
+
+Instead of tweaking tcp_add_backlog() we can take into
+account how many ACKs were coalesced; this information
+will be available in skb_shinfo(skb)->gso_segs.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: a6458ab7fd4f ("UPSTREAM: tcp: fix DSACK undo in fast recovery to call tcp_try_to_open()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/tcp_input.c | 58 +++++++++++++++++++++++++-------------------
+ 1 file changed, 33 insertions(+), 25 deletions(-)
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 6a8c7c521d36e..022d75c67096a 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -1897,16 +1897,20 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
+ /* Emulate SACKs for SACKless connection: account for a new dupack. */
+-static void tcp_add_reno_sack(struct sock *sk)
++static void tcp_add_reno_sack(struct sock *sk, int num_dupack)
+ {
+-      struct tcp_sock *tp = tcp_sk(sk);
+-      u32 prior_sacked = tp->sacked_out;
++      if (num_dupack) {
++              struct tcp_sock *tp = tcp_sk(sk);
++              u32 prior_sacked = tp->sacked_out;
++              s32 delivered;
+-      tp->sacked_out++;
+-      tcp_check_reno_reordering(sk, 0);
+-      if (tp->sacked_out > prior_sacked)
+-              tp->delivered++; /* Some out-of-order packet is delivered */
+-      tcp_verify_left_out(tp);
++              tp->sacked_out += num_dupack;
++              tcp_check_reno_reordering(sk, 0);
++              delivered = tp->sacked_out - prior_sacked;
++              if (delivered > 0)
++                      tp->delivered += delivered;
++              tcp_verify_left_out(tp);
++      }
+ }
+ /* Account for ACK, ACKing some data in Reno Recovery phase. */
+@@ -2687,7 +2691,7 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack)
+ /* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
+  * recovered or spurious. Otherwise retransmits more on partial ACKs.
+  */
+-static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
++static void tcp_process_loss(struct sock *sk, int flag, int num_dupack,
+                            int *rexmit)
+ {
+       struct tcp_sock *tp = tcp_sk(sk);
+@@ -2706,7 +2710,7 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
+                       return;
+               if (after(tp->snd_nxt, tp->high_seq)) {
+-                      if (flag & FLAG_DATA_SACKED || is_dupack)
++                      if (flag & FLAG_DATA_SACKED || num_dupack)
+                               tp->frto = 0; /* Step 3.a. loss was real */
+               } else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
+                       tp->high_seq = tp->snd_nxt;
+@@ -2732,8 +2736,8 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
+               /* A Reno DUPACK means new data in F-RTO step 2.b above are
+                * delivered. Lower inflight to clock out (re)tranmissions.
+                */
+-              if (after(tp->snd_nxt, tp->high_seq) && is_dupack)
+-                      tcp_add_reno_sack(sk);
++              if (after(tp->snd_nxt, tp->high_seq) && num_dupack)
++                      tcp_add_reno_sack(sk, num_dupack);
+               else if (flag & FLAG_SND_UNA_ADVANCED)
+                       tcp_reset_reno_sack(tp);
+       }
+@@ -2811,13 +2815,13 @@ static bool tcp_force_fast_retransmit(struct sock *sk)
+  * tcp_xmit_retransmit_queue().
+  */
+ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
+-                                bool is_dupack, int *ack_flag, int *rexmit)
++                                int num_dupack, int *ack_flag, int *rexmit)
+ {
+       struct inet_connection_sock *icsk = inet_csk(sk);
+       struct tcp_sock *tp = tcp_sk(sk);
+       int fast_rexmit = 0, flag = *ack_flag;
+-      bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
+-                                   tcp_force_fast_retransmit(sk));
++      bool do_lost = num_dupack || ((flag & FLAG_DATA_SACKED) &&
++                                    tcp_force_fast_retransmit(sk));
+       if (!tp->packets_out && tp->sacked_out)
+               tp->sacked_out = 0;
+@@ -2864,8 +2868,8 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
+       switch (icsk->icsk_ca_state) {
+       case TCP_CA_Recovery:
+               if (!(flag & FLAG_SND_UNA_ADVANCED)) {
+-                      if (tcp_is_reno(tp) && is_dupack)
+-                              tcp_add_reno_sack(sk);
++                      if (tcp_is_reno(tp))
++                              tcp_add_reno_sack(sk, num_dupack);
+               } else {
+                       if (tcp_try_undo_partial(sk, prior_snd_una))
+                               return;
+@@ -2880,7 +2884,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
+               tcp_identify_packet_loss(sk, ack_flag);
+               break;
+       case TCP_CA_Loss:
+-              tcp_process_loss(sk, flag, is_dupack, rexmit);
++              tcp_process_loss(sk, flag, num_dupack, rexmit);
+               tcp_identify_packet_loss(sk, ack_flag);
+               if (!(icsk->icsk_ca_state == TCP_CA_Open ||
+                     (*ack_flag & FLAG_LOST_RETRANS)))
+@@ -2891,8 +2895,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
+               if (tcp_is_reno(tp)) {
+                       if (flag & FLAG_SND_UNA_ADVANCED)
+                               tcp_reset_reno_sack(tp);
+-                      if (is_dupack)
+-                              tcp_add_reno_sack(sk);
++                      tcp_add_reno_sack(sk, num_dupack);
+               }
+               if (icsk->icsk_ca_state <= TCP_CA_Disorder)
+@@ -3623,7 +3626,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
+       bool is_sack_reneg = tp->is_sack_reneg;
+       u32 ack_seq = TCP_SKB_CB(skb)->seq;
+       u32 ack = TCP_SKB_CB(skb)->ack_seq;
+-      bool is_dupack = false;
++      int num_dupack = 0;
+       int prior_packets = tp->packets_out;
+       u32 delivered = tp->delivered;
+       u32 lost = tp->lost;
+@@ -3743,8 +3746,13 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
+               tcp_process_tlp_ack(sk, ack, flag);
+       if (tcp_ack_is_dubious(sk, flag)) {
+-              is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
+-              tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
++              if (!(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP))) {
++                      num_dupack = 1;
++                      /* Consider if pure acks were aggregated in tcp_add_backlog() */
++                      if (!(flag & FLAG_DATA))
++                              num_dupack = max_t(u16, 1, skb_shinfo(skb)->gso_segs);
++              }
++              tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
+                                     &rexmit);
+       }
+@@ -3766,7 +3774,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
+ no_queue:
+       /* If data was DSACKed, see if we can undo a cwnd reduction. */
+       if (flag & FLAG_DSACKING_ACK) {
+-              tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
++              tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
+                                     &rexmit);
+               tcp_newly_delivered(sk, delivered, flag);
+       }
+@@ -3791,7 +3799,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
+       if (TCP_SKB_CB(skb)->sacked) {
+               flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
+                                               &sack_state);
+-              tcp_fastretrans_alert(sk, prior_snd_una, is_dupack, &flag,
++              tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag,
+                                     &rexmit);
+               tcp_newly_delivered(sk, delivered, flag);
+               tcp_xmit_recovery(sk, rexmit);
+-- 
+2.43.0
+
diff --git a/queue-4.19/tcp-tcp_mark_head_lost-is-only-valid-for-sack-tcp.patch b/queue-4.19/tcp-tcp_mark_head_lost-is-only-valid-for-sack-tcp.patch
new file mode 100644 (file)
index 0000000..b5e334f
--- /dev/null
@@ -0,0 +1,90 @@
+From e54288a45229e4435add827417471cfc86b1ead3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 May 2020 11:08:30 +0800
+Subject: tcp: tcp_mark_head_lost is only valid for sack-tcp
+
+From: zhang kai <zhangkaiheb@126.com>
+
+[ Upstream commit 636ef28d6e4d174e424102466caf572b0406fb0e ]
+
+Since tcp_mark_head_lost() is only valid for SACK TCP, remove the
+tcp_is_sack/reno checks from it.
+
+Signed-off-by: zhang kai <zhangkaiheb@126.com>
+Acked-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: a6458ab7fd4f ("UPSTREAM: tcp: fix DSACK undo in fast recovery to call tcp_try_to_open()")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/tcp_input.c | 32 +++++++-------------------------
+ 1 file changed, 7 insertions(+), 25 deletions(-)
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 022d75c67096a..e51aa5a149c0f 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -2193,8 +2193,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
+ }
+ /* Detect loss in event "A" above by marking head of queue up as lost.
+- * For non-SACK(Reno) senders, the first "packets" number of segments
+- * are considered lost. For RFC3517 SACK, a segment is considered lost if it
++ * For RFC3517 SACK, a segment is considered lost if it
+  * has at least tp->reordering SACKed seqments above it; "packets" refers to
+  * the maximum SACKed segments to pass before reaching this limit.
+  */
+@@ -2202,10 +2201,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
+ {
+       struct tcp_sock *tp = tcp_sk(sk);
+       struct sk_buff *skb;
+-      int cnt, oldcnt, lost;
+-      unsigned int mss;
++      int cnt;
+       /* Use SACK to deduce losses of new sequences sent during recovery */
+-      const u32 loss_high = tcp_is_sack(tp) ?  tp->snd_nxt : tp->high_seq;
++      const u32 loss_high = tp->snd_nxt;
+       WARN_ON(packets > tp->packets_out);
+       skb = tp->lost_skb_hint;
+@@ -2228,26 +2226,11 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
+               if (after(TCP_SKB_CB(skb)->end_seq, loss_high))
+                       break;
+-              oldcnt = cnt;
+-              if (tcp_is_reno(tp) ||
+-                  (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED))
++              if (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED)
+                       cnt += tcp_skb_pcount(skb);
+-              if (cnt > packets) {
+-                      if (tcp_is_sack(tp) ||
+-                          (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) ||
+-                          (oldcnt >= packets))
+-                              break;
+-
+-                      mss = tcp_skb_mss(skb);
+-                      /* If needed, chop off the prefix to mark as lost. */
+-                      lost = (packets - oldcnt) * mss;
+-                      if (lost < skb->len &&
+-                          tcp_fragment(sk, TCP_FRAG_IN_RTX_QUEUE, skb,
+-                                       lost, mss, GFP_ATOMIC) < 0)
+-                              break;
+-                      cnt = packets;
+-              }
++              if (cnt > packets)
++                      break;
+               tcp_skb_mark_lost(tp, skb);
+@@ -2874,8 +2857,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
+                       if (tcp_try_undo_partial(sk, prior_snd_una))
+                               return;
+                       /* Partial ACK arrived. Force fast retransmit. */
+-                      do_lost = tcp_is_reno(tp) ||
+-                                tcp_force_fast_retransmit(sk);
++                      do_lost = tcp_force_fast_retransmit(sk);
+               }
+               if (tcp_try_undo_dsack(sk)) {
+                       tcp_try_keep_open(sk);
+-- 
+2.43.0
+
diff --git a/queue-4.19/tcp_metrics-validate-source-addr-length.patch b/queue-4.19/tcp_metrics-validate-source-addr-length.patch
new file mode 100644 (file)
index 0000000..57093a9
--- /dev/null
@@ -0,0 +1,38 @@
+From 4641ea3b4984a1a80d58dacdcaf0b77e5de484be Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 27 Jun 2024 14:25:00 -0700
+Subject: tcp_metrics: validate source addr length
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 66be40e622e177316ae81717aa30057ba9e61dff ]
+
+I don't see anything checking that TCP_METRICS_ATTR_SADDR_IPV4
+is at least 4 bytes long, and the policy doesn't have an entry
+for this attribute at all (neither does it for IPv6, but v6 is
+manually validated).
+
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Fixes: 3e7013ddf55a ("tcp: metrics: Allow selective get/del of tcp-metrics based on src IP")
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/tcp_metrics.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
+index 60619b1f4acdc..1bfbb8f8e0b7c 100644
+--- a/net/ipv4/tcp_metrics.c
++++ b/net/ipv4/tcp_metrics.c
+@@ -624,6 +624,7 @@ static const struct nla_policy tcp_metrics_nl_policy[TCP_METRICS_ATTR_MAX + 1] =
+       [TCP_METRICS_ATTR_ADDR_IPV4]    = { .type = NLA_U32, },
+       [TCP_METRICS_ATTR_ADDR_IPV6]    = { .type = NLA_BINARY,
+                                           .len = sizeof(struct in6_addr), },
++      [TCP_METRICS_ATTR_SADDR_IPV4]   = { .type = NLA_U32, },
+       /* Following attributes are not received for GET/DEL,
+        * we keep them for reference
+        */
+-- 
+2.43.0
+
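
What the one-line policy addition buys, sketched with simplified types
(parse_saddr() and struct peer_addr are illustrative names, not the
tcp_metrics ones): once [TCP_METRICS_ATTR_SADDR_IPV4] = { .type = NLA_U32 }
is in the policy, netlink attribute parsing rejects any SADDR_IPV4 payload
shorter than 4 bytes before a handler ever sees it.

struct peer_addr {
        unsigned int a4;        /* simplified stand-in for inetpeer_addr */
};

static void parse_saddr(const struct nlattr *a, struct peer_addr *addr)
{
        /* Unconditionally reads 4 bytes of payload -- safe only because
         * the NLA_U32 policy entry already enforced the minimum length. */
        addr->a4 = nla_get_u32(a);
}
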
diff --git a/queue-4.19/upstream-tcp-fix-dsack-undo-in-fast-recovery-to-call.patch b/queue-4.19/upstream-tcp-fix-dsack-undo-in-fast-recovery-to-call.patch
new file mode 100644 (file)
index 0000000..c04dfd2
--- /dev/null
@@ -0,0 +1,70 @@
+From a5ad0648fbf7a93191fd21aad7108fb06594d33e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 26 Jun 2024 22:42:27 -0400
+Subject: UPSTREAM: tcp: fix DSACK undo in fast recovery to call
+ tcp_try_to_open()
+
+From: Neal Cardwell <ncardwell@google.com>
+
+[ Upstream commit a6458ab7fd4f427d4f6f54380453ad255b7fde83 ]
+
+In some production workloads we noticed that connections could
+sometimes close extremely prematurely with ETIMEDOUT after
+transmitting only 1 TLP and RTO retransmission (when we would normally
+expect roughly tcp_retries2 = TCP_RETR2 = 15 RTOs before a connection
+closes with ETIMEDOUT).
+
+From tracing we determined that these workloads can suffer from a
+scenario where in fast recovery, after some retransmits, a DSACK undo
+can happen at a point where the scoreboard is totally clear (we have
+retrans_out == sacked_out == lost_out == 0). In such cases, calling
+tcp_try_keep_open() means that we do not execute any code path that
+clears tp->retrans_stamp to 0. That means that tp->retrans_stamp can
+remain erroneously set to the start time of the undone fast recovery,
+even after the fast recovery is undone. If minutes or hours elapse,
+and then a TLP/RTO/RTO sequence occurs, then the start_ts value in
+retransmits_timed_out() (which is from tp->retrans_stamp) will be
+erroneously ancient (left over from the fast recovery undone via
+DSACKs). Thus this ancient tp->retrans_stamp value can cause the
+connection to die very prematurely with ETIMEDOUT via
+tcp_write_err().
+
+The fix: we change DSACK undo in fast recovery (TCP_CA_Recovery) to
+call tcp_try_to_open() instead of tcp_try_keep_open(). This ensures
+that if no retransmits are in flight at the time of DSACK undo in fast
+recovery then we properly zero retrans_stamp. Note that calling
+tcp_try_to_open() is more consistent with other loss recovery
+behavior, since normal fast recovery (CA_Recovery) and RTO recovery
+(CA_Loss) both normally end when tp->snd_una meets or exceeds
+tp->high_seq and then in tcp_fastretrans_alert() the "default" switch
+case executes tcp_try_to_open(). Also note that by inspection this
+change to call tcp_try_to_open() implies at least one other nice bug
+fix, where now an ECE-marked DSACK that causes an undo will properly
+invoke tcp_enter_cwr() rather than ignoring the ECE mark.
+
+Fixes: c7d9d6a185a7 ("tcp: undo on DSACK during recovery")
+Signed-off-by: Neal Cardwell <ncardwell@google.com>
+Signed-off-by: Yuchung Cheng <ycheng@google.com>
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/tcp_input.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
+index 5503f130cc6dd..9a66c37958451 100644
+--- a/net/ipv4/tcp_input.c
++++ b/net/ipv4/tcp_input.c
+@@ -2861,7 +2861,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const u32 prior_snd_una,
+                       return;
+               if (tcp_try_undo_dsack(sk))
+-                      tcp_try_keep_open(sk);
++                      tcp_try_to_open(sk, flag);
+               tcp_identify_packet_loss(sk, ack_flag);
+               if (icsk->icsk_ca_state != TCP_CA_Recovery) {
+-- 
+2.43.0
+
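
For reference, a paraphrased sketch of tcp_try_to_open() from
net/ipv4/tcp_input.c (simplified, not the exact source), showing the two
behaviors the DSACK-undo path gains over tcp_try_keep_open():

static void tcp_try_to_open(struct sock *sk, int flag)
{
        struct tcp_sock *tp = tcp_sk(sk);

        tcp_verify_left_out(tp);

        /* the fix: with a clean scoreboard, stop dating later RTO
         * timeout checks from the start of the now-undone recovery */
        if (!tcp_any_retrans_done(sk))
                tp->retrans_stamp = 0;

        /* the bonus fix noted above: an ECE-marked DSACK enters CWR
         * instead of having its ECE mark ignored */
        if (flag & FLAG_ECE)
                tcp_enter_cwr(sk);

        if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR)
                tcp_try_keep_open(sk);
}
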