From 54f92c1cc4896c4d9449bd67d76decc0686976b8 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 17 May 2023 21:37:41 -0400 Subject: [PATCH] Fixes for 4.19 Signed-off-by: Sasha Levin --- ...ata-race-of-sk-sk_receive_queue-qlen.patch | 84 +++++++++ ...fix-data-races-around-sk-sk_shutdown.patch | 153 ++++++++++++++++ ...t-of-bounds-caused-by-unclear-skb-cb.patch | 172 ++++++++++++++++++ ...ate-sk-sk_err-write-from-do_recvmmsg.patch | 40 ++++ ...gram-fix-data-races-in-datagram_poll.patch | 69 +++++++ ...ring-on-sk-sk_stamp-in-sock_recv_cms.patch | 82 +++++++++ ...-annotate-accesses-to-nlk-cb_running.patch | 109 +++++++++++ queue-4.19/series | 11 ++ ...tions-around-sk-sk_shutdown-accesses.patch | 158 ++++++++++++++++ .../tcp-factor-out-__tcp_close-helper.patch | 68 +++++++ ...ut-events-caused-by-tcp_notsent_lowa.patch | 159 ++++++++++++++++ ...out-from-tcp_poll-only-when-notsent_.patch | 54 ++++++ 12 files changed, 1159 insertions(+) create mode 100644 queue-4.19/af_unix-fix-a-data-race-of-sk-sk_receive_queue-qlen.patch create mode 100644 queue-4.19/af_unix-fix-data-races-around-sk-sk_shutdown.patch create mode 100644 queue-4.19/ipvlan-fix-out-of-bounds-caused-by-unclear-skb-cb.patch create mode 100644 queue-4.19/net-annotate-sk-sk_err-write-from-do_recvmmsg.patch create mode 100644 queue-4.19/net-datagram-fix-data-races-in-datagram_poll.patch create mode 100644 queue-4.19/net-fix-load-tearing-on-sk-sk_stamp-in-sock_recv_cms.patch create mode 100644 queue-4.19/netlink-annotate-accesses-to-nlk-cb_running.patch create mode 100644 queue-4.19/tcp-add-annotations-around-sk-sk_shutdown-accesses.patch create mode 100644 queue-4.19/tcp-factor-out-__tcp_close-helper.patch create mode 100644 queue-4.19/tcp-reduce-pollout-events-caused-by-tcp_notsent_lowa.patch create mode 100644 queue-4.19/tcp-return-epollout-from-tcp_poll-only-when-notsent_.patch diff --git a/queue-4.19/af_unix-fix-a-data-race-of-sk-sk_receive_queue-qlen.patch 
b/queue-4.19/af_unix-fix-a-data-race-of-sk-sk_receive_queue-qlen.patch new file mode 100644 index 00000000000..9f9ca6f5a16 --- /dev/null +++ b/queue-4.19/af_unix-fix-a-data-race-of-sk-sk_receive_queue-qlen.patch @@ -0,0 +1,84 @@ +From f7678a7389f3e06b6950cd73d51575bc2709c98b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 9 May 2023 17:34:55 -0700 +Subject: af_unix: Fix a data race of sk->sk_receive_queue->qlen. + +From: Kuniyuki Iwashima + +[ Upstream commit 679ed006d416ea0cecfe24a99d365d1dea69c683 ] + +KCSAN found a data race of sk->sk_receive_queue->qlen where recvmsg() +updates qlen under the queue lock and sendmsg() checks qlen under +unix_state_lock(), not the queue lock, so the reader side needs +READ_ONCE(). + +BUG: KCSAN: data-race in __skb_try_recv_from_queue / unix_wait_for_peer + +write (marked) to 0xffff888019fe7c68 of 4 bytes by task 49792 on cpu 0: + __skb_unlink include/linux/skbuff.h:2347 [inline] + __skb_try_recv_from_queue+0x3de/0x470 net/core/datagram.c:197 + __skb_try_recv_datagram+0xf7/0x390 net/core/datagram.c:263 + __unix_dgram_recvmsg+0x109/0x8a0 net/unix/af_unix.c:2452 + unix_dgram_recvmsg+0x94/0xa0 net/unix/af_unix.c:2549 + sock_recvmsg_nosec net/socket.c:1019 [inline] + ____sys_recvmsg+0x3a3/0x3b0 net/socket.c:2720 + ___sys_recvmsg+0xc8/0x150 net/socket.c:2764 + do_recvmmsg+0x182/0x560 net/socket.c:2858 + __sys_recvmmsg net/socket.c:2937 [inline] + __do_sys_recvmmsg net/socket.c:2960 [inline] + __se_sys_recvmmsg net/socket.c:2953 [inline] + __x64_sys_recvmmsg+0x153/0x170 net/socket.c:2953 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x72/0xdc + +read to 0xffff888019fe7c68 of 4 bytes by task 49793 on cpu 1: + skb_queue_len include/linux/skbuff.h:2127 [inline] + unix_recvq_full net/unix/af_unix.c:229 [inline] + unix_wait_for_peer+0x154/0x1a0 net/unix/af_unix.c:1445 + unix_dgram_sendmsg+0x13bc/0x14b0 net/unix/af_unix.c:2048 + 
sock_sendmsg_nosec net/socket.c:724 [inline] + sock_sendmsg+0x148/0x160 net/socket.c:747 + ____sys_sendmsg+0x20e/0x620 net/socket.c:2503 + ___sys_sendmsg+0xc6/0x140 net/socket.c:2557 + __sys_sendmmsg+0x11d/0x370 net/socket.c:2643 + __do_sys_sendmmsg net/socket.c:2672 [inline] + __se_sys_sendmmsg net/socket.c:2669 [inline] + __x64_sys_sendmmsg+0x58/0x70 net/socket.c:2669 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x72/0xdc + +value changed: 0x0000000b -> 0x00000001 + +Reported by Kernel Concurrency Sanitizer on: +CPU: 1 PID: 49793 Comm: syz-executor.0 Not tainted 6.3.0-rc7-02330-gca6270c12e20 #2 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 + +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Reported-by: syzbot +Signed-off-by: Kuniyuki Iwashima +Reviewed-by: Eric Dumazet +Reviewed-by: Michal Kubiak +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/unix/af_unix.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c +index e79c329427964..0b2d466fb8585 100644 +--- a/net/unix/af_unix.c ++++ b/net/unix/af_unix.c +@@ -1232,7 +1232,7 @@ static long unix_wait_for_peer(struct sock *other, long timeo) + + sched = !sock_flag(other, SOCK_DEAD) && + !(other->sk_shutdown & RCV_SHUTDOWN) && +- unix_recvq_full(other); ++ unix_recvq_full_lockless(other); + + unix_state_unlock(other); + +-- +2.39.2 + diff --git a/queue-4.19/af_unix-fix-data-races-around-sk-sk_shutdown.patch b/queue-4.19/af_unix-fix-data-races-around-sk-sk_shutdown.patch new file mode 100644 index 00000000000..824a71b9c2f --- /dev/null +++ b/queue-4.19/af_unix-fix-data-races-around-sk-sk_shutdown.patch @@ -0,0 +1,153 @@ +From 8fec34b655f4fdf25ce616f9269859ebe9162011 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 9 May 2023 17:34:56 -0700 +Subject: af_unix: Fix data races 
around sk->sk_shutdown. + +From: Kuniyuki Iwashima + +[ Upstream commit e1d09c2c2f5793474556b60f83900e088d0d366d ] + +KCSAN found a data race around sk->sk_shutdown where unix_release_sock() +and unix_shutdown() update it under unix_state_lock(), OTOH unix_poll() +and unix_dgram_poll() read it locklessly. + +We need to annotate the writes and reads with WRITE_ONCE() and READ_ONCE(). + +BUG: KCSAN: data-race in unix_poll / unix_release_sock + +write to 0xffff88800d0f8aec of 1 bytes by task 264 on cpu 0: + unix_release_sock+0x75c/0x910 net/unix/af_unix.c:631 + unix_release+0x59/0x80 net/unix/af_unix.c:1042 + __sock_release+0x7d/0x170 net/socket.c:653 + sock_close+0x19/0x30 net/socket.c:1397 + __fput+0x179/0x5e0 fs/file_table.c:321 + ____fput+0x15/0x20 fs/file_table.c:349 + task_work_run+0x116/0x1a0 kernel/task_work.c:179 + resume_user_mode_work include/linux/resume_user_mode.h:49 [inline] + exit_to_user_mode_loop kernel/entry/common.c:171 [inline] + exit_to_user_mode_prepare+0x174/0x180 kernel/entry/common.c:204 + __syscall_exit_to_user_mode_work kernel/entry/common.c:286 [inline] + syscall_exit_to_user_mode+0x1a/0x30 kernel/entry/common.c:297 + do_syscall_64+0x4b/0x90 arch/x86/entry/common.c:86 + entry_SYSCALL_64_after_hwframe+0x72/0xdc + +read to 0xffff88800d0f8aec of 1 bytes by task 222 on cpu 1: + unix_poll+0xa3/0x2a0 net/unix/af_unix.c:3170 + sock_poll+0xcf/0x2b0 net/socket.c:1385 + vfs_poll include/linux/poll.h:88 [inline] + ep_item_poll.isra.0+0x78/0xc0 fs/eventpoll.c:855 + ep_send_events fs/eventpoll.c:1694 [inline] + ep_poll fs/eventpoll.c:1823 [inline] + do_epoll_wait+0x6c4/0xea0 fs/eventpoll.c:2258 + __do_sys_epoll_wait fs/eventpoll.c:2270 [inline] + __se_sys_epoll_wait fs/eventpoll.c:2265 [inline] + __x64_sys_epoll_wait+0xcc/0x190 fs/eventpoll.c:2265 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x72/0xdc + +value changed: 0x00 -> 0x03 + +Reported by Kernel 
Concurrency Sanitizer on: +CPU: 1 PID: 222 Comm: dbus-broker Not tainted 6.3.0-rc7-02330-gca6270c12e20 #2 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 + +Fixes: 3c73419c09a5 ("af_unix: fix 'poll for write'/ connected DGRAM sockets") +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Reported-by: syzbot +Signed-off-by: Kuniyuki Iwashima +Reviewed-by: Eric Dumazet +Reviewed-by: Michal Kubiak +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/unix/af_unix.c | 20 ++++++++++++-------- + 1 file changed, 12 insertions(+), 8 deletions(-) + +diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c +index 0b2d466fb8585..b0dcbb08e60db 100644 +--- a/net/unix/af_unix.c ++++ b/net/unix/af_unix.c +@@ -536,7 +536,7 @@ static void unix_release_sock(struct sock *sk, int embrion) + /* Clear state */ + unix_state_lock(sk); + sock_orphan(sk); +- sk->sk_shutdown = SHUTDOWN_MASK; ++ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); + path = u->path; + u->path.dentry = NULL; + u->path.mnt = NULL; +@@ -554,7 +554,7 @@ static void unix_release_sock(struct sock *sk, int embrion) + if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { + unix_state_lock(skpair); + /* No more writes */ +- skpair->sk_shutdown = SHUTDOWN_MASK; ++ WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); + if (!skb_queue_empty(&sk->sk_receive_queue) || embrion) + skpair->sk_err = ECONNRESET; + unix_state_unlock(skpair); +@@ -2551,7 +2551,7 @@ static int unix_shutdown(struct socket *sock, int mode) + ++mode; + + unix_state_lock(sk); +- sk->sk_shutdown |= mode; ++ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode); + other = unix_peer(sk); + if (other) + sock_hold(other); +@@ -2568,7 +2568,7 @@ static int unix_shutdown(struct socket *sock, int mode) + if (mode&SEND_SHUTDOWN) + peer_mode |= RCV_SHUTDOWN; + unix_state_lock(other); +- other->sk_shutdown |= peer_mode; ++ WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode); + 
unix_state_unlock(other); + other->sk_state_change(other); + if (peer_mode == SHUTDOWN_MASK) +@@ -2687,16 +2687,18 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa + { + struct sock *sk = sock->sk; + __poll_t mask; ++ u8 shutdown; + + sock_poll_wait(file, sock, wait); + mask = 0; ++ shutdown = READ_ONCE(sk->sk_shutdown); + + /* exceptional events? */ + if (sk->sk_err) + mask |= EPOLLERR; +- if (sk->sk_shutdown == SHUTDOWN_MASK) ++ if (shutdown == SHUTDOWN_MASK) + mask |= EPOLLHUP; +- if (sk->sk_shutdown & RCV_SHUTDOWN) ++ if (shutdown & RCV_SHUTDOWN) + mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; + + /* readable? */ +@@ -2724,18 +2726,20 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, + struct sock *sk = sock->sk, *other; + unsigned int writable; + __poll_t mask; ++ u8 shutdown; + + sock_poll_wait(file, sock, wait); + mask = 0; ++ shutdown = READ_ONCE(sk->sk_shutdown); + + /* exceptional events? */ + if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) + mask |= EPOLLERR | + (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); + +- if (sk->sk_shutdown & RCV_SHUTDOWN) ++ if (shutdown & RCV_SHUTDOWN) + mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; +- if (sk->sk_shutdown == SHUTDOWN_MASK) ++ if (shutdown == SHUTDOWN_MASK) + mask |= EPOLLHUP; + + /* readable? 
*/ +-- +2.39.2 + diff --git a/queue-4.19/ipvlan-fix-out-of-bounds-caused-by-unclear-skb-cb.patch b/queue-4.19/ipvlan-fix-out-of-bounds-caused-by-unclear-skb-cb.patch new file mode 100644 index 00000000000..2347f22b792 --- /dev/null +++ b/queue-4.19/ipvlan-fix-out-of-bounds-caused-by-unclear-skb-cb.patch @@ -0,0 +1,172 @@ +From 46ea90fcfce89b9e41b8051083a7e88d91e8fd84 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 10 May 2023 11:50:44 +0800 +Subject: ipvlan:Fix out-of-bounds caused by unclear skb->cb + +From: t.feng + +[ Upstream commit 90cbed5247439a966b645b34eb0a2e037836ea8e ] + +If skb enqueue the qdisc, fq_skb_cb(skb)->time_to_send is changed which +is actually skb->cb, and IPCB(skb_in)->opt will be used in +__ip_options_echo. It is possible that memcpy is out of bounds and lead +to stack overflow. +We should clear skb->cb before ip_local_out or ip6_local_out. + +v2: +1. clean the stack info +2. use IPCB/IP6CB instead of skb->cb + +crash on stable-5.10(reproduce in kasan kernel). 
+Stack info: +[ 2203.651571] BUG: KASAN: stack-out-of-bounds in +__ip_options_echo+0x589/0x800 +[ 2203.653327] Write of size 4 at addr ffff88811a388f27 by task +swapper/3/0 +[ 2203.655460] CPU: 3 PID: 0 Comm: swapper/3 Kdump: loaded Not tainted +5.10.0-60.18.0.50.h856.kasan.eulerosv2r11.x86_64 #1 +[ 2203.655466] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), +BIOS rel-1.10.2-0-g5f4c7b1-20181220_000000-szxrtosci10000 04/01/2014 +[ 2203.655475] Call Trace: +[ 2203.655481] +[ 2203.655501] dump_stack+0x9c/0xd3 +[ 2203.655514] print_address_description.constprop.0+0x19/0x170 +[ 2203.655530] __kasan_report.cold+0x6c/0x84 +[ 2203.655586] kasan_report+0x3a/0x50 +[ 2203.655594] check_memory_region+0xfd/0x1f0 +[ 2203.655601] memcpy+0x39/0x60 +[ 2203.655608] __ip_options_echo+0x589/0x800 +[ 2203.655654] __icmp_send+0x59a/0x960 +[ 2203.655755] nf_send_unreach+0x129/0x3d0 [nf_reject_ipv4] +[ 2203.655763] reject_tg+0x77/0x1bf [ipt_REJECT] +[ 2203.655772] ipt_do_table+0x691/0xa40 [ip_tables] +[ 2203.655821] nf_hook_slow+0x69/0x100 +[ 2203.655828] __ip_local_out+0x21e/0x2b0 +[ 2203.655857] ip_local_out+0x28/0x90 +[ 2203.655868] ipvlan_process_v4_outbound+0x21e/0x260 [ipvlan] +[ 2203.655931] ipvlan_xmit_mode_l3+0x3bd/0x400 [ipvlan] +[ 2203.655967] ipvlan_queue_xmit+0xb3/0x190 [ipvlan] +[ 2203.655977] ipvlan_start_xmit+0x2e/0xb0 [ipvlan] +[ 2203.655984] xmit_one.constprop.0+0xe1/0x280 +[ 2203.655992] dev_hard_start_xmit+0x62/0x100 +[ 2203.656000] sch_direct_xmit+0x215/0x640 +[ 2203.656028] __qdisc_run+0x153/0x1f0 +[ 2203.656069] __dev_queue_xmit+0x77f/0x1030 +[ 2203.656173] ip_finish_output2+0x59b/0xc20 +[ 2203.656244] __ip_finish_output.part.0+0x318/0x3d0 +[ 2203.656312] ip_finish_output+0x168/0x190 +[ 2203.656320] ip_output+0x12d/0x220 +[ 2203.656357] __ip_queue_xmit+0x392/0x880 +[ 2203.656380] __tcp_transmit_skb+0x1088/0x11c0 +[ 2203.656436] __tcp_retransmit_skb+0x475/0xa30 +[ 2203.656505] tcp_retransmit_skb+0x2d/0x190 +[ 2203.656512] tcp_retransmit_timer+0x3af/0x9a0 +[ 
2203.656519] tcp_write_timer_handler+0x3ba/0x510 +[ 2203.656529] tcp_write_timer+0x55/0x180 +[ 2203.656542] call_timer_fn+0x3f/0x1d0 +[ 2203.656555] expire_timers+0x160/0x200 +[ 2203.656562] run_timer_softirq+0x1f4/0x480 +[ 2203.656606] __do_softirq+0xfd/0x402 +[ 2203.656613] asm_call_irq_on_stack+0x12/0x20 +[ 2203.656617] +[ 2203.656623] do_softirq_own_stack+0x37/0x50 +[ 2203.656631] irq_exit_rcu+0x134/0x1a0 +[ 2203.656639] sysvec_apic_timer_interrupt+0x36/0x80 +[ 2203.656646] asm_sysvec_apic_timer_interrupt+0x12/0x20 +[ 2203.656654] RIP: 0010:default_idle+0x13/0x20 +[ 2203.656663] Code: 89 f0 5d 41 5c 41 5d 41 5e c3 cc cc cc cc cc cc cc +cc cc cc cc cc cc 0f 1f 44 00 00 0f 1f 44 00 00 0f 00 2d 9f 32 57 00 fb +f4 cc cc cc cc 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 41 54 be 08 +[ 2203.656668] RSP: 0018:ffff88810036fe78 EFLAGS: 00000256 +[ 2203.656676] RAX: ffffffffaf2a87f0 RBX: ffff888100360000 RCX: +ffffffffaf290191 +[ 2203.656681] RDX: 0000000000098b5e RSI: 0000000000000004 RDI: +ffff88811a3c4f60 +[ 2203.656686] RBP: 0000000000000000 R08: 0000000000000001 R09: +ffff88811a3c4f63 +[ 2203.656690] R10: ffffed10234789ec R11: 0000000000000001 R12: +0000000000000003 +[ 2203.656695] R13: ffff888100360000 R14: 0000000000000000 R15: +0000000000000000 +[ 2203.656729] default_idle_call+0x5a/0x150 +[ 2203.656735] cpuidle_idle_call+0x1c6/0x220 +[ 2203.656780] do_idle+0xab/0x100 +[ 2203.656786] cpu_startup_entry+0x19/0x20 +[ 2203.656793] secondary_startup_64_no_verify+0xc2/0xcb + +[ 2203.657409] The buggy address belongs to the page: +[ 2203.658648] page:0000000027a9842f refcount:1 mapcount:0 +mapping:0000000000000000 index:0x0 pfn:0x11a388 +[ 2203.658665] flags: +0x17ffffc0001000(reserved|node=0|zone=2|lastcpupid=0x1fffff) +[ 2203.658675] raw: 0017ffffc0001000 ffffea000468e208 ffffea000468e208 +0000000000000000 +[ 2203.658682] raw: 0000000000000000 0000000000000000 00000001ffffffff +0000000000000000 +[ 2203.658686] page dumped because: kasan: bad access detected + +To 
reproduce(ipvlan with IPVLAN_MODE_L3): +Env setting: +======================================================= +modprobe ipvlan ipvlan_default_mode=1 +sysctl net.ipv4.conf.eth0.forwarding=1 +iptables -t nat -A POSTROUTING -s 20.0.0.0/255.255.255.0 -o eth0 -j +MASQUERADE +ip link add gw link eth0 type ipvlan +ip -4 addr add 20.0.0.254/24 dev gw +ip netns add net1 +ip link add ipv1 link eth0 type ipvlan +ip link set ipv1 netns net1 +ip netns exec net1 ip link set ipv1 up +ip netns exec net1 ip -4 addr add 20.0.0.4/24 dev ipv1 +ip netns exec net1 route add default gw 20.0.0.254 +ip netns exec net1 tc qdisc add dev ipv1 root netem loss 10% +ifconfig gw up +iptables -t filter -A OUTPUT -p tcp --dport 8888 -j REJECT --reject-with +icmp-port-unreachable +======================================================= +And then execute the shell(curl any address of eth0 can reach): + +for((i=1;i<=100000;i++)) +do + ip netns exec net1 curl x.x.x.x:8888 +done +======================================================= + +Fixes: 2ad7bf363841 ("ipvlan: Initial check-in of the IPVLAN driver.") +Signed-off-by: "t.feng" +Suggested-by: Florian Westphal +Reviewed-by: Paolo Abeni +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ipvlan/ipvlan_core.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c +index 63f0226b0a70c..eb80d277b56f5 100644 +--- a/drivers/net/ipvlan/ipvlan_core.c ++++ b/drivers/net/ipvlan/ipvlan_core.c +@@ -443,6 +443,9 @@ static int ipvlan_process_v4_outbound(struct sk_buff *skb) + goto err; + } + skb_dst_set(skb, &rt->dst); ++ ++ memset(IPCB(skb), 0, sizeof(*IPCB(skb))); ++ + err = ip_local_out(net, skb->sk, skb); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; +@@ -481,6 +484,9 @@ static int ipvlan_process_v6_outbound(struct sk_buff *skb) + goto err; + } + skb_dst_set(skb, dst); ++ ++ memset(IP6CB(skb), 0, sizeof(*IP6CB(skb))); ++ + err = ip6_local_out(net, skb->sk, skb); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; +-- +2.39.2 + diff --git a/queue-4.19/net-annotate-sk-sk_err-write-from-do_recvmmsg.patch b/queue-4.19/net-annotate-sk-sk_err-write-from-do_recvmmsg.patch new file mode 100644 index 00000000000..c4ee1e97f06 --- /dev/null +++ b/queue-4.19/net-annotate-sk-sk_err-write-from-do_recvmmsg.patch @@ -0,0 +1,40 @@ +From a10e4abd4b03efbc0ddfe08d6658c4c87e4a7289 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 9 May 2023 16:35:53 +0000 +Subject: net: annotate sk->sk_err write from do_recvmmsg() + +From: Eric Dumazet + +[ Upstream commit e05a5f510f26607616fecdd4ac136310c8bea56b ] + +do_recvmmsg() can write to sk->sk_err from multiple threads. + +As said before, many other points reading or writing sk_err +need annotations. + +Fixes: 34b88a68f26a ("net: Fix use after free in the recvmmsg exit path") +Signed-off-by: Eric Dumazet +Reported-by: syzbot +Reviewed-by: Kuniyuki Iwashima +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + net/socket.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/socket.c b/net/socket.c +index a5167f03c31db..ce70c01eb2f3e 100644 +--- a/net/socket.c ++++ b/net/socket.c +@@ -2555,7 +2555,7 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, + * error to return on the next call or if the + * app asks about it using getsockopt(SO_ERROR). + */ +- sock->sk->sk_err = -err; ++ WRITE_ONCE(sock->sk->sk_err, -err); + } + out_put: + fput_light(sock->file, fput_needed); +-- +2.39.2 + diff --git a/queue-4.19/net-datagram-fix-data-races-in-datagram_poll.patch b/queue-4.19/net-datagram-fix-data-races-in-datagram_poll.patch new file mode 100644 index 00000000000..d0ca77f8883 --- /dev/null +++ b/queue-4.19/net-datagram-fix-data-races-in-datagram_poll.patch @@ -0,0 +1,69 @@ +From 1a55d697565efefefeea9ba6155c487b75931267 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 9 May 2023 17:31:31 +0000 +Subject: net: datagram: fix data-races in datagram_poll() + +From: Eric Dumazet + +[ Upstream commit 5bca1d081f44c9443e61841842ce4e9179d327b6 ] + +datagram_poll() runs locklessly, we should add READ_ONCE() +annotations while reading sk->sk_err, sk->sk_shutdown and sk->sk_state. + +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Signed-off-by: Eric Dumazet +Reviewed-by: Kuniyuki Iwashima +Link: https://lore.kernel.org/r/20230509173131.3263780-1-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/core/datagram.c | 15 ++++++++++----- + 1 file changed, 10 insertions(+), 5 deletions(-) + +diff --git a/net/core/datagram.c b/net/core/datagram.c +index 865a8cb7b0bdb..6ba82eb14b465 100644 +--- a/net/core/datagram.c ++++ b/net/core/datagram.c +@@ -837,18 +837,21 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, + { + struct sock *sk = sock->sk; + __poll_t mask; ++ u8 shutdown; + + sock_poll_wait(file, sock, wait); + mask = 0; + + /* exceptional events? 
*/ +- if (sk->sk_err || !skb_queue_empty_lockless(&sk->sk_error_queue)) ++ if (READ_ONCE(sk->sk_err) || ++ !skb_queue_empty_lockless(&sk->sk_error_queue)) + mask |= EPOLLERR | + (sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0); + +- if (sk->sk_shutdown & RCV_SHUTDOWN) ++ shutdown = READ_ONCE(sk->sk_shutdown); ++ if (shutdown & RCV_SHUTDOWN) + mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM; +- if (sk->sk_shutdown == SHUTDOWN_MASK) ++ if (shutdown == SHUTDOWN_MASK) + mask |= EPOLLHUP; + + /* readable? */ +@@ -857,10 +860,12 @@ __poll_t datagram_poll(struct file *file, struct socket *sock, + + /* Connection-based need to check for termination and startup */ + if (connection_based(sk)) { +- if (sk->sk_state == TCP_CLOSE) ++ int state = READ_ONCE(sk->sk_state); ++ ++ if (state == TCP_CLOSE) + mask |= EPOLLHUP; + /* connection hasn't started yet? */ +- if (sk->sk_state == TCP_SYN_SENT) ++ if (state == TCP_SYN_SENT) + return mask; + } + +-- +2.39.2 + diff --git a/queue-4.19/net-fix-load-tearing-on-sk-sk_stamp-in-sock_recv_cms.patch b/queue-4.19/net-fix-load-tearing-on-sk-sk_stamp-in-sock_recv_cms.patch new file mode 100644 index 00000000000..5982664647f --- /dev/null +++ b/queue-4.19/net-fix-load-tearing-on-sk-sk_stamp-in-sock_recv_cms.patch @@ -0,0 +1,82 @@ +From 3af8b99852510b3a320bcf37f20ac6f81e1ce69d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 8 May 2023 10:55:43 -0700 +Subject: net: Fix load-tearing on sk->sk_stamp in sock_recv_cmsgs(). + +From: Kuniyuki Iwashima + +[ Upstream commit dfd9248c071a3710c24365897459538551cb7167 ] + +KCSAN found a data race in sock_recv_cmsgs() where the read access +to sk->sk_stamp needs READ_ONCE(). 
+ +BUG: KCSAN: data-race in packet_recvmsg / packet_recvmsg + +write (marked) to 0xffff88803c81f258 of 8 bytes by task 19171 on cpu 0: + sock_write_timestamp include/net/sock.h:2670 [inline] + sock_recv_cmsgs include/net/sock.h:2722 [inline] + packet_recvmsg+0xb97/0xd00 net/packet/af_packet.c:3489 + sock_recvmsg_nosec net/socket.c:1019 [inline] + sock_recvmsg+0x11a/0x130 net/socket.c:1040 + sock_read_iter+0x176/0x220 net/socket.c:1118 + call_read_iter include/linux/fs.h:1845 [inline] + new_sync_read fs/read_write.c:389 [inline] + vfs_read+0x5e0/0x630 fs/read_write.c:470 + ksys_read+0x163/0x1a0 fs/read_write.c:613 + __do_sys_read fs/read_write.c:623 [inline] + __se_sys_read fs/read_write.c:621 [inline] + __x64_sys_read+0x41/0x50 fs/read_write.c:621 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x72/0xdc + +read to 0xffff88803c81f258 of 8 bytes by task 19183 on cpu 1: + sock_recv_cmsgs include/net/sock.h:2721 [inline] + packet_recvmsg+0xb64/0xd00 net/packet/af_packet.c:3489 + sock_recvmsg_nosec net/socket.c:1019 [inline] + sock_recvmsg+0x11a/0x130 net/socket.c:1040 + sock_read_iter+0x176/0x220 net/socket.c:1118 + call_read_iter include/linux/fs.h:1845 [inline] + new_sync_read fs/read_write.c:389 [inline] + vfs_read+0x5e0/0x630 fs/read_write.c:470 + ksys_read+0x163/0x1a0 fs/read_write.c:613 + __do_sys_read fs/read_write.c:623 [inline] + __se_sys_read fs/read_write.c:621 [inline] + __x64_sys_read+0x41/0x50 fs/read_write.c:621 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x3b/0x90 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x72/0xdc + +value changed: 0xffffffffc4653600 -> 0x0000000000000000 + +Reported by Kernel Concurrency Sanitizer on: +CPU: 1 PID: 19183 Comm: syz-executor.5 Not tainted 6.3.0-rc7-02330-gca6270c12e20 #2 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 + 
+Fixes: 6c7c98bad488 ("sock: avoid dirtying sk_stamp, if possible") +Reported-by: syzbot +Signed-off-by: Kuniyuki Iwashima +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/r/20230508175543.55756-1-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + include/net/sock.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/net/sock.h b/include/net/sock.h +index 9eb656683281f..629cc89b7f0e4 100644 +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -2406,7 +2406,7 @@ static inline void sock_recv_ts_and_drops(struct msghdr *msg, struct sock *sk, + __sock_recv_ts_and_drops(msg, sk, skb); + else if (unlikely(sock_flag(sk, SOCK_TIMESTAMP))) + sock_write_timestamp(sk, skb->tstamp); +- else if (unlikely(sk->sk_stamp == SK_DEFAULT_STAMP)) ++ else if (unlikely(sock_read_timestamp(sk) == SK_DEFAULT_STAMP)) + sock_write_timestamp(sk, 0); + } + +-- +2.39.2 + diff --git a/queue-4.19/netlink-annotate-accesses-to-nlk-cb_running.patch b/queue-4.19/netlink-annotate-accesses-to-nlk-cb_running.patch new file mode 100644 index 00000000000..26de9ec7548 --- /dev/null +++ b/queue-4.19/netlink-annotate-accesses-to-nlk-cb_running.patch @@ -0,0 +1,109 @@ +From 1ac2aa570e1a0f4b8bd246a8d7a23c57a99e384b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 9 May 2023 16:56:34 +0000 +Subject: netlink: annotate accesses to nlk->cb_running + +From: Eric Dumazet + +[ Upstream commit a939d14919b799e6fff8a9c80296ca229ba2f8a4 ] + +Both netlink_recvmsg() and netlink_native_seq_show() read +nlk->cb_running locklessly. Use READ_ONCE() there. 
+ +Add corresponding WRITE_ONCE() to netlink_dump() and +__netlink_dump_start() + +syzbot reported: +BUG: KCSAN: data-race in __netlink_dump_start / netlink_recvmsg + +write to 0xffff88813ea4db59 of 1 bytes by task 28219 on cpu 0: +__netlink_dump_start+0x3af/0x4d0 net/netlink/af_netlink.c:2399 +netlink_dump_start include/linux/netlink.h:308 [inline] +rtnetlink_rcv_msg+0x70f/0x8c0 net/core/rtnetlink.c:6130 +netlink_rcv_skb+0x126/0x220 net/netlink/af_netlink.c:2577 +rtnetlink_rcv+0x1c/0x20 net/core/rtnetlink.c:6192 +netlink_unicast_kernel net/netlink/af_netlink.c:1339 [inline] +netlink_unicast+0x56f/0x640 net/netlink/af_netlink.c:1365 +netlink_sendmsg+0x665/0x770 net/netlink/af_netlink.c:1942 +sock_sendmsg_nosec net/socket.c:724 [inline] +sock_sendmsg net/socket.c:747 [inline] +sock_write_iter+0x1aa/0x230 net/socket.c:1138 +call_write_iter include/linux/fs.h:1851 [inline] +new_sync_write fs/read_write.c:491 [inline] +vfs_write+0x463/0x760 fs/read_write.c:584 +ksys_write+0xeb/0x1a0 fs/read_write.c:637 +__do_sys_write fs/read_write.c:649 [inline] +__se_sys_write fs/read_write.c:646 [inline] +__x64_sys_write+0x42/0x50 fs/read_write.c:646 +do_syscall_x64 arch/x86/entry/common.c:50 [inline] +do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 +entry_SYSCALL_64_after_hwframe+0x63/0xcd + +read to 0xffff88813ea4db59 of 1 bytes by task 28222 on cpu 1: +netlink_recvmsg+0x3b4/0x730 net/netlink/af_netlink.c:2022 +sock_recvmsg_nosec+0x4c/0x80 net/socket.c:1017 +____sys_recvmsg+0x2db/0x310 net/socket.c:2718 +___sys_recvmsg net/socket.c:2762 [inline] +do_recvmmsg+0x2e5/0x710 net/socket.c:2856 +__sys_recvmmsg net/socket.c:2935 [inline] +__do_sys_recvmmsg net/socket.c:2958 [inline] +__se_sys_recvmmsg net/socket.c:2951 [inline] +__x64_sys_recvmmsg+0xe2/0x160 net/socket.c:2951 +do_syscall_x64 arch/x86/entry/common.c:50 [inline] +do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 +entry_SYSCALL_64_after_hwframe+0x63/0xcd + +value changed: 0x00 -> 0x01 + +Fixes: 16b304f3404f ("netlink: 
Eliminate kmalloc in netlink dump operation.") +Reported-by: syzbot +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + net/netlink/af_netlink.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c +index 6867158656b86..c73784b7b67dc 100644 +--- a/net/netlink/af_netlink.c ++++ b/net/netlink/af_netlink.c +@@ -1983,7 +1983,7 @@ static int netlink_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + + skb_free_datagram(sk, skb); + +- if (nlk->cb_running && ++ if (READ_ONCE(nlk->cb_running) && + atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf / 2) { + ret = netlink_dump(sk); + if (ret) { +@@ -2265,7 +2265,7 @@ static int netlink_dump(struct sock *sk) + if (cb->done) + cb->done(cb); + +- nlk->cb_running = false; ++ WRITE_ONCE(nlk->cb_running, false); + module = cb->module; + skb = cb->skb; + mutex_unlock(nlk->cb_mutex); +@@ -2325,7 +2325,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, + goto error_put; + } + +- nlk->cb_running = true; ++ WRITE_ONCE(nlk->cb_running, true); + nlk->dump_done_errno = INT_MAX; + + mutex_unlock(nlk->cb_mutex); +@@ -2631,7 +2631,7 @@ static int netlink_seq_show(struct seq_file *seq, void *v) + nlk->groups ? 
(u32)nlk->groups[0] : 0, + sk_rmem_alloc_get(s), + sk_wmem_alloc_get(s), +- nlk->cb_running, ++ READ_ONCE(nlk->cb_running), + refcount_read(&s->sk_refcnt), + atomic_read(&s->sk_drops), + sock_i_ino(s) +-- +2.39.2 + diff --git a/queue-4.19/series b/queue-4.19/series index e69de29bb2d..2acf1eea9fb 100644 --- a/queue-4.19/series +++ b/queue-4.19/series @@ -0,0 +1,11 @@ +net-fix-load-tearing-on-sk-sk_stamp-in-sock_recv_cms.patch +netlink-annotate-accesses-to-nlk-cb_running.patch +net-annotate-sk-sk_err-write-from-do_recvmmsg.patch +tcp-reduce-pollout-events-caused-by-tcp_notsent_lowa.patch +tcp-return-epollout-from-tcp_poll-only-when-notsent_.patch +tcp-factor-out-__tcp_close-helper.patch +tcp-add-annotations-around-sk-sk_shutdown-accesses.patch +ipvlan-fix-out-of-bounds-caused-by-unclear-skb-cb.patch +net-datagram-fix-data-races-in-datagram_poll.patch +af_unix-fix-a-data-race-of-sk-sk_receive_queue-qlen.patch +af_unix-fix-data-races-around-sk-sk_shutdown.patch diff --git a/queue-4.19/tcp-add-annotations-around-sk-sk_shutdown-accesses.patch b/queue-4.19/tcp-add-annotations-around-sk-sk_shutdown-accesses.patch new file mode 100644 index 00000000000..1acd46ca354 --- /dev/null +++ b/queue-4.19/tcp-add-annotations-around-sk-sk_shutdown-accesses.patch @@ -0,0 +1,158 @@ +From b90093f1aec578ea761a14312f3468d18f2f83fe Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 9 May 2023 20:36:56 +0000 +Subject: tcp: add annotations around sk->sk_shutdown accesses + +From: Eric Dumazet + +[ Upstream commit e14cadfd80d76f01bfaa1a8d745b1db19b57d6be ] + +Now sk->sk_shutdown is no longer a bitfield, we can add +standard READ_ONCE()/WRITE_ONCE() annotations to silence +KCSAN reports like the following: + +BUG: KCSAN: data-race in tcp_disconnect / tcp_poll + +write to 0xffff88814588582c of 1 bytes by task 3404 on cpu 1: +tcp_disconnect+0x4d6/0xdb0 net/ipv4/tcp.c:3121 +__inet_stream_connect+0x5dd/0x6e0 net/ipv4/af_inet.c:715 +inet_stream_connect+0x48/0x70 net/ipv4/af_inet.c:727 
+__sys_connect_file net/socket.c:2001 [inline] +__sys_connect+0x19b/0x1b0 net/socket.c:2018 +__do_sys_connect net/socket.c:2028 [inline] +__se_sys_connect net/socket.c:2025 [inline] +__x64_sys_connect+0x41/0x50 net/socket.c:2025 +do_syscall_x64 arch/x86/entry/common.c:50 [inline] +do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 +entry_SYSCALL_64_after_hwframe+0x63/0xcd + +read to 0xffff88814588582c of 1 bytes by task 3374 on cpu 0: +tcp_poll+0x2e6/0x7d0 net/ipv4/tcp.c:562 +sock_poll+0x253/0x270 net/socket.c:1383 +vfs_poll include/linux/poll.h:88 [inline] +io_poll_check_events io_uring/poll.c:281 [inline] +io_poll_task_func+0x15a/0x820 io_uring/poll.c:333 +handle_tw_list io_uring/io_uring.c:1184 [inline] +tctx_task_work+0x1fe/0x4d0 io_uring/io_uring.c:1246 +task_work_run+0x123/0x160 kernel/task_work.c:179 +get_signal+0xe64/0xff0 kernel/signal.c:2635 +arch_do_signal_or_restart+0x89/0x2a0 arch/x86/kernel/signal.c:306 +exit_to_user_mode_loop+0x6f/0xe0 kernel/entry/common.c:168 +exit_to_user_mode_prepare+0x6c/0xb0 kernel/entry/common.c:204 +__syscall_exit_to_user_mode_work kernel/entry/common.c:286 [inline] +syscall_exit_to_user_mode+0x26/0x140 kernel/entry/common.c:297 +do_syscall_64+0x4d/0xc0 arch/x86/entry/common.c:86 +entry_SYSCALL_64_after_hwframe+0x63/0xcd + +value changed: 0x03 -> 0x00 + +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Reported-by: syzbot +Signed-off-by: Eric Dumazet +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + net/ipv4/af_inet.c | 2 +- + net/ipv4/tcp.c | 14 ++++++++------ + net/ipv4/tcp_input.c | 4 ++-- + 3 files changed, 11 insertions(+), 9 deletions(-) + +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c +index 4b26ae525d6d2..fb142ea730060 100644 +--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -865,7 +865,7 @@ int inet_shutdown(struct socket *sock, int how) + EPOLLHUP, even on eg. 
unconnected UDP sockets -- RR */ + /* fall through */ + default: +- sk->sk_shutdown |= how; ++ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | how); + if (sk->sk_prot->shutdown) + sk->sk_prot->shutdown(sk, how); + break; +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 9200e7330b7d6..b51e0a1e15b67 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -515,6 +515,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) + __poll_t mask; + struct sock *sk = sock->sk; + const struct tcp_sock *tp = tcp_sk(sk); ++ u8 shutdown; + int state; + + sock_poll_wait(file, sock, wait); +@@ -557,9 +558,10 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) + * NOTE. Check for TCP_CLOSE is added. The goal is to prevent + * blocking on fresh not-connected or disconnected socket. --ANK + */ +- if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) ++ shutdown = READ_ONCE(sk->sk_shutdown); ++ if (shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) + mask |= EPOLLHUP; +- if (sk->sk_shutdown & RCV_SHUTDOWN) ++ if (shutdown & RCV_SHUTDOWN) + mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; + + /* Connected or passive Fast Open socket? 
*/ +@@ -575,7 +577,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) + if (tcp_stream_is_readable(tp, target, sk)) + mask |= EPOLLIN | EPOLLRDNORM; + +- if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { ++ if (!(shutdown & SEND_SHUTDOWN)) { + if (__sk_stream_is_writeable(sk, 1)) { + mask |= EPOLLOUT | EPOLLWRNORM; + } else { /* send SIGIO later */ +@@ -2344,7 +2346,7 @@ void __tcp_close(struct sock *sk, long timeout) + int data_was_unread = 0; + int state; + +- sk->sk_shutdown = SHUTDOWN_MASK; ++ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); + + if (sk->sk_state == TCP_LISTEN) { + tcp_set_state(sk, TCP_CLOSE); +@@ -2598,7 +2600,7 @@ int tcp_disconnect(struct sock *sk, int flags) + if (!(sk->sk_userlocks & SOCK_BINDADDR_LOCK)) + inet_reset_saddr(sk); + +- sk->sk_shutdown = 0; ++ WRITE_ONCE(sk->sk_shutdown, 0); + sock_reset_flag(sk, SOCK_DONE); + tp->srtt_us = 0; + tp->rcv_rtt_last_tsecr = 0; +@@ -3807,7 +3809,7 @@ void tcp_done(struct sock *sk) + if (req) + reqsk_fastopen_remove(sk, req, false); + +- sk->sk_shutdown = SHUTDOWN_MASK; ++ WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK); + + if (!sock_flag(sk, SOCK_DEAD)) + sk->sk_state_change(sk); +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 11716780667c7..bd921fa7b9ab4 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -4130,7 +4130,7 @@ void tcp_fin(struct sock *sk) + + inet_csk_schedule_ack(sk); + +- sk->sk_shutdown |= RCV_SHUTDOWN; ++ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | RCV_SHUTDOWN); + sock_set_flag(sk, SOCK_DONE); + + switch (sk->sk_state) { +@@ -6209,7 +6209,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb) + break; + + tcp_set_state(sk, TCP_FIN_WAIT2); +- sk->sk_shutdown |= SEND_SHUTDOWN; ++ WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | SEND_SHUTDOWN); + + sk_dst_confirm(sk); + +-- +2.39.2 + diff --git a/queue-4.19/tcp-factor-out-__tcp_close-helper.patch b/queue-4.19/tcp-factor-out-__tcp_close-helper.patch new file mode 100644 
index 00000000000..2c079667b6d --- /dev/null +++ b/queue-4.19/tcp-factor-out-__tcp_close-helper.patch @@ -0,0 +1,68 @@ +From d5d6d0a6afda5177ebaaff4c9f40537c4a3a6276 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 16 Nov 2020 10:48:04 +0100 +Subject: tcp: factor out __tcp_close() helper + +From: Paolo Abeni + +[ Upstream commit 77c3c95637526f1e4330cc9a4b2065f668c2c4fe ] + +Unlocked version of protocol level close; it will be used by +MPTCP to allow decoupling of orphaning and subflow level close. + +Signed-off-by: Paolo Abeni +Signed-off-by: Jakub Kicinski +Stable-dep-of: e14cadfd80d7 ("tcp: add annotations around sk->sk_shutdown accesses") +Signed-off-by: Sasha Levin +--- + include/net/tcp.h | 1 + + net/ipv4/tcp.c | 9 +++++++-- + 2 files changed, 8 insertions(+), 2 deletions(-) + +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 9e37f3912ff19..81300a04b5808 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -389,6 +389,7 @@ void tcp_update_metrics(struct sock *sk); + void tcp_init_metrics(struct sock *sk); + void tcp_metrics_init(void); + bool tcp_peer_is_proven(struct request_sock *req, struct dst_entry *dst); ++void __tcp_close(struct sock *sk, long timeout); + void tcp_close(struct sock *sk, long timeout); + void tcp_init_sock(struct sock *sk); + void tcp_init_transfer(struct sock *sk, int bpf_op); +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 2fcf6e5a371dd..9200e7330b7d6 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -2338,13 +2338,12 @@ bool tcp_check_oom(struct sock *sk, int shift) + return too_many_orphans || out_of_socket_memory; + } + +-void tcp_close(struct sock *sk, long timeout) ++void __tcp_close(struct sock *sk, long timeout) + { + struct sk_buff *skb; + int data_was_unread = 0; + int state; + +- lock_sock(sk); + sk->sk_shutdown = SHUTDOWN_MASK; + + if (sk->sk_state == TCP_LISTEN) { +@@ -2505,6 +2504,12 @@ void tcp_close(struct sock *sk, long timeout) + out: + bh_unlock_sock(sk); + local_bh_enable(); ++} ++ ++void 
tcp_close(struct sock *sk, long timeout) ++{ ++ lock_sock(sk); ++ __tcp_close(sk, timeout); + release_sock(sk); + sock_put(sk); + } +-- +2.39.2 + diff --git a/queue-4.19/tcp-reduce-pollout-events-caused-by-tcp_notsent_lowa.patch b/queue-4.19/tcp-reduce-pollout-events-caused-by-tcp_notsent_lowa.patch new file mode 100644 index 00000000000..02cb6b19e1b --- /dev/null +++ b/queue-4.19/tcp-reduce-pollout-events-caused-by-tcp_notsent_lowa.patch @@ -0,0 +1,159 @@ +From 9ff97f283be3a9c4d707f8291396dfa81c9d4813 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 4 Dec 2018 07:58:17 -0800 +Subject: tcp: reduce POLLOUT events caused by TCP_NOTSENT_LOWAT + +From: Eric Dumazet + +[ Upstream commit a74f0fa082b76c6a76cba5672f36218518bfdc09 ] + +TCP_NOTSENT_LOWAT socket option or sysctl was added in linux-3.12 +as a step to enable bigger tcp sndbuf limits. + +It works reasonably well, but the following happens: + +Once the limit is reached, the TCP stack generates +an [E]POLLOUT event for every incoming ACK packet. + +This causes a high number of context switches. + +This patch implements the strategy David Miller added +in sock_def_write_space(): + + - If TCP socket has a notsent_lowat constraint of X bytes, + allow sendmsg() to fill up to X bytes, but send [E]POLLOUT + only if number of notsent bytes is below X/2 + +This considerably reduces TCP_NOTSENT_LOWAT overhead, +while still allowing the pipe to be kept full. 
+ +Tested: + 100 ms RTT netem testbed between A and B, 100 concurrent TCP_STREAM + +A:/# cat /proc/sys/net/ipv4/tcp_wmem +4096 262144 64000000 +A:/# super_netperf 100 -H B -l 1000 -- -K bbr & + +A:/# grep TCP /proc/net/sockstat +TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 1364904 # This is about 54 MB of memory per flow :/ + +A:/# vmstat 5 5 +procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu----- + r b swpd free buff cache si so bi bo in cs us sy id wa st + 0 0 0 256220672 13532 694976 0 0 10 0 28 14 0 1 99 0 0 + 2 0 0 256320016 13532 698480 0 0 512 0 715901 5927 0 10 90 0 0 + 0 0 0 256197232 13532 700992 0 0 735 13 771161 5849 0 11 89 0 0 + 1 0 0 256233824 13532 703320 0 0 512 23 719650 6635 0 11 89 0 0 + 2 0 0 256226880 13532 705780 0 0 642 4 775650 6009 0 12 88 0 0 + +A:/# echo 2097152 >/proc/sys/net/ipv4/tcp_notsent_lowat + +A:/# grep TCP /proc/net/sockstat +TCP: inuse 203 orphan 0 tw 19 alloc 414 mem 86411 # 3.5 MB per flow + +A:/# vmstat 5 5 # check that context switches have not inflated too much. +procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu----- + r b swpd free buff cache si so bi bo in cs us sy id wa st + 2 0 0 260386512 13592 662148 0 0 10 0 17 14 0 1 99 0 0 + 0 0 0 260519680 13592 604184 0 0 512 13 726843 12424 0 10 90 0 0 + 1 1 0 260435424 13592 598360 0 0 512 25 764645 12925 0 10 90 0 0 + 1 0 0 260855392 13592 578380 0 0 512 7 722943 13624 0 11 88 0 0 + 1 0 0 260445008 13592 601176 0 0 614 34 772288 14317 0 10 90 0 0 + +Signed-off-by: Eric Dumazet +Acked-by: Soheil Hassas Yeganeh +Signed-off-by: David S. 
Miller +Stable-dep-of: e14cadfd80d7 ("tcp: add annotations around sk->sk_shutdown accesses") +Signed-off-by: Sasha Levin +--- + include/net/sock.h | 20 +++++++++++++++----- + include/net/tcp.h | 8 ++++++-- + net/core/stream.c | 2 +- + 3 files changed, 22 insertions(+), 8 deletions(-) + +diff --git a/include/net/sock.h b/include/net/sock.h +index 629cc89b7f0e4..cfbd241935a30 100644 +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -1130,7 +1130,7 @@ struct proto { + unsigned int inuse_idx; + #endif + +- bool (*stream_memory_free)(const struct sock *sk); ++ bool (*stream_memory_free)(const struct sock *sk, int wake); + bool (*stream_memory_read)(const struct sock *sk); + /* Memory pressure */ + void (*enter_memory_pressure)(struct sock *sk); +@@ -1212,19 +1212,29 @@ static inline void sk_refcnt_debug_release(const struct sock *sk) + #define sk_refcnt_debug_release(sk) do { } while (0) + #endif /* SOCK_REFCNT_DEBUG */ + +-static inline bool sk_stream_memory_free(const struct sock *sk) ++static inline bool __sk_stream_memory_free(const struct sock *sk, int wake) + { + if (sk->sk_wmem_queued >= sk->sk_sndbuf) + return false; + + return sk->sk_prot->stream_memory_free ? 
+- sk->sk_prot->stream_memory_free(sk) : true; ++ sk->sk_prot->stream_memory_free(sk, wake) : true; + } + +-static inline bool sk_stream_is_writeable(const struct sock *sk) ++static inline bool sk_stream_memory_free(const struct sock *sk) ++{ ++ return __sk_stream_memory_free(sk, 0); ++} ++ ++static inline bool __sk_stream_is_writeable(const struct sock *sk, int wake) + { + return sk_stream_wspace(sk) >= sk_stream_min_wspace(sk) && +- sk_stream_memory_free(sk); ++ __sk_stream_memory_free(sk, wake); ++} ++ ++static inline bool sk_stream_is_writeable(const struct sock *sk) ++{ ++ return __sk_stream_is_writeable(sk, 0); + } + + static inline int sk_under_cgroup_hierarchy(struct sock *sk, +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 9a154fe06c60d..9e37f3912ff19 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -1884,12 +1884,16 @@ static inline u32 tcp_notsent_lowat(const struct tcp_sock *tp) + return tp->notsent_lowat ?: READ_ONCE(net->ipv4.sysctl_tcp_notsent_lowat); + } + +-static inline bool tcp_stream_memory_free(const struct sock *sk) ++/* @wake is one when sk_stream_write_space() calls us. ++ * This sends EPOLLOUT only if notsent_bytes is half the limit. ++ * This mimics the strategy used in sock_def_write_space(). 
++ */ ++static inline bool tcp_stream_memory_free(const struct sock *sk, int wake) + { + const struct tcp_sock *tp = tcp_sk(sk); + u32 notsent_bytes = READ_ONCE(tp->write_seq) - tp->snd_nxt; + +- return notsent_bytes < tcp_notsent_lowat(tp); ++ return (notsent_bytes << wake) < tcp_notsent_lowat(tp); + } + + #ifdef CONFIG_PROC_FS +diff --git a/net/core/stream.c b/net/core/stream.c +index 23e6669d3f8d2..cd60746877b1e 100644 +--- a/net/core/stream.c ++++ b/net/core/stream.c +@@ -32,7 +32,7 @@ void sk_stream_write_space(struct sock *sk) + struct socket *sock = sk->sk_socket; + struct socket_wq *wq; + +- if (sk_stream_is_writeable(sk) && sock) { ++ if (__sk_stream_is_writeable(sk, 1) && sock) { + clear_bit(SOCK_NOSPACE, &sock->flags); + + rcu_read_lock(); +-- +2.39.2 + diff --git a/queue-4.19/tcp-return-epollout-from-tcp_poll-only-when-notsent_.patch b/queue-4.19/tcp-return-epollout-from-tcp_poll-only-when-notsent_.patch new file mode 100644 index 00000000000..6175015eb51 --- /dev/null +++ b/queue-4.19/tcp-return-epollout-from-tcp_poll-only-when-notsent_.patch @@ -0,0 +1,54 @@ +From 397c4b513c6f7007aa916444c7e6a92f70c1b008 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 14 Sep 2020 17:52:09 -0400 +Subject: tcp: return EPOLLOUT from tcp_poll only when notsent_bytes is half + the limit + +From: Soheil Hassas Yeganeh + +[ Upstream commit 8ba3c9d1c6d75d1e6af2087278b30e17f68e1fff ] + +If there was any event available on the TCP socket, tcp_poll() +will be called to retrieve all the events. In tcp_poll(), we call +sk_stream_is_writeable() which returns true as long as we are at least +one byte below notsent_lowat. This will result in quite a few +spurious EPOLLOUT events and frequent tiny sendmsg() calls. + +Similar to sk_stream_write_space(), use __sk_stream_is_writeable +with a wake value of 1, so that we set EPOLLOUT only if half the +space is available for write. + +Signed-off-by: Soheil Hassas Yeganeh +Signed-off-by: Eric Dumazet +Signed-off-by: David S. 
Miller +Stable-dep-of: e14cadfd80d7 ("tcp: add annotations around sk->sk_shutdown accesses") +Signed-off-by: Sasha Levin +--- + net/ipv4/tcp.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 68f89fe7f9233..2fcf6e5a371dd 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -576,7 +576,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) + mask |= EPOLLIN | EPOLLRDNORM; + + if (!(sk->sk_shutdown & SEND_SHUTDOWN)) { +- if (sk_stream_is_writeable(sk)) { ++ if (__sk_stream_is_writeable(sk, 1)) { + mask |= EPOLLOUT | EPOLLWRNORM; + } else { /* send SIGIO later */ + sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk); +@@ -588,7 +588,7 @@ __poll_t tcp_poll(struct file *file, struct socket *sock, poll_table *wait) + * pairs with the input side. + */ + smp_mb__after_atomic(); +- if (sk_stream_is_writeable(sk)) ++ if (__sk_stream_is_writeable(sk, 1)) + mask |= EPOLLOUT | EPOLLWRNORM; + } + } else +-- +2.39.2 + -- 2.47.3