From: Sasha Levin
Date: Sun, 30 Jan 2022 15:27:50 +0000 (-0500)
Subject: Fixes for 5.15
X-Git-Tag: v5.4.176~20^2~7
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=352487f08274d9d36c57886966d6b860d3f36316;p=thirdparty%2Fkernel%2Fstable-queue.git

Fixes for 5.15

Signed-off-by: Sasha Levin
---

diff --git a/queue-5.15/arm-9170-1-fix-panic-when-kasan-and-kprobe-are-enabl.patch b/queue-5.15/arm-9170-1-fix-panic-when-kasan-and-kprobe-are-enabl.patch
new file mode 100644
index 00000000000..3f3699623c0
--- /dev/null
+++ b/queue-5.15/arm-9170-1-fix-panic-when-kasan-and-kprobe-are-enabl.patch
@@ -0,0 +1,119 @@
+From 169c8ffd95e10e13013b1e2e8f51fbb0e387b99d Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Wed, 15 Dec 2021 10:08:23 +0100
+Subject: ARM: 9170/1: fix panic when kasan and kprobe are enabled
+
+From: sparkhuang
+
+[ Upstream commit 8b59b0a53c840921b625378f137e88adfa87647e ]
+
+arm32 uses software to simulate the instruction replaced
+by kprobe. some instructions may be simulated by constructing
+assembly functions. therefore, before executing instruction
+simulation, it is necessary to construct assembly function
+execution environment in C language through binding registers.
+after kasan is enabled, the register binding relationship will
+be destroyed, resulting in instruction simulation errors and
+causing kernel panic.
+
+the kprobe emulate instruction function is distributed in three
+files: actions-common.c actions-arm.c actions-thumb.c, so disable
+KASAN when compiling these files.
+
+for example, use kprobe insert on cap_capable+20 after kasan
+enabled, the cap_capable assembly code is as follows:
+<cap_capable>:
+e92d47f0 push {r4, r5, r6, r7, r8, r9, sl, lr}
+e1a05000 mov r5, r0
+e280006c add r0, r0, #108 ; 0x6c
+e1a04001 mov r4, r1
+e1a06002 mov r6, r2
+e59fa090 ldr sl, [pc, #144] ;
+ebfc7bf8 bl c03aa4b4 <__asan_load4>
+e595706c ldr r7, [r5, #108] ; 0x6c
+e2859014 add r9, r5, #20
+......
+The emulate_ldr assembly code after enabling kasan is as follows:
+c06f1384 <emulate_ldr>:
+e92d47f0 push {r4, r5, r6, r7, r8, r9, sl, lr}
+e282803c add r8, r2, #60 ; 0x3c
+e1a05000 mov r5, r0
+e7e37855 ubfx r7, r5, #16, #4
+e1a00008 mov r0, r8
+e1a09001 mov r9, r1
+e1a04002 mov r4, r2
+ebf35462 bl c03c6530 <__asan_load4>
+e357000f cmp r7, #15
+e7e36655 ubfx r6, r5, #12, #4
+e205a00f and sl, r5, #15
+0a000001 beq c06f13bc
+e0840107 add r0, r4, r7, lsl #2
+ebf3545c bl c03c6530 <__asan_load4>
+e084010a add r0, r4, sl, lsl #2
+ebf3545a bl c03c6530 <__asan_load4>
+e2890010 add r0, r9, #16
+ebf35458 bl c03c6530 <__asan_load4>
+e5990010 ldr r0, [r9, #16]
+e12fff30 blx r0
+e356000f cmp r6, #15
+1a000014 bne c06f1430
+e1a06000 mov r6, r0
+e2840040 add r0, r4, #64 ; 0x40
+......
+ +when running in emulate_ldr to simulate the ldr instruction, panic +occurred, and the log is as follows: +Unable to handle kernel NULL pointer dereference at virtual address +00000090 +pgd = ecb46400 +[00000090] *pgd=2e0fa003, *pmd=00000000 +Internal error: Oops: 206 [#1] SMP ARM +PC is at cap_capable+0x14/0xb0 +LR is at emulate_ldr+0x50/0xc0 +psr: 600d0293 sp : ecd63af8 ip : 00000004 fp : c0a7c30c +r10: 00000000 r9 : c30897f4 r8 : ecd63cd4 +r7 : 0000000f r6 : 0000000a r5 : e59fa090 r4 : ecd63c98 +r3 : c06ae294 r2 : 00000000 r1 : b7611300 r0 : bf4ec008 +Flags: nZCv IRQs off FIQs on Mode SVC_32 ISA ARM Segment user +Control: 32c5387d Table: 2d546400 DAC: 55555555 +Process bash (pid: 1643, stack limit = 0xecd60190) +(cap_capable) from (kprobe_handler+0x218/0x340) +(kprobe_handler) from (kprobe_trap_handler+0x24/0x48) +(kprobe_trap_handler) from (do_undefinstr+0x13c/0x364) +(do_undefinstr) from (__und_svc_finish+0x0/0x30) +(__und_svc_finish) from (cap_capable+0x18/0xb0) +(cap_capable) from (cap_vm_enough_memory+0x38/0x48) +(cap_vm_enough_memory) from +(security_vm_enough_memory_mm+0x48/0x6c) +(security_vm_enough_memory_mm) from +(copy_process.constprop.5+0x16b4/0x25c8) +(copy_process.constprop.5) from (_do_fork+0xe8/0x55c) +(_do_fork) from (SyS_clone+0x1c/0x24) +(SyS_clone) from (__sys_trace_return+0x0/0x10) +Code: 0050a0e1 6c0080e2 0140a0e1 0260a0e1 (f801f0e7) + +Fixes: 35aa1df43283 ("ARM kprobes: instruction single-stepping support") +Fixes: 421015713b30 ("ARM: 9017/2: Enable KASan for ARM") +Signed-off-by: huangshaobo +Acked-by: Ard Biesheuvel +Signed-off-by: Russell King (Oracle) +Signed-off-by: Sasha Levin +--- + arch/arm/probes/kprobes/Makefile | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/arch/arm/probes/kprobes/Makefile b/arch/arm/probes/kprobes/Makefile +index 14db56f49f0a3..6159010dac4a6 100644 +--- a/arch/arm/probes/kprobes/Makefile ++++ b/arch/arm/probes/kprobes/Makefile +@@ -1,4 +1,7 @@ + # SPDX-License-Identifier: GPL-2.0 ++KASAN_SANITIZE_actions-common.o := n ++KASAN_SANITIZE_actions-arm.o := n ++KASAN_SANITIZE_actions-thumb.o := n + obj-$(CONFIG_KPROBES) += core.o actions-common.o checkers-common.o + obj-$(CONFIG_ARM_KPROBES_TEST) += test-kprobes.o + test-kprobes-objs := test-core.o +-- +2.34.1 + diff --git a/queue-5.15/efi-libstub-arm64-fix-image-check-alignment-at-entry.patch b/queue-5.15/efi-libstub-arm64-fix-image-check-alignment-at-entry.patch new file mode 100644 index 00000000000..6574b453abe --- /dev/null +++ b/queue-5.15/efi-libstub-arm64-fix-image-check-alignment-at-entry.patch @@ -0,0 +1,49 @@ +From ef3193384de640c25668a8dde90b6a4397e864b4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 19 Jan 2022 18:14:27 +0200 +Subject: efi/libstub: arm64: Fix image check alignment at entry + +From: Mihai Carabas + +[ Upstream commit e9b7c3a4263bdcfd31bc3d03d48ce0ded7a94635 ] + +The kernel is aligned at SEGMENT_SIZE and this is the size populated in the PE +headers: + +arch/arm64/kernel/efi-header.S: .long SEGMENT_ALIGN // SectionAlignment + +EFI_KIMG_ALIGN is defined as: (SEGMENT_ALIGN > THREAD_ALIGN ? SEGMENT_ALIGN : +THREAD_ALIGN) + +So it depends on THREAD_ALIGN. On newer builds this message started to appear +even though the loader is taking into account the PE header (which is stating +SEGMENT_ALIGN). 
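+
+To see why the old check could trip on well-behaved firmware, consider
+illustrative values (hypothetical, not taken from any particular
+config): the loader only honours the alignment the PE header
+advertises, while the check may demand more:
+
+  #define SEGMENT_ALIGN  SZ_64K   /* advertised in the PE header */
+  #define THREAD_ALIGN   SZ_128K  /* hypothetical, config dependent */
+  #define EFI_KIMG_ALIGN \
+          (SEGMENT_ALIGN > THREAD_ALIGN ? SEGMENT_ALIGN : THREAD_ALIGN)
+
+  /* firmware aligned _text to 64k, but 128k is demanded here: */
+  if (!IS_ALIGNED((u64)_text, EFI_KIMG_ALIGN))
+          efi_err("FIRMWARE BUG: ...");  /* fires spuriously */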
+ +Fixes: c32ac11da3f8 ("efi/libstub: arm64: Double check image alignment at entry") +Signed-off-by: Mihai Carabas +Signed-off-by: Ard Biesheuvel +Signed-off-by: Sasha Levin +--- + drivers/firmware/efi/libstub/arm64-stub.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/drivers/firmware/efi/libstub/arm64-stub.c b/drivers/firmware/efi/libstub/arm64-stub.c +index 2363fee9211c9..9cc556013d085 100644 +--- a/drivers/firmware/efi/libstub/arm64-stub.c ++++ b/drivers/firmware/efi/libstub/arm64-stub.c +@@ -119,9 +119,9 @@ efi_status_t handle_kernel_image(unsigned long *image_addr, + if (image->image_base != _text) + efi_err("FIRMWARE BUG: efi_loaded_image_t::image_base has bogus value\n"); + +- if (!IS_ALIGNED((u64)_text, EFI_KIMG_ALIGN)) +- efi_err("FIRMWARE BUG: kernel image not aligned on %ldk boundary\n", +- EFI_KIMG_ALIGN >> 10); ++ if (!IS_ALIGNED((u64)_text, SEGMENT_ALIGN)) ++ efi_err("FIRMWARE BUG: kernel image not aligned on %dk boundary\n", ++ SEGMENT_ALIGN >> 10); + + kernel_size = _edata - _text; + kernel_memsize = kernel_size + (_end - _edata); +-- +2.34.1 + diff --git a/queue-5.15/ipv6-annotate-accesses-to-fn-fn_sernum.patch b/queue-5.15/ipv6-annotate-accesses-to-fn-fn_sernum.patch new file mode 100644 index 00000000000..493331512ee --- /dev/null +++ b/queue-5.15/ipv6-annotate-accesses-to-fn-fn_sernum.patch @@ -0,0 +1,197 @@ +From 2adb2fe8cd76a1617463803e1c7ed2bc1f2f0768 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 20 Jan 2022 09:41:12 -0800 +Subject: ipv6: annotate accesses to fn->fn_sernum + +From: Eric Dumazet + +[ Upstream commit aafc2e3285c2d7a79b7ee15221c19fbeca7b1509 ] + +struct fib6_node's fn_sernum field can be +read while other threads change it. + +Add READ_ONCE()/WRITE_ONCE() annotations. + +Do not change existing smp barriers in fib6_get_cookie_safe() +and __fib6_update_sernum_upto_root() + +syzbot reported: + +BUG: KCSAN: data-race in fib6_clean_node / inet6_csk_route_socket + +write to 0xffff88813df62e2c of 4 bytes by task 1920 on cpu 1: + fib6_clean_node+0xc2/0x260 net/ipv6/ip6_fib.c:2178 + fib6_walk_continue+0x38e/0x430 net/ipv6/ip6_fib.c:2112 + fib6_walk net/ipv6/ip6_fib.c:2160 [inline] + fib6_clean_tree net/ipv6/ip6_fib.c:2240 [inline] + __fib6_clean_all+0x1a9/0x2e0 net/ipv6/ip6_fib.c:2256 + fib6_flush_trees+0x6c/0x80 net/ipv6/ip6_fib.c:2281 + rt_genid_bump_ipv6 include/net/net_namespace.h:488 [inline] + addrconf_dad_completed+0x57f/0x870 net/ipv6/addrconf.c:4230 + addrconf_dad_work+0x908/0x1170 + process_one_work+0x3f6/0x960 kernel/workqueue.c:2307 + worker_thread+0x616/0xa70 kernel/workqueue.c:2454 + kthread+0x1bf/0x1e0 kernel/kthread.c:359 + ret_from_fork+0x1f/0x30 + +read to 0xffff88813df62e2c of 4 bytes by task 15701 on cpu 0: + fib6_get_cookie_safe include/net/ip6_fib.h:285 [inline] + rt6_get_cookie include/net/ip6_fib.h:306 [inline] + ip6_dst_store include/net/ip6_route.h:234 [inline] + inet6_csk_route_socket+0x352/0x3c0 net/ipv6/inet6_connection_sock.c:109 + inet6_csk_xmit+0x91/0x1e0 net/ipv6/inet6_connection_sock.c:121 + __tcp_transmit_skb+0x1323/0x1840 net/ipv4/tcp_output.c:1402 + tcp_transmit_skb net/ipv4/tcp_output.c:1420 [inline] + tcp_write_xmit+0x1450/0x4460 net/ipv4/tcp_output.c:2680 + __tcp_push_pending_frames+0x68/0x1c0 net/ipv4/tcp_output.c:2864 + tcp_push+0x2d9/0x2f0 net/ipv4/tcp.c:725 + mptcp_push_release net/mptcp/protocol.c:1491 [inline] + __mptcp_push_pending+0x46c/0x490 net/mptcp/protocol.c:1578 + mptcp_sendmsg+0x9ec/0xa50 net/mptcp/protocol.c:1764 + inet6_sendmsg+0x5f/0x80 net/ipv6/af_inet6.c:643 + 
sock_sendmsg_nosec net/socket.c:705 [inline] + sock_sendmsg net/socket.c:725 [inline] + kernel_sendmsg+0x97/0xd0 net/socket.c:745 + sock_no_sendpage+0x84/0xb0 net/core/sock.c:3086 + inet_sendpage+0x9d/0xc0 net/ipv4/af_inet.c:834 + kernel_sendpage+0x187/0x200 net/socket.c:3492 + sock_sendpage+0x5a/0x70 net/socket.c:1007 + pipe_to_sendpage+0x128/0x160 fs/splice.c:364 + splice_from_pipe_feed fs/splice.c:418 [inline] + __splice_from_pipe+0x207/0x500 fs/splice.c:562 + splice_from_pipe fs/splice.c:597 [inline] + generic_splice_sendpage+0x94/0xd0 fs/splice.c:746 + do_splice_from fs/splice.c:767 [inline] + direct_splice_actor+0x80/0xa0 fs/splice.c:936 + splice_direct_to_actor+0x345/0x650 fs/splice.c:891 + do_splice_direct+0x106/0x190 fs/splice.c:979 + do_sendfile+0x675/0xc40 fs/read_write.c:1245 + __do_sys_sendfile64 fs/read_write.c:1310 [inline] + __se_sys_sendfile64 fs/read_write.c:1296 [inline] + __x64_sys_sendfile64+0x102/0x140 fs/read_write.c:1296 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x44/0xd0 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x44/0xae + +value changed: 0x0000026f -> 0x00000271 + +Reported by Kernel Concurrency Sanitizer on: +CPU: 0 PID: 15701 Comm: syz-executor.2 Not tainted 5.16.0-syzkaller #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 + +The Fixes tag I chose is probably arbitrary, I do not think +we need to backport this patch to older kernels. + +Fixes: c5cff8561d2d ("ipv6: add rcu grace period before freeing fib6_node") +Signed-off-by: Eric Dumazet +Reported-by: syzbot +Link: https://lore.kernel.org/r/20220120174112.1126644-1-eric.dumazet@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + include/net/ip6_fib.h | 2 +- + net/ipv6/ip6_fib.c | 23 +++++++++++++---------- + net/ipv6/route.c | 2 +- + 3 files changed, 15 insertions(+), 12 deletions(-) + +diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h +index 83b8070d1cc93..c85b040728d7e 100644 +--- a/include/net/ip6_fib.h ++++ b/include/net/ip6_fib.h +@@ -281,7 +281,7 @@ static inline bool fib6_get_cookie_safe(const struct fib6_info *f6i, + fn = rcu_dereference(f6i->fib6_node); + + if (fn) { +- *cookie = fn->fn_sernum; ++ *cookie = READ_ONCE(fn->fn_sernum); + /* pairs with smp_wmb() in __fib6_update_sernum_upto_root() */ + smp_rmb(); + status = true; +diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c +index 0371d2c141455..a506e57c4032a 100644 +--- a/net/ipv6/ip6_fib.c ++++ b/net/ipv6/ip6_fib.c +@@ -111,7 +111,7 @@ void fib6_update_sernum(struct net *net, struct fib6_info *f6i) + fn = rcu_dereference_protected(f6i->fib6_node, + lockdep_is_held(&f6i->fib6_table->tb6_lock)); + if (fn) +- fn->fn_sernum = fib6_new_sernum(net); ++ WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net)); + } + + /* +@@ -589,12 +589,13 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb, + spin_unlock_bh(&table->tb6_lock); + if (res > 0) { + cb->args[4] = 1; +- cb->args[5] = w->root->fn_sernum; ++ cb->args[5] = READ_ONCE(w->root->fn_sernum); + } + } else { +- if (cb->args[5] != w->root->fn_sernum) { ++ int sernum = READ_ONCE(w->root->fn_sernum); ++ if (cb->args[5] != sernum) { + /* Begin at the root if the tree changed */ +- cb->args[5] = w->root->fn_sernum; ++ cb->args[5] = sernum; + w->state = FWS_INIT; + w->node = w->root; + w->skip = w->count; +@@ -1344,7 +1345,7 @@ static void __fib6_update_sernum_upto_root(struct fib6_info *rt, + /* paired with smp_rmb() in fib6_get_cookie_safe() */ + smp_wmb(); + while 
(fn) { +- fn->fn_sernum = sernum; ++ WRITE_ONCE(fn->fn_sernum, sernum); + fn = rcu_dereference_protected(fn->parent, + lockdep_is_held(&rt->fib6_table->tb6_lock)); + } +@@ -2173,8 +2174,8 @@ static int fib6_clean_node(struct fib6_walker *w) + }; + + if (c->sernum != FIB6_NO_SERNUM_CHANGE && +- w->node->fn_sernum != c->sernum) +- w->node->fn_sernum = c->sernum; ++ READ_ONCE(w->node->fn_sernum) != c->sernum) ++ WRITE_ONCE(w->node->fn_sernum, c->sernum); + + if (!c->func) { + WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE); +@@ -2542,7 +2543,7 @@ static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter, + iter->w.state = FWS_INIT; + iter->w.node = iter->w.root; + iter->w.args = iter; +- iter->sernum = iter->w.root->fn_sernum; ++ iter->sernum = READ_ONCE(iter->w.root->fn_sernum); + INIT_LIST_HEAD(&iter->w.lh); + fib6_walker_link(net, &iter->w); + } +@@ -2570,8 +2571,10 @@ static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl, + + static void ipv6_route_check_sernum(struct ipv6_route_iter *iter) + { +- if (iter->sernum != iter->w.root->fn_sernum) { +- iter->sernum = iter->w.root->fn_sernum; ++ int sernum = READ_ONCE(iter->w.root->fn_sernum); ++ ++ if (iter->sernum != sernum) { ++ iter->sernum = sernum; + iter->w.state = FWS_INIT; + iter->w.node = iter->w.root; + WARN_ON(iter->w.skip); +diff --git a/net/ipv6/route.c b/net/ipv6/route.c +index 0632382a5427b..3c5bb49692206 100644 +--- a/net/ipv6/route.c ++++ b/net/ipv6/route.c +@@ -2802,7 +2802,7 @@ static void ip6_link_failure(struct sk_buff *skb) + if (from) { + fn = rcu_dereference(from->fib6_node); + if (fn && (rt->rt6i_flags & RTF_DEFAULT)) +- fn->fn_sernum = -1; ++ WRITE_ONCE(fn->fn_sernum, -1); + } + } + rcu_read_unlock(); +-- +2.34.1 + diff --git a/queue-5.15/ipv6_tunnel-rate-limit-warning-messages.patch b/queue-5.15/ipv6_tunnel-rate-limit-warning-messages.patch new file mode 100644 index 00000000000..83652c2e49e --- /dev/null +++ b/queue-5.15/ipv6_tunnel-rate-limit-warning-messages.patch @@ -0,0 +1,51 @@ +From 006c8ca2dd19aa9e6a73cb7e89e4f4a7af8f271b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 20 Jan 2022 10:05:46 +0200 +Subject: ipv6_tunnel: Rate limit warning messages + +From: Ido Schimmel + +[ Upstream commit 6cee105e7f2ced596373951d9ea08dacc3883c68 ] + +The warning messages can be invoked from the data path for every packet +transmitted through an ip6gre netdev, leading to high CPU utilization. + +Fix that by rate limiting the messages. + +Fixes: 09c6bbf090ec ("[IPV6]: Do mandatory IPv6 tunnel endpoint checks in realtime") +Reported-by: Maksym Yaremchuk +Tested-by: Maksym Yaremchuk +Signed-off-by: Ido Schimmel +Reviewed-by: Amit Cohen +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + net/ipv6/ip6_tunnel.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c +index 20a67efda47f5..fa8da8ff35b42 100644 +--- a/net/ipv6/ip6_tunnel.c ++++ b/net/ipv6/ip6_tunnel.c +@@ -1036,14 +1036,14 @@ int ip6_tnl_xmit_ctl(struct ip6_tnl *t, + + if (unlikely(!ipv6_chk_addr_and_flags(net, laddr, ldev, false, + 0, IFA_F_TENTATIVE))) +- pr_warn("%s xmit: Local address not yet configured!\n", +- p->name); ++ pr_warn_ratelimited("%s xmit: Local address not yet configured!\n", ++ p->name); + else if (!(p->flags & IP6_TNL_F_ALLOW_LOCAL_REMOTE) && + !ipv6_addr_is_multicast(raddr) && + unlikely(ipv6_chk_addr_and_flags(net, raddr, ldev, + true, 0, IFA_F_TENTATIVE))) +- pr_warn("%s xmit: Routing loop! 
Remote address found on this node!\n", +- p->name); ++ pr_warn_ratelimited("%s xmit: Routing loop! Remote address found on this node!\n", ++ p->name); + else + ret = 1; + rcu_read_unlock(); +-- +2.34.1 + diff --git a/queue-5.15/kvm-arm64-pkvm-use-the-mm_ops-indirection-for-cache-.patch b/queue-5.15/kvm-arm64-pkvm-use-the-mm_ops-indirection-for-cache-.patch new file mode 100644 index 00000000000..e7471ccdf12 --- /dev/null +++ b/queue-5.15/kvm-arm64-pkvm-use-the-mm_ops-indirection-for-cache-.patch @@ -0,0 +1,67 @@ +From faa9223e29cc788ae6234c967529d1978f9d9e27 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 14 Jan 2022 08:57:58 +0000 +Subject: KVM: arm64: pkvm: Use the mm_ops indirection for cache maintenance + +From: Marc Zyngier + +[ Upstream commit 094d00f8ca58c5d29b25e23b4daaed1ff1f13b41 ] + +CMOs issued from EL2 cannot directly use the kernel helpers, +as EL2 doesn't have a mapping of the guest pages. Oops. + +Instead, use the mm_ops indirection to use helpers that will +perform a mapping at EL2 and allow the CMO to be effective. + +Fixes: 25aa28691bb9 ("KVM: arm64: Move guest CMOs to the fault handlers") +Reviewed-by: Quentin Perret +Signed-off-by: Marc Zyngier +Link: https://lore.kernel.org/r/20220114125038.1336965-1-maz@kernel.org +Signed-off-by: Sasha Levin +--- + arch/arm64/kvm/hyp/pgtable.c | 18 ++++++------------ + 1 file changed, 6 insertions(+), 12 deletions(-) + +diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c +index f8ceebe4982eb..4c77ff556f0ae 100644 +--- a/arch/arm64/kvm/hyp/pgtable.c ++++ b/arch/arm64/kvm/hyp/pgtable.c +@@ -921,13 +921,9 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + */ + stage2_put_pte(ptep, mmu, addr, level, mm_ops); + +- if (need_flush) { +- kvm_pte_t *pte_follow = kvm_pte_follow(pte, mm_ops); +- +- dcache_clean_inval_poc((unsigned long)pte_follow, +- (unsigned long)pte_follow + +- kvm_granule_size(level)); +- } ++ if (need_flush && mm_ops->dcache_clean_inval_poc) ++ mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops), ++ kvm_granule_size(level)); + + if (childp) + mm_ops->put_page(childp); +@@ -1089,15 +1085,13 @@ static int stage2_flush_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + struct kvm_pgtable *pgt = arg; + struct kvm_pgtable_mm_ops *mm_ops = pgt->mm_ops; + kvm_pte_t pte = *ptep; +- kvm_pte_t *pte_follow; + + if (!kvm_pte_valid(pte) || !stage2_pte_cacheable(pgt, pte)) + return 0; + +- pte_follow = kvm_pte_follow(pte, mm_ops); +- dcache_clean_inval_poc((unsigned long)pte_follow, +- (unsigned long)pte_follow + +- kvm_granule_size(level)); ++ if (mm_ops->dcache_clean_inval_poc) ++ mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops), ++ kvm_granule_size(level)); + return 0; + } + +-- +2.34.1 + diff --git a/queue-5.15/mptcp-allow-changing-the-backup-bit-by-endpoint-id.patch b/queue-5.15/mptcp-allow-changing-the-backup-bit-by-endpoint-id.patch new file mode 100644 index 00000000000..cbdbe7a0017 --- /dev/null +++ b/queue-5.15/mptcp-allow-changing-the-backup-bit-by-endpoint-id.patch @@ -0,0 +1,61 @@ +From 68a8ab696c5cbc469b8d85f8fcb5da8df8bb4f90 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 3 Dec 2021 14:35:36 -0800 +Subject: mptcp: allow changing the "backup" bit by endpoint id + +From: Davide Caratti + +[ Upstream commit 602837e8479d20d49559b4b97b79d34c0efe7ecb ] + +a non-zero 'id' is sufficient to identify MPTCP endpoints: allow changing +the value of 'backup' bit by simply specifying the endpoint id. 
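+
+A sketch of the new matching rule, condensed from the diff below (same
+variable names): when no address is given the parsed family stays
+AF_UNSPEC and the lookup falls back to the endpoint id, with id 0
+rejected because it cannot name an endpoint:
+
+  if (addr.addr.family == AF_UNSPEC) {   /* no address supplied */
+          lookup_by_id = 1;
+          if (!addr.addr.id)             /* id 0 identifies nothing */
+                  return -EOPNOTSUPP;
+  }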
+ +Link: https://github.com/multipath-tcp/mptcp_net-next/issues/158 +Signed-off-by: Davide Caratti +Signed-off-by: Mat Martineau +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/mptcp/pm_netlink.c | 14 ++++++++++---- + 1 file changed, 10 insertions(+), 4 deletions(-) + +diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c +index d96860053816a..3be10bf22cf7c 100644 +--- a/net/mptcp/pm_netlink.c ++++ b/net/mptcp/pm_netlink.c +@@ -1711,22 +1711,28 @@ next: + + static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) + { ++ struct mptcp_pm_addr_entry addr = { .addr = { .family = AF_UNSPEC }, }, *entry; + struct nlattr *attr = info->attrs[MPTCP_PM_ATTR_ADDR]; + struct pm_nl_pernet *pernet = genl_info_pm_nl(info); +- struct mptcp_pm_addr_entry addr, *entry; + struct net *net = sock_net(skb->sk); +- u8 bkup = 0; ++ u8 bkup = 0, lookup_by_id = 0; + int ret; + +- ret = mptcp_pm_parse_addr(attr, info, true, &addr); ++ ret = mptcp_pm_parse_addr(attr, info, false, &addr); + if (ret < 0) + return ret; + + if (addr.flags & MPTCP_PM_ADDR_FLAG_BACKUP) + bkup = 1; ++ if (addr.addr.family == AF_UNSPEC) { ++ lookup_by_id = 1; ++ if (!addr.addr.id) ++ return -EOPNOTSUPP; ++ } + + list_for_each_entry(entry, &pernet->local_addr_list, list) { +- if (addresses_equal(&entry->addr, &addr.addr, true)) { ++ if ((!lookup_by_id && addresses_equal(&entry->addr, &addr.addr, true)) || ++ (lookup_by_id && entry->addr.id == addr.addr.id)) { + mptcp_nl_addr_backup(net, &entry->addr, bkup); + + if (bkup) +-- +2.34.1 + diff --git a/queue-5.15/mptcp-clean-up-harmless-false-expressions.patch b/queue-5.15/mptcp-clean-up-harmless-false-expressions.patch new file mode 100644 index 00000000000..c73a97c4f46 --- /dev/null +++ b/queue-5.15/mptcp-clean-up-harmless-false-expressions.patch @@ -0,0 +1,60 @@ +From ba9ade4de4e8f2fc4cbf8320e093ebf8f059014f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 17 Dec 2021 15:37:02 -0800 +Subject: mptcp: clean up harmless false expressions + +From: Jean Sacren + +[ Upstream commit 59060a47ca50bbdb1d863b73667a1065873ecc06 ] + +entry->addr.id is u8 with a range from 0 to 255 and MAX_ADDR_ID is 255. +We should drop both false expressions of (entry->addr.id > MAX_ADDR_ID). + +We should also remove the obsolete parentheses in the first if branch. + +Use U8_MAX for MAX_ADDR_ID and add a comment to show the link to +mptcp_addr_info.id as suggested by Mr. Matthieu Baerts. 
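+
+The dropped tests were dead code by type alone; roughly:
+
+  u8 id = entry->addr.id;        /* range is 0..255 by definition */
+
+  if (!id || id > MAX_ADDR_ID)   /* with MAX_ADDR_ID == U8_MAX == 255, */
+          goto out;              /* the second comparison never fires */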
+ +Reviewed-by: Matthieu Baerts +Signed-off-by: Jean Sacren +Signed-off-by: Mat Martineau +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/mptcp/pm_netlink.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c +index 3be10bf22cf7c..15c89d4fea4d2 100644 +--- a/net/mptcp/pm_netlink.c ++++ b/net/mptcp/pm_netlink.c +@@ -38,7 +38,8 @@ struct mptcp_pm_add_entry { + u8 retrans_times; + }; + +-#define MAX_ADDR_ID 255 ++/* max value of mptcp_addr_info.id */ ++#define MAX_ADDR_ID U8_MAX + #define BITMAP_SZ DIV_ROUND_UP(MAX_ADDR_ID + 1, BITS_PER_LONG) + + struct pm_nl_pernet { +@@ -831,14 +832,13 @@ find_next: + entry->addr.id = find_next_zero_bit(pernet->id_bitmap, + MAX_ADDR_ID + 1, + pernet->next_id); +- if ((!entry->addr.id || entry->addr.id > MAX_ADDR_ID) && +- pernet->next_id != 1) { ++ if (!entry->addr.id && pernet->next_id != 1) { + pernet->next_id = 1; + goto find_next; + } + } + +- if (!entry->addr.id || entry->addr.id > MAX_ADDR_ID) ++ if (!entry->addr.id) + goto out; + + __set_bit(entry->addr.id, pernet->id_bitmap); +-- +2.34.1 + diff --git a/queue-5.15/mptcp-fix-msk-traversal-in-mptcp_nl_cmd_set_flags.patch b/queue-5.15/mptcp-fix-msk-traversal-in-mptcp_nl_cmd_set_flags.patch new file mode 100644 index 00000000000..1d558efdb53 --- /dev/null +++ b/queue-5.15/mptcp-fix-msk-traversal-in-mptcp_nl_cmd_set_flags.patch @@ -0,0 +1,85 @@ +From 1eefa603bbf4fca0a4f4208e8f95f139588869e7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 20 Jan 2022 16:35:27 -0800 +Subject: mptcp: fix msk traversal in mptcp_nl_cmd_set_flags() + +From: Paolo Abeni + +[ Upstream commit 8e9eacad7ec7a9cbf262649ebf1fa6e6f6cc7d82 ] + +The MPTCP endpoint list is under RCU protection, guarded by the +pernet spinlock. mptcp_nl_cmd_set_flags() traverses the list +without acquiring the spin-lock nor under the RCU critical section. + +This change addresses the issue performing the lookup and the endpoint +update under the pernet spinlock. 
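+
+The resulting shape, sketched below (condensed from the diff), is the
+usual pattern for work that must not run under a spinlock: look up and
+update the entry while locked, snapshot it, then act on the copy after
+unlocking:
+
+  spin_lock_bh(&pernet->lock);
+  entry = __lookup_addr(pernet, &addr.addr, lookup_by_id);
+  if (!entry) {
+          spin_unlock_bh(&pernet->lock);
+          return -EINVAL;
+  }
+  /* ... update entry->flags under the lock ... */
+  addr = *entry;                 /* snapshot, so the lock can drop */
+  spin_unlock_bh(&pernet->lock);
+
+  mptcp_nl_addr_backup(net, &addr.addr, bkup);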
+ +Fixes: 0f9f696a502e ("mptcp: add set_flags command in PM netlink") +Signed-off-by: Paolo Abeni +Signed-off-by: Mat Martineau +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/mptcp/pm_netlink.c | 37 +++++++++++++++++++++++++++---------- + 1 file changed, 27 insertions(+), 10 deletions(-) + +diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c +index bba166ddacc78..7f11eb3e35137 100644 +--- a/net/mptcp/pm_netlink.c ++++ b/net/mptcp/pm_netlink.c +@@ -469,6 +469,20 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) + return NULL; + } + ++static struct mptcp_pm_addr_entry * ++__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info, ++ bool lookup_by_id) ++{ ++ struct mptcp_pm_addr_entry *entry; ++ ++ list_for_each_entry(entry, &pernet->local_addr_list, list) { ++ if ((!lookup_by_id && addresses_equal(&entry->addr, info, true)) || ++ (lookup_by_id && entry->addr.id == info->id)) ++ return entry; ++ } ++ return NULL; ++} ++ + static int + lookup_id_by_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *addr) + { +@@ -1753,18 +1767,21 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) + return -EOPNOTSUPP; + } + +- list_for_each_entry(entry, &pernet->local_addr_list, list) { +- if ((!lookup_by_id && addresses_equal(&entry->addr, &addr.addr, true)) || +- (lookup_by_id && entry->addr.id == addr.addr.id)) { +- mptcp_nl_addr_backup(net, &entry->addr, bkup); +- +- if (bkup) +- entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; +- else +- entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; +- } ++ spin_lock_bh(&pernet->lock); ++ entry = __lookup_addr(pernet, &addr.addr, lookup_by_id); ++ if (!entry) { ++ spin_unlock_bh(&pernet->lock); ++ return -EINVAL; + } + ++ if (bkup) ++ entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; ++ else ++ entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; ++ addr = *entry; ++ spin_unlock_bh(&pernet->lock); ++ ++ mptcp_nl_addr_backup(net, &addr.addr, bkup); + return 0; + } + +-- +2.34.1 + diff --git a/queue-5.15/mptcp-fix-removing-ids-bitmap-setting.patch b/queue-5.15/mptcp-fix-removing-ids-bitmap-setting.patch new file mode 100644 index 00000000000..6b203cccf5b --- /dev/null +++ b/queue-5.15/mptcp-fix-removing-ids-bitmap-setting.patch @@ -0,0 +1,38 @@ +From b607a1d248f0c837a4a59bd0f54b35c871916765 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 20 Jan 2022 16:35:28 -0800 +Subject: mptcp: fix removing ids bitmap setting + +From: Geliang Tang + +[ Upstream commit a4c0214fbee97c46e3f41fee37931d66c0fc3cb1 ] + +In mptcp_pm_nl_rm_addr_or_subflow(), the bit of rm_list->ids[i] in the +id_avail_bitmap should be set, not rm_list->ids[1]. This patch fixed it. 
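+
+In context this is a plain index slip inside the removal loop; a
+sketch of the intended flow:
+
+  for (i = 0; i < rm_list->nr; i++) {
+          /* ... tear down the subflow for rm_list->ids[i] ... */
+          __set_bit(rm_list->ids[i], msk->pm.id_avail_bitmap);
+          /* ids[1] always freed the same, possibly wrong, slot */
+  }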
+ +Fixes: 86e39e04482b ("mptcp: keep track of local endpoint still available for each msk") +Acked-by: Paolo Abeni +Signed-off-by: Geliang Tang +Signed-off-by: Mat Martineau +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/mptcp/pm_netlink.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c +index 7f11eb3e35137..84e6b55375e1d 100644 +--- a/net/mptcp/pm_netlink.c ++++ b/net/mptcp/pm_netlink.c +@@ -781,7 +781,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, + msk->pm.subflows--; + __MPTCP_INC_STATS(sock_net(sk), rm_type); + } +- __set_bit(rm_list->ids[1], msk->pm.id_avail_bitmap); ++ __set_bit(rm_list->ids[i], msk->pm.id_avail_bitmap); + if (!removed) + continue; + +-- +2.34.1 + diff --git a/queue-5.15/mptcp-keep-track-of-local-endpoint-still-available-f.patch b/queue-5.15/mptcp-keep-track-of-local-endpoint-still-available-f.patch new file mode 100644 index 00000000000..922c54fcbb8 --- /dev/null +++ b/queue-5.15/mptcp-keep-track-of-local-endpoint-still-available-f.patch @@ -0,0 +1,404 @@ +From 96327b679e618f5e286c988cdfa012750e660571 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 6 Jan 2022 16:20:22 -0800 +Subject: mptcp: keep track of local endpoint still available for each msk + +From: Paolo Abeni + +[ Upstream commit 86e39e04482b0aadf3ee3ed5fcf2d63816559d36 ] + +Include into the path manager status a bitmap tracking the list +of local endpoints still available - not yet used - for the +relevant mptcp socket. + +Keep such map updated at endpoint creation/deletion time, so +that we can easily skip already used endpoint at local address +selection time. + +The endpoint used by the initial subflow is lazyly accounted at +subflow creation time: the usage bitmap is be up2date before +endpoint selection and we avoid such unneeded task in some relevant +scenarios - e.g. busy servers accepting incoming subflows but +not creating any additional ones nor annuncing additional addresses. + +Overall this allows for fair local endpoints usage in case of +subflow failure. + +As a side effect, this patch also enforces that each endpoint +is used at most once for each mptcp connection. + +Signed-off-by: Paolo Abeni +Signed-off-by: Mat Martineau +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + net/mptcp/pm.c | 1 + + net/mptcp/pm_netlink.c | 125 +++++++++++------- + net/mptcp/protocol.c | 3 +- + net/mptcp/protocol.h | 12 +- + .../testing/selftests/net/mptcp/mptcp_join.sh | 5 +- + 5 files changed, 91 insertions(+), 55 deletions(-) + +diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c +index 6ab386ff32944..332ac6eda3ba4 100644 +--- a/net/mptcp/pm.c ++++ b/net/mptcp/pm.c +@@ -370,6 +370,7 @@ void mptcp_pm_data_init(struct mptcp_sock *msk) + WRITE_ONCE(msk->pm.accept_subflow, false); + WRITE_ONCE(msk->pm.remote_deny_join_id0, false); + msk->pm.status = 0; ++ bitmap_fill(msk->pm.id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + + spin_lock_init(&msk->pm.lock); + INIT_LIST_HEAD(&msk->pm.anno_list); +diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c +index 15c89d4fea4d2..bba166ddacc78 100644 +--- a/net/mptcp/pm_netlink.c ++++ b/net/mptcp/pm_netlink.c +@@ -38,10 +38,6 @@ struct mptcp_pm_add_entry { + u8 retrans_times; + }; + +-/* max value of mptcp_addr_info.id */ +-#define MAX_ADDR_ID U8_MAX +-#define BITMAP_SZ DIV_ROUND_UP(MAX_ADDR_ID + 1, BITS_PER_LONG) +- + struct pm_nl_pernet { + /* protects pernet updates */ + spinlock_t lock; +@@ -53,14 +49,14 @@ struct pm_nl_pernet { + unsigned int local_addr_max; + unsigned int subflows_max; + unsigned int next_id; +- unsigned long id_bitmap[BITMAP_SZ]; ++ DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + }; + + #define MPTCP_PM_ADDR_MAX 8 + #define ADD_ADDR_RETRANS_MAX 3 + + static bool addresses_equal(const struct mptcp_addr_info *a, +- struct mptcp_addr_info *b, bool use_port) ++ const struct mptcp_addr_info *b, bool use_port) + { + bool addr_equals = false; + +@@ -174,6 +170,9 @@ select_local_address(const struct pm_nl_pernet *pernet, + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SUBFLOW)) + continue; + ++ if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) ++ continue; ++ + if (entry->addr.family != sk->sk_family) { + #if IS_ENABLED(CONFIG_MPTCP_IPV6) + if ((entry->addr.family == AF_INET && +@@ -184,23 +183,17 @@ select_local_address(const struct pm_nl_pernet *pernet, + continue; + } + +- /* avoid any address already in use by subflows and +- * pending join +- */ +- if (!lookup_subflow_by_saddr(&msk->conn_list, &entry->addr)) { +- ret = entry; +- break; +- } ++ ret = entry; ++ break; + } + rcu_read_unlock(); + return ret; + } + + static struct mptcp_pm_addr_entry * +-select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos) ++select_signal_address(struct pm_nl_pernet *pernet, struct mptcp_sock *msk) + { + struct mptcp_pm_addr_entry *entry, *ret = NULL; +- int i = 0; + + rcu_read_lock(); + /* do not keep any additional per socket state, just signal +@@ -209,12 +202,14 @@ select_signal_address(struct pm_nl_pernet *pernet, unsigned int pos) + * can lead to additional addresses not being announced. 
+ */ + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list) { ++ if (!test_bit(entry->addr.id, msk->pm.id_avail_bitmap)) ++ continue; ++ + if (!(entry->flags & MPTCP_PM_ADDR_FLAG_SIGNAL)) + continue; +- if (i++ == pos) { +- ret = entry; +- break; +- } ++ ++ ret = entry; ++ break; + } + rcu_read_unlock(); + return ret; +@@ -258,9 +253,11 @@ EXPORT_SYMBOL_GPL(mptcp_pm_get_local_addr_max); + + static void check_work_pending(struct mptcp_sock *msk) + { +- if (msk->pm.add_addr_signaled == mptcp_pm_get_add_addr_signal_max(msk) && +- (msk->pm.local_addr_used == mptcp_pm_get_local_addr_max(msk) || +- msk->pm.subflows == mptcp_pm_get_subflows_max(msk))) ++ struct pm_nl_pernet *pernet = net_generic(sock_net((struct sock *)msk), pm_nl_pernet_id); ++ ++ if (msk->pm.subflows == mptcp_pm_get_subflows_max(msk) || ++ (find_next_and_bit(pernet->id_bitmap, msk->pm.id_avail_bitmap, ++ MPTCP_PM_MAX_ADDR_ID + 1, 0) == MPTCP_PM_MAX_ADDR_ID + 1)) + WRITE_ONCE(msk->pm.work_pending, false); + } + +@@ -460,6 +457,35 @@ static unsigned int fill_remote_addresses_vec(struct mptcp_sock *msk, bool fullm + return i; + } + ++static struct mptcp_pm_addr_entry * ++__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) ++{ ++ struct mptcp_pm_addr_entry *entry; ++ ++ list_for_each_entry(entry, &pernet->local_addr_list, list) { ++ if (entry->addr.id == id) ++ return entry; ++ } ++ return NULL; ++} ++ ++static int ++lookup_id_by_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *addr) ++{ ++ struct mptcp_pm_addr_entry *entry; ++ int ret = -1; ++ ++ rcu_read_lock(); ++ list_for_each_entry(entry, &pernet->local_addr_list, list) { ++ if (addresses_equal(&entry->addr, addr, entry->addr.port)) { ++ ret = entry->addr.id; ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ return ret; ++} ++ + static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) + { + struct sock *sk = (struct sock *)msk; +@@ -475,6 +501,19 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) + local_addr_max = mptcp_pm_get_local_addr_max(msk); + subflows_max = mptcp_pm_get_subflows_max(msk); + ++ /* do lazy endpoint usage accounting for the MPC subflows */ ++ if (unlikely(!(msk->pm.status & BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED))) && msk->first) { ++ struct mptcp_addr_info local; ++ int mpc_id; ++ ++ local_address((struct sock_common *)msk->first, &local); ++ mpc_id = lookup_id_by_addr(pernet, &local); ++ if (mpc_id < 0) ++ __clear_bit(mpc_id, msk->pm.id_avail_bitmap); ++ ++ msk->pm.status |= BIT(MPTCP_PM_MPC_ENDPOINT_ACCOUNTED); ++ } ++ + pr_debug("local %d:%d signal %d:%d subflows %d:%d\n", + msk->pm.local_addr_used, local_addr_max, + msk->pm.add_addr_signaled, add_addr_signal_max, +@@ -482,21 +521,16 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) + + /* check first for announce */ + if (msk->pm.add_addr_signaled < add_addr_signal_max) { +- local = select_signal_address(pernet, +- msk->pm.add_addr_signaled); ++ local = select_signal_address(pernet, msk); + + if (local) { + if (mptcp_pm_alloc_anno_list(msk, local)) { ++ __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + msk->pm.add_addr_signaled++; + mptcp_pm_announce_addr(msk, &local->addr, false); + mptcp_pm_nl_addr_send_ack(msk); + } +- } else { +- /* pick failed, avoid fourther attempts later */ +- msk->pm.local_addr_used = add_addr_signal_max; + } +- +- check_work_pending(msk); + } + + /* check if should create a new subflow */ +@@ -510,19 +544,16 @@ static void 
mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) + int i, nr; + + msk->pm.local_addr_used++; +- check_work_pending(msk); + nr = fill_remote_addresses_vec(msk, fullmesh, addrs); ++ if (nr) ++ __clear_bit(local->addr.id, msk->pm.id_avail_bitmap); + spin_unlock_bh(&msk->pm.lock); + for (i = 0; i < nr; i++) + __mptcp_subflow_connect(sk, &local->addr, &addrs[i]); + spin_lock_bh(&msk->pm.lock); +- return; + } +- +- /* lookup failed, avoid fourther attempts later */ +- msk->pm.local_addr_used = local_addr_max; +- check_work_pending(msk); + } ++ check_work_pending(msk); + } + + static void mptcp_pm_nl_fully_established(struct mptcp_sock *msk) +@@ -736,6 +767,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, + msk->pm.subflows--; + __MPTCP_INC_STATS(sock_net(sk), rm_type); + } ++ __set_bit(rm_list->ids[1], msk->pm.id_avail_bitmap); + if (!removed) + continue; + +@@ -765,6 +797,9 @@ void mptcp_pm_nl_work(struct mptcp_sock *msk) + + msk_owned_by_me(msk); + ++ if (!(pm->status & MPTCP_PM_WORK_MASK)) ++ return; ++ + spin_lock_bh(&msk->pm.lock); + + pr_debug("msk=%p status=%x", msk, pm->status); +@@ -810,7 +845,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, + /* to keep the code simple, don't do IDR-like allocation for address ID, + * just bail when we exceed limits + */ +- if (pernet->next_id == MAX_ADDR_ID) ++ if (pernet->next_id == MPTCP_PM_MAX_ADDR_ID) + pernet->next_id = 1; + if (pernet->addrs >= MPTCP_PM_ADDR_MAX) + goto out; +@@ -830,7 +865,7 @@ static int mptcp_pm_nl_append_new_local_addr(struct pm_nl_pernet *pernet, + if (!entry->addr.id) { + find_next: + entry->addr.id = find_next_zero_bit(pernet->id_bitmap, +- MAX_ADDR_ID + 1, ++ MPTCP_PM_MAX_ADDR_ID + 1, + pernet->next_id); + if (!entry->addr.id && pernet->next_id != 1) { + pernet->next_id = 1; +@@ -1197,18 +1232,6 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) + return 0; + } + +-static struct mptcp_pm_addr_entry * +-__lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) +-{ +- struct mptcp_pm_addr_entry *entry; +- +- list_for_each_entry(entry, &pernet->local_addr_list, list) { +- if (entry->addr.id == id) +- return entry; +- } +- return NULL; +-} +- + int mptcp_pm_get_flags_and_ifindex_by_id(struct net *net, unsigned int id, + u8 *flags, int *ifindex) + { +@@ -1467,7 +1490,7 @@ static int mptcp_nl_cmd_flush_addrs(struct sk_buff *skb, struct genl_info *info) + list_splice_init(&pernet->local_addr_list, &free_list); + __reset_counters(pernet); + pernet->next_id = 1; +- bitmap_zero(pernet->id_bitmap, MAX_ADDR_ID + 1); ++ bitmap_zero(pernet->id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + spin_unlock_bh(&pernet->lock); + mptcp_nl_remove_addrs_list(sock_net(skb->sk), &free_list); + synchronize_rcu(); +@@ -1577,7 +1600,7 @@ static int mptcp_nl_cmd_dump_addrs(struct sk_buff *msg, + pernet = net_generic(net, pm_nl_pernet_id); + + spin_lock_bh(&pernet->lock); +- for (i = id; i < MAX_ADDR_ID + 1; i++) { ++ for (i = id; i < MPTCP_PM_MAX_ADDR_ID + 1; i++) { + if (test_bit(i, pernet->id_bitmap)) { + entry = __lookup_addr_by_id(pernet, i); + if (!entry) +diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c +index 4c889552cde77..354f169ca120a 100644 +--- a/net/mptcp/protocol.c ++++ b/net/mptcp/protocol.c +@@ -2435,8 +2435,7 @@ static void mptcp_worker(struct work_struct *work) + + mptcp_check_fastclose(msk); + +- if (msk->pm.status) +- mptcp_pm_nl_work(msk); ++ mptcp_pm_nl_work(msk); + + if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags)) + 
mptcp_check_for_eof(msk); +diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h +index 82c5dc4d6b49d..9fc6f494075fa 100644 +--- a/net/mptcp/protocol.h ++++ b/net/mptcp/protocol.h +@@ -174,16 +174,25 @@ enum mptcp_pm_status { + MPTCP_PM_ADD_ADDR_SEND_ACK, + MPTCP_PM_RM_ADDR_RECEIVED, + MPTCP_PM_ESTABLISHED, +- MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */ + MPTCP_PM_SUBFLOW_ESTABLISHED, ++ MPTCP_PM_ALREADY_ESTABLISHED, /* persistent status, set after ESTABLISHED event */ ++ MPTCP_PM_MPC_ENDPOINT_ACCOUNTED /* persistent status, set after MPC local address is ++ * accounted int id_avail_bitmap ++ */ + }; + ++/* Status bits below MPTCP_PM_ALREADY_ESTABLISHED need pm worker actions */ ++#define MPTCP_PM_WORK_MASK ((1 << MPTCP_PM_ALREADY_ESTABLISHED) - 1) ++ + enum mptcp_addr_signal_status { + MPTCP_ADD_ADDR_SIGNAL, + MPTCP_ADD_ADDR_ECHO, + MPTCP_RM_ADDR_SIGNAL, + }; + ++/* max value of mptcp_addr_info.id */ ++#define MPTCP_PM_MAX_ADDR_ID U8_MAX ++ + struct mptcp_pm_data { + struct mptcp_addr_info local; + struct mptcp_addr_info remote; +@@ -202,6 +211,7 @@ struct mptcp_pm_data { + u8 local_addr_used; + u8 subflows; + u8 status; ++ DECLARE_BITMAP(id_avail_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); + struct mptcp_rm_list rm_list_tx; + struct mptcp_rm_list rm_list_rx; + }; +diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh +index 586af88194e56..0c12602fa22e8 100755 +--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh ++++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh +@@ -1068,7 +1068,10 @@ signal_address_tests() + ip netns exec $ns2 ./pm_nl_ctl add 10.0.3.2 flags signal + ip netns exec $ns2 ./pm_nl_ctl add 10.0.4.2 flags signal + run_tests $ns1 $ns2 10.0.1.1 +- chk_add_nr 4 4 ++ ++ # the server will not signal the address terminating ++ # the MPC subflow ++ chk_add_nr 3 3 + } + + link_failure_tests() +-- +2.34.1 + diff --git a/queue-5.15/net-fix-information-leakage-in-proc-net-ptype.patch b/queue-5.15/net-fix-information-leakage-in-proc-net-ptype.patch new file mode 100644 index 00000000000..21968b1e43f --- /dev/null +++ b/queue-5.15/net-fix-information-leakage-in-proc-net-ptype.patch @@ -0,0 +1,78 @@ +From 62cf11a278e6b862b0ad7d05a518dbd832f6935c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 18 Jan 2022 14:20:13 -0500 +Subject: net: fix information leakage in /proc/net/ptype + +From: Congyu Liu + +[ Upstream commit 47934e06b65637c88a762d9c98329ae6e3238888 ] + +In one net namespace, after creating a packet socket without binding +it to a device, users in other net namespaces can observe the new +`packet_type` added by this packet socket by reading `/proc/net/ptype` +file. This is minor information leakage as packet socket is +namespace aware. + +Add a net pointer in `packet_type` to keep the net namespace of +of corresponding packet socket. In `ptype_seq_show`, this net pointer +must be checked when it is not NULL. + +Fixes: 2feb27dbe00c ("[NETNS]: Minor information leak via /proc/net/ptype file.") +Signed-off-by: Congyu Liu +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + include/linux/netdevice.h | 1 + + net/core/net-procfs.c | 3 ++- + net/packet/af_packet.c | 2 ++ + 3 files changed, 5 insertions(+), 1 deletion(-) + +diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h +index ce81cc96a98d9..fba54624191a2 100644 +--- a/include/linux/netdevice.h ++++ b/include/linux/netdevice.h +@@ -2636,6 +2636,7 @@ struct packet_type { + struct net_device *); + bool (*id_match)(struct packet_type *ptype, + struct sock *sk); ++ struct net *af_packet_net; + void *af_packet_priv; + struct list_head list; + }; +diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c +index d8b9dbabd4a43..5b8016335acaf 100644 +--- a/net/core/net-procfs.c ++++ b/net/core/net-procfs.c +@@ -260,7 +260,8 @@ static int ptype_seq_show(struct seq_file *seq, void *v) + + if (v == SEQ_START_TOKEN) + seq_puts(seq, "Type Device Function\n"); +- else if (pt->dev == NULL || dev_net(pt->dev) == seq_file_net(seq)) { ++ else if ((!pt->af_packet_net || net_eq(pt->af_packet_net, seq_file_net(seq))) && ++ (!pt->dev || net_eq(dev_net(pt->dev), seq_file_net(seq)))) { + if (pt->type == htons(ETH_P_ALL)) + seq_puts(seq, "ALL "); + else +diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c +index 1bc7ef49e1487..1a138e8d32d66 100644 +--- a/net/packet/af_packet.c ++++ b/net/packet/af_packet.c +@@ -1738,6 +1738,7 @@ static int fanout_add(struct sock *sk, struct fanout_args *args) + match->prot_hook.dev = po->prot_hook.dev; + match->prot_hook.func = packet_rcv_fanout; + match->prot_hook.af_packet_priv = match; ++ match->prot_hook.af_packet_net = read_pnet(&match->net); + match->prot_hook.id_match = match_fanout_group; + match->max_num_members = args->max_num_members; + list_add(&match->list, &fanout_list); +@@ -3323,6 +3324,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, + po->prot_hook.func = packet_rcv_spkt; + + po->prot_hook.af_packet_priv = sk; ++ po->prot_hook.af_packet_net = sock_net(sk); + + if (proto) { + po->prot_hook.type = proto; +-- +2.34.1 + diff --git a/queue-5.15/net-phy-broadcom-hook-up-soft_reset-for-bcm54616s.patch b/queue-5.15/net-phy-broadcom-hook-up-soft_reset-for-bcm54616s.patch new file mode 100644 index 00000000000..7b85ce17243 --- /dev/null +++ b/queue-5.15/net-phy-broadcom-hook-up-soft_reset-for-bcm54616s.patch @@ -0,0 +1,61 @@ +From 340b01a2aa30d880662fbccca87b15467d53c257 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 18 Jan 2022 15:52:43 -0600 +Subject: net: phy: broadcom: hook up soft_reset for BCM54616S + +From: Robert Hancock + +[ Upstream commit d15c7e875d44367005370e6a82e8f3a382a04f9b ] + +A problem was encountered with the Bel-Fuse 1GBT-SFP05 SFP module (which +is a 1 Gbps copper module operating in SGMII mode with an internal +BCM54616S PHY device) using the Xilinx AXI Ethernet MAC core, where the +module would work properly on the initial insertion or boot of the +device, but after the device was rebooted, the link would either only +come up at 100 Mbps speeds or go up and down erratically. + +I found no meaningful changes in the PHY configuration registers between +the working and non-working boots, but the status registers seemed to +have a lot of error indications set on the SERDES side of the device on +the non-working boot. I suspect the problem is that whatever happens on +the SGMII link when the device is rebooted and the FPGA logic gets +reloaded ends up putting the module's onboard PHY into a bad state. 
+ +Since commit 6e2d85ec0559 ("net: phy: Stop with excessive soft reset") +the genphy_soft_reset call is not made automatically by the PHY core +unless the callback is explicitly specified in the driver structure. For +most of these Broadcom devices, there is probably a hardware reset that +gets asserted to reset the PHY during boot, however for SFP modules +(where the BCM54616S is commonly found) no such reset line exists, so if +the board keeps the SFP cage powered up across a reboot, it will end up +with no reset occurring during reboots. + +Hook up the genphy_soft_reset callback for BCM54616S to ensure that a +PHY reset is performed before the device is initialized. This appears to +fix the issue with erratic operation after a reboot with this SFP +module. + +Fixes: 6e2d85ec0559 ("net: phy: Stop with excessive soft reset") +Signed-off-by: Robert Hancock +Reviewed-by: Florian Fainelli +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/phy/broadcom.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/net/phy/broadcom.c b/drivers/net/phy/broadcom.c +index 83aea5c5cd03c..db26ff8ce7dbb 100644 +--- a/drivers/net/phy/broadcom.c ++++ b/drivers/net/phy/broadcom.c +@@ -768,6 +768,7 @@ static struct phy_driver broadcom_drivers[] = { + .phy_id_mask = 0xfffffff0, + .name = "Broadcom BCM54616S", + /* PHY_GBIT_FEATURES */ ++ .soft_reset = genphy_soft_reset, + .config_init = bcm54xx_config_init, + .config_aneg = bcm54616s_config_aneg, + .config_intr = bcm_phy_config_intr, +-- +2.34.1 + diff --git a/queue-5.15/net-stmmac-dwmac-visconti-fix-bit-definitions-for-et.patch b/queue-5.15/net-stmmac-dwmac-visconti-fix-bit-definitions-for-et.patch new file mode 100644 index 00000000000..d28d2693746 --- /dev/null +++ b/queue-5.15/net-stmmac-dwmac-visconti-fix-bit-definitions-for-et.patch @@ -0,0 +1,60 @@ +From 2838c988be07cfa57a51a5eb22da09d8f039fad9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 19 Jan 2022 13:46:47 +0900 +Subject: net: stmmac: dwmac-visconti: Fix bit definitions for ETHER_CLK_SEL + +From: Yuji Ishikawa + +[ Upstream commit 1ba1a4a90fa416a6f389206416c5f488cf8b1543 ] + +just 0 should be used to represent cleared bits + +* ETHER_CLK_SEL_DIV_SEL_20 +* ETHER_CLK_SEL_TX_CLK_EXT_SEL_IN +* ETHER_CLK_SEL_RX_CLK_EXT_SEL_IN +* ETHER_CLK_SEL_TX_CLK_O_TX_I +* ETHER_CLK_SEL_RMII_CLK_SEL_IN + +Fixes: b38dd98ff8d0 ("net: stmmac: Add Toshiba Visconti SoCs glue driver") +Signed-off-by: Yuji Ishikawa +Reviewed-by: Nobuhiro Iwamatsu +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c +index fac788718c045..1c599a005aab6 100644 +--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c ++++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c +@@ -22,21 +22,21 @@ + #define ETHER_CLK_SEL_RMII_CLK_EN BIT(2) + #define ETHER_CLK_SEL_RMII_CLK_RST BIT(3) + #define ETHER_CLK_SEL_DIV_SEL_2 BIT(4) +-#define ETHER_CLK_SEL_DIV_SEL_20 BIT(0) ++#define ETHER_CLK_SEL_DIV_SEL_20 0 + #define ETHER_CLK_SEL_FREQ_SEL_125M (BIT(9) | BIT(8)) + #define ETHER_CLK_SEL_FREQ_SEL_50M BIT(9) + #define ETHER_CLK_SEL_FREQ_SEL_25M BIT(8) + #define ETHER_CLK_SEL_FREQ_SEL_2P5M 0 +-#define ETHER_CLK_SEL_TX_CLK_EXT_SEL_IN BIT(0) ++#define ETHER_CLK_SEL_TX_CLK_EXT_SEL_IN 0 + #define ETHER_CLK_SEL_TX_CLK_EXT_SEL_TXC BIT(10) + #define ETHER_CLK_SEL_TX_CLK_EXT_SEL_DIV BIT(11) +-#define ETHER_CLK_SEL_RX_CLK_EXT_SEL_IN BIT(0) ++#define ETHER_CLK_SEL_RX_CLK_EXT_SEL_IN 0 + #define ETHER_CLK_SEL_RX_CLK_EXT_SEL_RXC BIT(12) + #define ETHER_CLK_SEL_RX_CLK_EXT_SEL_DIV BIT(13) +-#define ETHER_CLK_SEL_TX_CLK_O_TX_I BIT(0) ++#define ETHER_CLK_SEL_TX_CLK_O_TX_I 0 + #define ETHER_CLK_SEL_TX_CLK_O_RMII_I BIT(14) + #define ETHER_CLK_SEL_TX_O_E_N_IN BIT(15) +-#define ETHER_CLK_SEL_RMII_CLK_SEL_IN BIT(0) ++#define ETHER_CLK_SEL_RMII_CLK_SEL_IN 0 + #define ETHER_CLK_SEL_RMII_CLK_SEL_RX_C BIT(16) + + #define ETHER_CLK_SEL_RX_TX_CLK_EN (ETHER_CLK_SEL_RX_CLK_EN | ETHER_CLK_SEL_TX_CLK_EN) +-- +2.34.1 + diff --git a/queue-5.15/net-stmmac-dwmac-visconti-fix-clock-configuration-fo.patch b/queue-5.15/net-stmmac-dwmac-visconti-fix-clock-configuration-fo.patch new file mode 100644 index 00000000000..9f5aedd9e8e --- /dev/null +++ b/queue-5.15/net-stmmac-dwmac-visconti-fix-clock-configuration-fo.patch @@ -0,0 +1,81 @@ +From 9a25d5f55f530a13abb63b04f35997acfb20c983 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 19 Jan 2022 13:46:48 +0900 +Subject: net: stmmac: dwmac-visconti: Fix clock configuration for RMII mode + +From: Yuji Ishikawa + +[ Upstream commit 0959bc4bd4206433ed101a1332a23e93ad16ec77 ] + +Bit pattern of the ETHER_CLOCK_SEL register for RMII/MII mode should be fixed. +Also, some control bits should be modified with a specific sequence. + +Fixes: b38dd98ff8d0 ("net: stmmac: Add Toshiba Visconti SoCs glue driver") +Signed-off-by: Yuji Ishikawa +Reviewed-by: Nobuhiro Iwamatsu +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + .../ethernet/stmicro/stmmac/dwmac-visconti.c | 32 ++++++++++++------- + 1 file changed, 21 insertions(+), 11 deletions(-) + +diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c +index 1c599a005aab6..4578c64953eac 100644 +--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c ++++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-visconti.c +@@ -96,31 +96,41 @@ static void visconti_eth_fix_mac_speed(void *priv, unsigned int speed) + val |= ETHER_CLK_SEL_TX_O_E_N_IN; + writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); + ++ /* Set Clock-Mux, Start clock, Set TX_O direction */ + switch (dwmac->phy_intf_sel) { + case ETHER_CONFIG_INTF_RGMII: + val = clk_sel_val | ETHER_CLK_SEL_RX_CLK_EXT_SEL_RXC; ++ writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); ++ ++ val |= ETHER_CLK_SEL_RX_TX_CLK_EN; ++ writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); ++ ++ val &= ~ETHER_CLK_SEL_TX_O_E_N_IN; ++ writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); + break; + case ETHER_CONFIG_INTF_RMII: + val = clk_sel_val | ETHER_CLK_SEL_RX_CLK_EXT_SEL_DIV | +- ETHER_CLK_SEL_TX_CLK_EXT_SEL_TXC | ETHER_CLK_SEL_TX_O_E_N_IN | ++ ETHER_CLK_SEL_TX_CLK_EXT_SEL_DIV | ETHER_CLK_SEL_TX_O_E_N_IN | + ETHER_CLK_SEL_RMII_CLK_SEL_RX_C; ++ writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); ++ ++ val |= ETHER_CLK_SEL_RMII_CLK_RST; ++ writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); ++ ++ val |= ETHER_CLK_SEL_RMII_CLK_EN | ETHER_CLK_SEL_RX_TX_CLK_EN; ++ writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); + break; + case ETHER_CONFIG_INTF_MII: + default: + val = clk_sel_val | ETHER_CLK_SEL_RX_CLK_EXT_SEL_RXC | +- ETHER_CLK_SEL_TX_CLK_EXT_SEL_DIV | ETHER_CLK_SEL_TX_O_E_N_IN | +- ETHER_CLK_SEL_RMII_CLK_EN; ++ ETHER_CLK_SEL_TX_CLK_EXT_SEL_TXC | ETHER_CLK_SEL_TX_O_E_N_IN; ++ writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); ++ ++ val |= ETHER_CLK_SEL_RX_TX_CLK_EN; ++ writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); + break; + } + +- /* Start clock */ +- writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); +- val |= ETHER_CLK_SEL_RX_TX_CLK_EN; +- writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); +- +- val &= ~ETHER_CLK_SEL_TX_O_E_N_IN; +- writel(val, dwmac->reg + REG_ETHER_CLOCK_SEL); +- + spin_unlock_irqrestore(&dwmac->lock, flags); + } + +-- +2.34.1 + diff --git a/queue-5.15/netfilter-conntrack-don-t-increment-invalid-counter-.patch b/queue-5.15/netfilter-conntrack-don-t-increment-invalid-counter-.patch new file mode 100644 index 00000000000..0957bbe92bb --- /dev/null +++ b/queue-5.15/netfilter-conntrack-don-t-increment-invalid-counter-.patch @@ -0,0 +1,51 @@ +From 27e186efa8e0b0f6f604bc4c41dc716cdf6d770f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 13 Jan 2022 21:37:58 +0100 +Subject: netfilter: conntrack: don't increment invalid counter on NF_REPEAT + +From: Florian Westphal + +[ Upstream commit 830af2eba40327abec64325a5b08b1e85c37a2e0 ] + +The packet isn't invalid, REPEAT means we're trying again after cleaning +out a stale connection, e.g. via tcp tracker. + +This caused increases of invalid stat counter in a test case involving +frequent connection reuse, even though no packet is actually invalid. 
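+
+The fix is purely an ordering change in the error path, sketched from
+the diff below: recognise the retry case before touching any counter:
+
+  if (ret == -NF_REPEAT)
+          goto repeat;   /* stale entry was flushed: retry, not invalid */
+
+  NF_CT_STAT_INC_ATOMIC(state->net, invalid);
+  if (ret == -NF_DROP)
+          NF_CT_STAT_INC_ATOMIC(state->net, drop);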
+ +Fixes: 56a62e2218f5 ("netfilter: conntrack: fix NF_REPEAT handling") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_conntrack_core.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c +index 4712a90a1820c..7f79974607643 100644 +--- a/net/netfilter/nf_conntrack_core.c ++++ b/net/netfilter/nf_conntrack_core.c +@@ -1922,15 +1922,17 @@ repeat: + pr_debug("nf_conntrack_in: Can't track with proto module\n"); + nf_conntrack_put(&ct->ct_general); + skb->_nfct = 0; +- NF_CT_STAT_INC_ATOMIC(state->net, invalid); +- if (ret == -NF_DROP) +- NF_CT_STAT_INC_ATOMIC(state->net, drop); + /* Special case: TCP tracker reports an attempt to reopen a + * closed/aborted connection. We have to go back and create a + * fresh conntrack. + */ + if (ret == -NF_REPEAT) + goto repeat; ++ ++ NF_CT_STAT_INC_ATOMIC(state->net, invalid); ++ if (ret == -NF_DROP) ++ NF_CT_STAT_INC_ATOMIC(state->net, drop); ++ + ret = -ret; + goto out; + } +-- +2.34.1 + diff --git a/queue-5.15/nfs-ensure-the-server-has-an-up-to-date-ctime-before.patch b/queue-5.15/nfs-ensure-the-server-has-an-up-to-date-ctime-before.patch new file mode 100644 index 00000000000..b72fc657e89 --- /dev/null +++ b/queue-5.15/nfs-ensure-the-server-has-an-up-to-date-ctime-before.patch @@ -0,0 +1,37 @@ +From 819a1638d01ceef0d04387333898e7e819560df2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 15 Dec 2021 16:38:15 -0500 +Subject: NFS: Ensure the server has an up to date ctime before hardlinking + +From: Trond Myklebust + +[ Upstream commit 204975036b34f55237bc44c8a302a88468ef21b5 ] + +Creating a hard link is required by POSIX to update the file ctime, so +ensure that the file data is synced to disk so that we don't clobber the +updated ctime by writing back after creating the hard link. + +Fixes: 9f7682728728 ("NFS: Move the delegation return down into nfs4_proc_link()") +Signed-off-by: Trond Myklebust +Signed-off-by: Anna Schumaker +Signed-off-by: Sasha Levin +--- + fs/nfs/dir.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c +index 5b68c44848caf..fdea6ff824cf8 100644 +--- a/fs/nfs/dir.c ++++ b/fs/nfs/dir.c +@@ -2382,6 +2382,8 @@ nfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) + + trace_nfs_link_enter(inode, dir, dentry); + d_drop(dentry); ++ if (S_ISREG(inode->i_mode)) ++ nfs_sync_inode(inode); + error = NFS_PROTO(dir)->link(inode, dir, &dentry->d_name); + if (error == 0) { + ihold(inode); +-- +2.34.1 + diff --git a/queue-5.15/nfs-ensure-the-server-has-an-up-to-date-ctime-before.patch-14723 b/queue-5.15/nfs-ensure-the-server-has-an-up-to-date-ctime-before.patch-14723 new file mode 100644 index 00000000000..e5800304f40 --- /dev/null +++ b/queue-5.15/nfs-ensure-the-server-has-an-up-to-date-ctime-before.patch-14723 @@ -0,0 +1,37 @@ +From f255aff22cfe3d57206c276db792e6b8c7619b60 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 15 Dec 2021 16:38:16 -0500 +Subject: NFS: Ensure the server has an up to date ctime before renaming + +From: Trond Myklebust + +[ Upstream commit 6ff9d99bb88faebf134ca668842349d9718e5464 ] + +Renaming a file is required by POSIX to update the file ctime, so +ensure that the file data is synced to disk so that we don't clobber the +updated ctime by writing back after creating the hard link. 
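+
+Both ctime fixes in this series (LINK above, RENAME here) enforce the
+same ordering, sketched below: flush dirty file data before the
+metadata operation, so no later WRITE can reach the server and
+overwrite the ctime the operation just set:
+
+  if (S_ISREG(old_inode->i_mode))
+          nfs_sync_inode(old_inode);   /* WRITEs hit the wire first */
+  /* ... then the RENAME (or LINK) updates ctime last */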
+ +Fixes: f2c2c552f119 ("NFS: Move delegation recall into the NFSv4 callback for rename_setup()") +Signed-off-by: Trond Myklebust +Signed-off-by: Anna Schumaker +Signed-off-by: Sasha Levin +--- + fs/nfs/dir.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c +index fdea6ff824cf8..ac0e1ab1b71ba 100644 +--- a/fs/nfs/dir.c ++++ b/fs/nfs/dir.c +@@ -2472,6 +2472,8 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, + } + } + ++ if (S_ISREG(old_inode->i_mode)) ++ nfs_sync_inode(old_inode); + task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, NULL); + if (IS_ERR(task)) { + error = PTR_ERR(task); +-- +2.34.1 + diff --git a/queue-5.15/nfsv4-handle-case-where-the-lookup-of-a-directory-fa.patch b/queue-5.15/nfsv4-handle-case-where-the-lookup-of-a-directory-fa.patch new file mode 100644 index 00000000000..27fb96138e1 --- /dev/null +++ b/queue-5.15/nfsv4-handle-case-where-the-lookup-of-a-directory-fa.patch @@ -0,0 +1,52 @@ +From c407b6e1c8205cf17ffa2218dbc9bd5fb00b7000 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 6 Jan 2022 18:24:02 -0500 +Subject: NFSv4: Handle case where the lookup of a directory fails + +From: Trond Myklebust + +[ Upstream commit ac795161c93699d600db16c1a8cc23a65a1eceaf ] + +If the application sets the O_DIRECTORY flag, and tries to open a +regular file, nfs_atomic_open() will punt to doing a regular lookup. +If the server then returns a regular file, we will happily return a +file descriptor with uninitialised open state. + +The fix is to return the expected ENOTDIR error in these cases. + +Reported-by: Lyu Tao +Fixes: 0dd2b474d0b6 ("nfs: implement i_op->atomic_open()") +Signed-off-by: Trond Myklebust +Signed-off-by: Anna Schumaker +Signed-off-by: Sasha Levin +--- + fs/nfs/dir.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c +index ac0e1ab1b71ba..2d156311c374d 100644 +--- a/fs/nfs/dir.c ++++ b/fs/nfs/dir.c +@@ -1982,6 +1982,19 @@ out: + + no_open: + res = nfs_lookup(dir, dentry, lookup_flags); ++ if (!res) { ++ inode = d_inode(dentry); ++ if ((lookup_flags & LOOKUP_DIRECTORY) && inode && ++ !S_ISDIR(inode->i_mode)) ++ res = ERR_PTR(-ENOTDIR); ++ } else if (!IS_ERR(res)) { ++ inode = d_inode(res); ++ if ((lookup_flags & LOOKUP_DIRECTORY) && inode && ++ !S_ISDIR(inode->i_mode)) { ++ dput(res); ++ res = ERR_PTR(-ENOTDIR); ++ } ++ } + if (switched) { + d_lookup_done(dentry); + if (!res) +-- +2.34.1 + diff --git a/queue-5.15/nfsv4-nfs_atomic_open-can-race-when-looking-up-a-non.patch b/queue-5.15/nfsv4-nfs_atomic_open-can-race-when-looking-up-a-non.patch new file mode 100644 index 00000000000..3b7fffed9a4 --- /dev/null +++ b/queue-5.15/nfsv4-nfs_atomic_open-can-race-when-looking-up-a-non.patch @@ -0,0 +1,45 @@ +From 46db83ff5fbfc5cb89d9903bad6b205eab622c9b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 6 Jan 2022 18:24:03 -0500 +Subject: NFSv4: nfs_atomic_open() can race when looking up a non-regular file + +From: Trond Myklebust + +[ Upstream commit 1751fc1db36f6f411709e143d5393f92d12137a9 ] + +If the file type changes back to being a regular file on the server +between the failed OPEN and our LOOKUP, then we need to re-run the OPEN. 
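+
+The added check boils down to the following test, applied to both the
+hashed and unhashed dentry cases (sketch of the hunk below):
+
+	/* O_DIRECTORY open fell back to a plain lookup: anything
+	 * other than a directory must fail with ENOTDIR.
+	 */
+	if ((lookup_flags & LOOKUP_DIRECTORY) && inode &&
+	    !S_ISDIR(inode->i_mode))
+		res = ERR_PTR(-ENOTDIR);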
+ +Fixes: 0dd2b474d0b6 ("nfs: implement i_op->atomic_open()") +Signed-off-by: Trond Myklebust +Signed-off-by: Anna Schumaker +Signed-off-by: Sasha Levin +--- + fs/nfs/dir.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c +index 2d156311c374d..ed79c1bd84a29 100644 +--- a/fs/nfs/dir.c ++++ b/fs/nfs/dir.c +@@ -1987,12 +1987,17 @@ no_open: + if ((lookup_flags & LOOKUP_DIRECTORY) && inode && + !S_ISDIR(inode->i_mode)) + res = ERR_PTR(-ENOTDIR); ++ else if (inode && S_ISREG(inode->i_mode)) ++ res = ERR_PTR(-EOPENSTALE); + } else if (!IS_ERR(res)) { + inode = d_inode(res); + if ((lookup_flags & LOOKUP_DIRECTORY) && inode && + !S_ISDIR(inode->i_mode)) { + dput(res); + res = ERR_PTR(-ENOTDIR); ++ } else if (inode && S_ISREG(inode->i_mode)) { ++ dput(res); ++ res = ERR_PTR(-EOPENSTALE); + } + } + if (switched) { +-- +2.34.1 + diff --git a/queue-5.15/octeontx2-af-cn10k-do-not-enable-rpm-loopback-for-lp.patch b/queue-5.15/octeontx2-af-cn10k-do-not-enable-rpm-loopback-for-lp.patch new file mode 100644 index 00000000000..b43db5b221c --- /dev/null +++ b/queue-5.15/octeontx2-af-cn10k-do-not-enable-rpm-loopback-for-lp.patch @@ -0,0 +1,64 @@ +From 069df600c33a90fe62f8eddf777e4da0a31795a9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 21 Jan 2022 12:04:45 +0530 +Subject: octeontx2-af: cn10k: Do not enable RPM loopback for LPC interfaces + +From: Geetha sowjanya + +[ Upstream commit df66b6ebc5dcf7253e35a640b9ec4add54195c25 ] + +Internal looback is not supported to low rate LPCS interface like +SGMII/QSGMII. Hence don't allow to enable for such interfaces. + +Fixes: 3ad3f8f93c81 ("octeontx2-af: cn10k: MAC internal loopback support") +Signed-off-by: Geetha sowjanya +Signed-off-by: Subbaraya Sundeep +Signed-off-by: Sunil Goutham +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + .../net/ethernet/marvell/octeontx2/af/rpm.c | 27 +++++++++---------- + 1 file changed, 12 insertions(+), 15 deletions(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c +index 07b0eafccad87..b3803577324e6 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rpm.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rpm.c +@@ -251,22 +251,19 @@ int rpm_lmac_internal_loopback(void *rpmd, int lmac_id, bool enable) + if (!rpm || lmac_id >= rpm->lmac_count) + return -ENODEV; + lmac_type = rpm->mac_ops->get_lmac_type(rpm, lmac_id); +- if (lmac_type == LMAC_MODE_100G_R) { +- cfg = rpm_read(rpm, lmac_id, RPMX_MTI_PCS100X_CONTROL1); +- +- if (enable) +- cfg |= RPMX_MTI_PCS_LBK; +- else +- cfg &= ~RPMX_MTI_PCS_LBK; +- rpm_write(rpm, lmac_id, RPMX_MTI_PCS100X_CONTROL1, cfg); +- } else { +- cfg = rpm_read(rpm, lmac_id, RPMX_MTI_LPCSX_CONTROL1); +- if (enable) +- cfg |= RPMX_MTI_PCS_LBK; +- else +- cfg &= ~RPMX_MTI_PCS_LBK; +- rpm_write(rpm, lmac_id, RPMX_MTI_LPCSX_CONTROL1, cfg); ++ ++ if (lmac_type == LMAC_MODE_QSGMII || lmac_type == LMAC_MODE_SGMII) { ++ dev_err(&rpm->pdev->dev, "loopback not supported for LPC mode\n"); ++ return 0; + } + ++ cfg = rpm_read(rpm, lmac_id, RPMX_MTI_PCS100X_CONTROL1); ++ ++ if (enable) ++ cfg |= RPMX_MTI_PCS_LBK; ++ else ++ cfg &= ~RPMX_MTI_PCS_LBK; ++ rpm_write(rpm, lmac_id, RPMX_MTI_PCS100X_CONTROL1, cfg); ++ + return 0; + } +-- +2.34.1 + diff --git a/queue-5.15/octeontx2-af-do-not-fixup-all-vf-action-entries.patch b/queue-5.15/octeontx2-af-do-not-fixup-all-vf-action-entries.patch new file mode 100644 index 00000000000..88da0239f3f --- /dev/null +++ b/queue-5.15/octeontx2-af-do-not-fixup-all-vf-action-entries.patch @@ -0,0 +1,131 @@ +From 89fd81ca0a9542b5236f7e29affc9a072b18deed Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 21 Jan 2022 12:04:39 +0530 +Subject: octeontx2-af: Do not fixup all VF action entries + +From: Subbaraya Sundeep + +[ Upstream commit d225c449ab2be25273a3674f476c6c0b57c50254 ] + +AF modifies all the rules destined for VF to use +the action same as default RSS action. This fixup +was needed because AF only installs default rules with +RSS action. But the action in rules installed by a PF +for its VFs should not be changed by this fixup. +This is because action can be drop or direct to +queue as specified by user(ntuple filters). +This patch fixes that problem. + +Fixes: 967db3529eca ("octeontx2-af: add support for multicast/promisc packet") +Signed-off-by: Subbaraya Sundeep +Signed-off-by: Naveen Mamindlapalli +Signed-off-by: Sunil Goutham +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + .../ethernet/marvell/octeontx2/af/rvu_npc.c | 22 ++++++++++++++++--- + .../marvell/octeontx2/af/rvu_npc_fs.c | 20 ++++++++++------- + 2 files changed, 31 insertions(+), 11 deletions(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +index 5efb4174e82df..87f18e32b4634 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc.c +@@ -402,6 +402,7 @@ static void npc_fixup_vf_rule(struct rvu *rvu, struct npc_mcam *mcam, + int blkaddr, int index, struct mcam_entry *entry, + bool *enable) + { ++ struct rvu_npc_mcam_rule *rule; + u16 owner, target_func; + struct rvu_pfvf *pfvf; + u64 rx_action; +@@ -423,6 +424,12 @@ static void npc_fixup_vf_rule(struct rvu *rvu, struct npc_mcam *mcam, + test_bit(NIXLF_INITIALIZED, &pfvf->flags))) + *enable = false; + ++ /* fix up not needed for the rules added by user(ntuple filters) */ ++ list_for_each_entry(rule, &mcam->mcam_rules, list) { ++ if (rule->entry == index) ++ return; ++ } ++ + /* copy VF default entry action to the VF mcam entry */ + rx_action = npc_get_default_entry_action(rvu, mcam, blkaddr, + target_func); +@@ -489,8 +496,8 @@ static void npc_config_mcam_entry(struct rvu *rvu, struct npc_mcam *mcam, + } + + /* PF installing VF rule */ +- if (intf == NIX_INTF_RX && actindex < mcam->bmap_entries) +- npc_fixup_vf_rule(rvu, mcam, blkaddr, index, entry, &enable); ++ if (is_npc_intf_rx(intf) && actindex < mcam->bmap_entries) ++ npc_fixup_vf_rule(rvu, mcam, blkaddr, actindex, entry, &enable); + + /* Set 'action' */ + rvu_write64(rvu, blkaddr, +@@ -916,7 +923,8 @@ static void npc_update_vf_flow_entry(struct rvu *rvu, struct npc_mcam *mcam, + int blkaddr, u16 pcifunc, u64 rx_action) + { + int actindex, index, bank, entry; +- bool enable; ++ struct rvu_npc_mcam_rule *rule; ++ bool enable, update; + + if (!(pcifunc & RVU_PFVF_FUNC_MASK)) + return; +@@ -924,6 +932,14 @@ static void npc_update_vf_flow_entry(struct rvu *rvu, struct npc_mcam *mcam, + mutex_lock(&mcam->lock); + for (index = 0; index < mcam->bmap_entries; index++) { + if (mcam->entry2target_pffunc[index] == pcifunc) { ++ update = true; ++ /* update not needed for the rules added via ntuple filters */ ++ list_for_each_entry(rule, &mcam->mcam_rules, list) { ++ if (rule->entry == index) ++ update = false; ++ } ++ if (!update) ++ continue; + bank = npc_get_bank(mcam, index); + actindex = index; + entry = index & (mcam->banksize - 1); +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c +index 51ddc7b81d0bd..ca404d51d9f56 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_npc_fs.c +@@ -1098,14 +1098,6 @@ find_rule: + write_req.cntr = rule->cntr; + } + +- err = rvu_mbox_handler_npc_mcam_write_entry(rvu, &write_req, +- &write_rsp); +- if (err) { +- rvu_mcam_remove_counter_from_rule(rvu, owner, rule); +- if (new) +- kfree(rule); +- return err; +- } + /* update rule */ + memcpy(&rule->packet, &dummy.packet, sizeof(rule->packet)); + memcpy(&rule->mask, &dummy.mask, sizeof(rule->mask)); +@@ -1129,6 +1121,18 @@ find_rule: + if (req->default_rule) + pfvf->def_ucast_rule = rule; + ++ /* write to mcam entry registers */ ++ err = rvu_mbox_handler_npc_mcam_write_entry(rvu, &write_req, ++ &write_rsp); ++ if (err) { ++ rvu_mcam_remove_counter_from_rule(rvu, owner, rule); ++ if (new) { ++ 
list_del(&rule->list); ++ kfree(rule); ++ } ++ return err; ++ } ++ + /* VF's MAC address is being changed via PF */ + if (pf_set_vfs_mac) { + ether_addr_copy(pfvf->default_mac, req->packet.dmac); +-- +2.34.1 + diff --git a/queue-5.15/octeontx2-af-fix-lbk-backpressure-id-count.patch b/queue-5.15/octeontx2-af-fix-lbk-backpressure-id-count.patch new file mode 100644 index 00000000000..f5e6f76fcc7 --- /dev/null +++ b/queue-5.15/octeontx2-af-fix-lbk-backpressure-id-count.patch @@ -0,0 +1,44 @@ +From 9917c059291ea7f257714fd0f3dc222912189ebf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 21 Jan 2022 12:04:40 +0530 +Subject: octeontx2-af: Fix LBK backpressure id count + +From: Sunil Goutham + +[ Upstream commit 00bfe94e388fe12bfd0d4f6361b1b1343374ff5b ] + +In rvu_nix_get_bpid() lbk_bpid_cnt is being read from +wrong register. Due to this backpressure enable is failing +for LBK VF32 onwards. This patch fixes that. + +Fixes: fe1939bb2340 ("octeontx2-af: Add SDP interface support") +Signed-off-by: Sunil Goutham +Signed-off-by: Subbaraya Sundeep +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +index 6970540dc4709..8ee324aabf2d6 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +@@ -511,11 +511,11 @@ static int rvu_nix_get_bpid(struct rvu *rvu, struct nix_bp_cfg_req *req, + cfg = rvu_read64(rvu, blkaddr, NIX_AF_CONST); + lmac_chan_cnt = cfg & 0xFF; + +- cfg = rvu_read64(rvu, blkaddr, NIX_AF_CONST1); +- sdp_chan_cnt = cfg & 0xFFF; +- + cgx_bpid_cnt = hw->cgx_links * lmac_chan_cnt; + lbk_bpid_cnt = hw->lbk_links * ((cfg >> 16) & 0xFF); ++ ++ cfg = rvu_read64(rvu, blkaddr, NIX_AF_CONST1); ++ sdp_chan_cnt = cfg & 0xFFF; + sdp_bpid_cnt = hw->sdp_links * sdp_chan_cnt; + + pfvf = rvu_get_pfvf(rvu, req->hdr.pcifunc); +-- +2.34.1 + diff --git a/queue-5.15/octeontx2-af-increase-link-credit-restore-polling-ti.patch b/queue-5.15/octeontx2-af-increase-link-credit-restore-polling-ti.patch new file mode 100644 index 00000000000..9c16f4891fa --- /dev/null +++ b/queue-5.15/octeontx2-af-increase-link-credit-restore-polling-ti.patch @@ -0,0 +1,55 @@ +From f8fedc91133ecf56b1538eddb4b5f59cdd64f27e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 21 Jan 2022 12:04:44 +0530 +Subject: octeontx2-af: Increase link credit restore polling timeout + +From: Geetha sowjanya + +[ Upstream commit 1581d61b42d985cefe7b71eea67ab3bfcbf34d0f ] + +It's been observed that sometimes link credit restore takes +a lot of time than the current timeout. This patch increases +the default timeout value and return the proper error value +on failure. + +Fixes: 1c74b89171c3 ("octeontx2-af: Wait for TX link idle for credits change") +Signed-off-by: Geetha sowjanya +Signed-off-by: Subbaraya Sundeep +Signed-off-by: Sunil Goutham +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/marvell/octeontx2/af/mbox.h | 1 + + drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c | 4 ++-- + 2 files changed, 3 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +index a8618259de943..26ad71842b3b2 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h ++++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +@@ -700,6 +700,7 @@ enum nix_af_status { + NIX_AF_ERR_BANDPROF_INVAL_REQ = -428, + NIX_AF_ERR_CQ_CTX_WRITE_ERR = -429, + NIX_AF_ERR_AQ_CTX_RETRY_WRITE = -430, ++ NIX_AF_ERR_LINK_CREDITS = -431, + }; + + /* For NIX RX vtag action */ +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +index 9d4cc0ae61474..959266894cf15 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +@@ -3891,8 +3891,8 @@ nix_config_link_credits(struct rvu *rvu, int blkaddr, int link, + NIX_AF_TL1X_SW_XOFF(schq), BIT_ULL(0)); + } + +- rc = -EBUSY; +- poll_tmo = jiffies + usecs_to_jiffies(10000); ++ rc = NIX_AF_ERR_LINK_CREDITS; ++ poll_tmo = jiffies + usecs_to_jiffies(200000); + /* Wait for credits to return */ + do { + if (time_after(jiffies, poll_tmo)) +-- +2.34.1 + diff --git a/queue-5.15/octeontx2-af-retry-until-rvu-block-reset-complete.patch b/queue-5.15/octeontx2-af-retry-until-rvu-block-reset-complete.patch new file mode 100644 index 00000000000..c3354ea661a --- /dev/null +++ b/queue-5.15/octeontx2-af-retry-until-rvu-block-reset-complete.patch @@ -0,0 +1,43 @@ +From b111e1c10fad5cb45c85b2d7ec080e8c37cabf31 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 21 Jan 2022 12:04:41 +0530 +Subject: octeontx2-af: Retry until RVU block reset complete + +From: Geetha sowjanya + +[ Upstream commit 03ffbc9914bd1130fba464f0a41c01372e5fc359 ] + +Few RVU blocks like SSO require more time for reset on some +silicons. Hence retrying the block reset until success. + +Fixes: c0fa2cff8822c ("octeontx2-af: Handle return value in block reset") +Signed-off-by: Geetha sowjanya +Signed-off-by: Subbaraya Sundeep +Signed-off-by: Sunil Goutham +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/marvell/octeontx2/af/rvu.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +index 90dc5343827f0..11ef46e72ddd9 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +@@ -520,8 +520,11 @@ static void rvu_block_reset(struct rvu *rvu, int blkaddr, u64 rst_reg) + + rvu_write64(rvu, blkaddr, rst_reg, BIT_ULL(0)); + err = rvu_poll_reg(rvu, blkaddr, rst_reg, BIT_ULL(63), true); +- if (err) +- dev_err(rvu->dev, "HW block:%d reset failed\n", blkaddr); ++ if (err) { ++ dev_err(rvu->dev, "HW block:%d reset timeout retrying again\n", blkaddr); ++ while (rvu_poll_reg(rvu, blkaddr, rst_reg, BIT_ULL(63), true) == -EBUSY) ++ ; ++ } + } + + static void rvu_reset_all_blocks(struct rvu *rvu) +-- +2.34.1 + diff --git a/queue-5.15/octeontx2-af-verify-cq-context-updates.patch b/queue-5.15/octeontx2-af-verify-cq-context-updates.patch new file mode 100644 index 00000000000..49dc64fb26b --- /dev/null +++ b/queue-5.15/octeontx2-af-verify-cq-context-updates.patch @@ -0,0 +1,148 @@ +From 9f6506cd4e6651d3395b59182ee00509f84a9d66 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 17 Sep 2021 18:40:24 +0530 +Subject: octeontx2-af: verify CQ context updates + +From: Hariprasad Kelam + +[ Upstream commit 14e94f9445a9e91d460f5d4b519f8892c3fb14bb ] + +As per HW errata AQ modification to CQ could be discarded on heavy +traffic. This patch implements workaround for the same after each +CQ write by AQ check whether the requested fields (except those +which HW can update eg: avg_level) are properly updated or not. + +If CQ context is not updated then perform AQ write again. + +Signed-off-by: Hariprasad Kelam +Signed-off-by: Sunil Goutham +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + .../net/ethernet/marvell/octeontx2/af/mbox.h | 2 + + .../ethernet/marvell/octeontx2/af/rvu_nix.c | 78 ++++++++++++++++++- + 2 files changed, 79 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +index 154877706a0e1..a8618259de943 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/mbox.h ++++ b/drivers/net/ethernet/marvell/octeontx2/af/mbox.h +@@ -698,6 +698,8 @@ enum nix_af_status { + NIX_AF_ERR_INVALID_BANDPROF = -426, + NIX_AF_ERR_IPOLICER_NOTSUPP = -427, + NIX_AF_ERR_BANDPROF_INVAL_REQ = -428, ++ NIX_AF_ERR_CQ_CTX_WRITE_ERR = -429, ++ NIX_AF_ERR_AQ_CTX_RETRY_WRITE = -430, + }; + + /* For NIX RX vtag action */ +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +index 8ee324aabf2d6..9d4cc0ae61474 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_nix.c +@@ -28,6 +28,7 @@ static int nix_verify_bandprof(struct nix_cn10k_aq_enq_req *req, + static int nix_free_all_bandprof(struct rvu *rvu, u16 pcifunc); + static void nix_clear_ratelimit_aggr(struct rvu *rvu, struct nix_hw *nix_hw, + u32 leaf_prof); ++static const char *nix_get_ctx_name(int ctype); + + enum mc_tbl_sz { + MC_TBL_SZ_256, +@@ -1061,10 +1062,68 @@ static int rvu_nix_blk_aq_enq_inst(struct rvu *rvu, struct nix_hw *nix_hw, + return 0; + } + ++static int rvu_nix_verify_aq_ctx(struct rvu *rvu, struct nix_hw *nix_hw, ++ struct nix_aq_enq_req *req, u8 ctype) ++{ ++ struct nix_cn10k_aq_enq_req aq_req; ++ struct nix_cn10k_aq_enq_rsp aq_rsp; ++ int rc, word; ++ ++ if (req->ctype != NIX_AQ_CTYPE_CQ) ++ return 0; ++ ++ rc = nix_aq_context_read(rvu, nix_hw, &aq_req, &aq_rsp, ++ req->hdr.pcifunc, ctype, req->qidx); ++ if (rc) { ++ dev_err(rvu->dev, ++ "%s: Failed to fetch %s%d context of PFFUNC 0x%x\n", ++ __func__, nix_get_ctx_name(ctype), req->qidx, ++ req->hdr.pcifunc); ++ return rc; ++ } ++ ++ /* Make copy of original context & mask which are required ++ * for resubmission ++ */ ++ memcpy(&aq_req.cq_mask, &req->cq_mask, sizeof(struct nix_cq_ctx_s)); ++ memcpy(&aq_req.cq, &req->cq, sizeof(struct nix_cq_ctx_s)); ++ ++ /* exclude fields which HW can update */ ++ aq_req.cq_mask.cq_err = 0; ++ aq_req.cq_mask.wrptr = 0; ++ aq_req.cq_mask.tail = 0; ++ aq_req.cq_mask.head = 0; ++ aq_req.cq_mask.avg_level = 0; ++ aq_req.cq_mask.update_time = 0; ++ aq_req.cq_mask.substream = 0; ++ ++ /* Context mask (cq_mask) holds mask value of fields which ++ * are changed in AQ WRITE operation. 
++ * for example cq.drop = 0xa; ++ * cq_mask.drop = 0xff; ++ * Below logic performs '&' between cq and cq_mask so that non ++ * updated fields are masked out for request and response ++ * comparison ++ */ ++ for (word = 0; word < sizeof(struct nix_cq_ctx_s) / sizeof(u64); ++ word++) { ++ *(u64 *)((u8 *)&aq_rsp.cq + word * 8) &= ++ (*(u64 *)((u8 *)&aq_req.cq_mask + word * 8)); ++ *(u64 *)((u8 *)&aq_req.cq + word * 8) &= ++ (*(u64 *)((u8 *)&aq_req.cq_mask + word * 8)); ++ } ++ ++ if (memcmp(&aq_req.cq, &aq_rsp.cq, sizeof(struct nix_cq_ctx_s))) ++ return NIX_AF_ERR_AQ_CTX_RETRY_WRITE; ++ ++ return 0; ++} ++ + static int rvu_nix_aq_enq_inst(struct rvu *rvu, struct nix_aq_enq_req *req, + struct nix_aq_enq_rsp *rsp) + { + struct nix_hw *nix_hw; ++ int err, retries = 5; + int blkaddr; + + blkaddr = rvu_get_blkaddr(rvu, BLKTYPE_NIX, req->hdr.pcifunc); +@@ -1075,7 +1134,24 @@ static int rvu_nix_aq_enq_inst(struct rvu *rvu, struct nix_aq_enq_req *req, + if (!nix_hw) + return NIX_AF_ERR_INVALID_NIXBLK; + +- return rvu_nix_blk_aq_enq_inst(rvu, nix_hw, req, rsp); ++retry: ++ err = rvu_nix_blk_aq_enq_inst(rvu, nix_hw, req, rsp); ++ ++ /* HW errata 'AQ Modification to CQ could be discarded on heavy traffic' ++ * As a work around perfrom CQ context read after each AQ write. If AQ ++ * read shows AQ write is not updated perform AQ write again. ++ */ ++ if (!err && req->op == NIX_AQ_INSTOP_WRITE) { ++ err = rvu_nix_verify_aq_ctx(rvu, nix_hw, req, NIX_AQ_CTYPE_CQ); ++ if (err == NIX_AF_ERR_AQ_CTX_RETRY_WRITE) { ++ if (retries--) ++ goto retry; ++ else ++ return NIX_AF_ERR_CQ_CTX_WRITE_ERR; ++ } ++ } ++ ++ return err; + } + + static const char *nix_get_ctx_name(int ctype) +-- +2.34.1 + diff --git a/queue-5.15/octeontx2-pf-cn10k-ensure-valid-pointers-are-freed-t.patch b/queue-5.15/octeontx2-pf-cn10k-ensure-valid-pointers-are-freed-t.patch new file mode 100644 index 00000000000..dc60a02795b --- /dev/null +++ b/queue-5.15/octeontx2-pf-cn10k-ensure-valid-pointers-are-freed-t.patch @@ -0,0 +1,54 @@ +From 3ce849716925d71e73715c2cee9972c3643202cd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 21 Jan 2022 12:04:43 +0530 +Subject: octeontx2-pf: cn10k: Ensure valid pointers are freed to aura + +From: Geetha sowjanya + +[ Upstream commit c5d731c54a17677939bd59ee8be4ed74d7485ba4 ] + +While freeing SQB pointers to aura, driver first memcpy to +target address and then triggers lmtst operation to free pointer +to the aura. We need to ensure(by adding dmb barrier)that memcpy +is finished before pointers are freed to the aura. This patch also +adds the missing sq context structure entry in debugfs. + +Fixes: ef6c8da71eaf ("octeontx2-pf: cn10K: Reserve LMTST lines per core") +Signed-off-by: Geetha sowjanya +Signed-off-by: Subbaraya Sundeep +Signed-off-by: Sunil Goutham +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c | 2 ++ + drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h | 1 + + 2 files changed, 3 insertions(+) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +index 49d822a98adab..f001579569a2b 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +@@ -1131,6 +1131,8 @@ static void print_nix_cn10k_sq_ctx(struct seq_file *m, + seq_printf(m, "W3: head_offset\t\t\t%d\nW3: smenq_next_sqb_vld\t\t%d\n\n", + sq_ctx->head_offset, sq_ctx->smenq_next_sqb_vld); + ++ seq_printf(m, "W3: smq_next_sq_vld\t\t%d\nW3: smq_pend\t\t\t%d\n", ++ sq_ctx->smq_next_sq_vld, sq_ctx->smq_pend); + seq_printf(m, "W4: next_sqb \t\t\t%llx\n\n", sq_ctx->next_sqb); + seq_printf(m, "W5: tail_sqb \t\t\t%llx\n\n", sq_ctx->tail_sqb); + seq_printf(m, "W6: smenq_sqb \t\t\t%llx\n\n", sq_ctx->smenq_sqb); +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +index a51ecd771d075..637450de189c8 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +@@ -591,6 +591,7 @@ static inline void __cn10k_aura_freeptr(struct otx2_nic *pfvf, u64 aura, + size++; + tar_addr |= ((size - 1) & 0x7) << 4; + } ++ dma_wmb(); + memcpy((u64 *)lmt_info->lmt_addr, ptrs, sizeof(u64) * num_ptrs); + /* Perform LMTST flush */ + cn10k_lmt_flush(val, tar_addr); +-- +2.34.1 + diff --git a/queue-5.15/octeontx2-pf-forward-error-codes-to-vf.patch b/queue-5.15/octeontx2-pf-forward-error-codes-to-vf.patch new file mode 100644 index 00000000000..cc84e6cf7ef --- /dev/null +++ b/queue-5.15/octeontx2-pf-forward-error-codes-to-vf.patch @@ -0,0 +1,48 @@ +From 447beaed92f63363ca25914e7f0202ba09659a38 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 21 Jan 2022 12:04:46 +0530 +Subject: octeontx2-pf: Forward error codes to VF + +From: Subbaraya Sundeep + +[ Upstream commit a8db854be28622a2477cb21cdf7f829adbb2c42d ] + +PF forwards its VF messages to AF and corresponding +replies from AF to VF. AF sets proper error code in the +replies after processing message requests. Currently PF +checks the error codes in replies and sends invalid +message to VF. This way VF lacks the information of +error code set by AF for its messages. This patch +changes that such that PF simply forwards AF replies +so that VF can handle error codes. + +Fixes: d424b6c02415 ("octeontx2-pf: Enable SRIOV and added VF mbox handling") +Signed-off-by: Subbaraya Sundeep +Signed-off-by: Sunil Goutham +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +index 53a3e8de1a51e..b1894d4045b8d 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +@@ -386,7 +386,12 @@ static int otx2_forward_vf_mbox_msgs(struct otx2_nic *pf, + dst_mdev->msg_size = mbox_hdr->msg_size; + dst_mdev->num_msgs = num_msgs; + err = otx2_sync_mbox_msg(dst_mbox); +- if (err) { ++ /* Error code -EIO indicate there is a communication failure ++ * to the AF. 
Rest of the error codes indicate that AF processed ++ * VF messages and set the error codes in response messages ++ * (if any) so simply forward responses to VF. ++ */ ++ if (err == -EIO) { + dev_warn(pf->dev, + "AF not responding to VF%d messages\n", vf); + /* restore PF mbase and exit */ +-- +2.34.1 + diff --git a/queue-5.15/perf-fix-perf_event_read_local-time.patch b/queue-5.15/perf-fix-perf_event_read_local-time.patch new file mode 100644 index 00000000000..7f30536f51b --- /dev/null +++ b/queue-5.15/perf-fix-perf_event_read_local-time.patch @@ -0,0 +1,575 @@ +From 4059b9d5d31f6330507d7534fc11bb01670677b4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 20 Dec 2021 13:19:52 +0100 +Subject: perf: Fix perf_event_read_local() time + +From: Peter Zijlstra + +[ Upstream commit 09f5e7dc7ad705289e1b1ec065439aa3c42951c4 ] + +Time readers that cannot take locks (due to NMI etc..) currently make +use of perf_event::shadow_ctx_time, which, for that event gives: + + time' = now + (time - timestamp) + +or, alternatively arranged: + + time' = time + (now - timestamp) + +IOW, the progression of time since the last time the shadow_ctx_time +was updated. + +There's problems with this: + + A) the shadow_ctx_time is per-event, even though the ctx_time it + reflects is obviously per context. The direct concequence of this + is that the context needs to iterate all events all the time to + keep the shadow_ctx_time in sync. + + B) even with the prior point, the context itself might not be active + meaning its time should not advance to begin with. + + C) shadow_ctx_time isn't consistently updated when ctx_time is + +There are 3 users of this stuff, that suffer differently from this: + + - calc_timer_values() + - perf_output_read() + - perf_event_update_userpage() /* A */ + + - perf_event_read_local() /* A,B */ + +In particular, perf_output_read() doesn't suffer at all, because it's +sample driven and hence only relevant when the event is actually +running. + +This same was supposed to be true for perf_event_update_userpage(), +after all self-monitoring implies the context is active *HOWEVER*, as +per commit f79256532682 ("perf/core: fix userpage->time_enabled of +inactive events") this goes wrong when combined with counter +overcommit, in that case those events that do not get scheduled when +the context becomes active (task events typically) miss out on the +EVENT_TIME update and ENABLED time is inflated (for a little while) +with the time the context was inactive. Once the event gets rotated +in, this gets corrected, leading to a non-monotonic timeflow. + +perf_event_read_local() made things even worse, it can request time at +any point, suffering all the problems perf_event_update_userpage() +does and more. Because while perf_event_update_userpage() is limited +by the context being active, perf_event_read_local() users have no +such constraint. + +Therefore, completely overhaul things and do away with +perf_event::shadow_ctx_time. Instead have regular context time updates +keep track of this offset directly and provide perf_event_time_now() +to complement perf_event_time(). + +perf_event_time_now() will, in adition to being context wide, also +take into account if the context is active. For inactive context, it +will not advance time. + +This latter property means the cgroup perf_cgroup_info context needs +to grow addition state to track this. + +Additionally, since all this is strictly per-cpu, we can use barrier() +to order context activity vs context time. 
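+
+Concretely, every context time update now also publishes an offset
+(a sketch of the scheme implemented below):
+
+	WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp);
+
+so that an NMI-safe reader can do:
+
+	u64 perf_event_time_now(struct perf_event *event, u64 now)
+	{
+		...
+		if (!(__load_acquire(&ctx->is_active) & EVENT_TIME))
+			return ctx->time;	/* inactive: time stands still */
+		return now + READ_ONCE(ctx->timeoffset);
+	}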
+ +Fixes: 7d9285e82db5 ("perf/bpf: Extend the perf_event_read_local() interface, a.k.a. "bpf: perf event change needed for subsequent bpf helpers"") +Signed-off-by: Peter Zijlstra (Intel) +Tested-by: Song Liu +Tested-by: Namhyung Kim +Link: https://lkml.kernel.org/r/YcB06DasOBtU0b00@hirez.programming.kicks-ass.net +Signed-off-by: Sasha Levin +--- + include/linux/perf_event.h | 15 +-- + kernel/events/core.c | 246 ++++++++++++++++++++++--------------- + 2 files changed, 149 insertions(+), 112 deletions(-) + +diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h +index ae1f0c8b75623..6cce33e7e7acc 100644 +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -680,18 +680,6 @@ struct perf_event { + u64 total_time_running; + u64 tstamp; + +- /* +- * timestamp shadows the actual context timing but it can +- * be safely used in NMI interrupt context. It reflects the +- * context time as it was when the event was last scheduled in, +- * or when ctx_sched_in failed to schedule the event because we +- * run out of PMC. +- * +- * ctx_time already accounts for ctx->timestamp. Therefore to +- * compute ctx_time for a sample, simply add perf_clock(). +- */ +- u64 shadow_ctx_time; +- + struct perf_event_attr attr; + u16 header_size; + u16 id_header_size; +@@ -838,6 +826,7 @@ struct perf_event_context { + */ + u64 time; + u64 timestamp; ++ u64 timeoffset; + + /* + * These fields let us detect when two contexts have both +@@ -920,6 +909,8 @@ struct bpf_perf_event_data_kern { + struct perf_cgroup_info { + u64 time; + u64 timestamp; ++ u64 timeoffset; ++ int active; + }; + + struct perf_cgroup { +diff --git a/kernel/events/core.c b/kernel/events/core.c +index 0fe6a65bbd58f..0153f8f972834 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -674,6 +674,23 @@ perf_event_set_state(struct perf_event *event, enum perf_event_state state) + WRITE_ONCE(event->state, state); + } + ++/* ++ * UP store-release, load-acquire ++ */ ++ ++#define __store_release(ptr, val) \ ++do { \ ++ barrier(); \ ++ WRITE_ONCE(*(ptr), (val)); \ ++} while (0) ++ ++#define __load_acquire(ptr) \ ++({ \ ++ __unqual_scalar_typeof(*(ptr)) ___p = READ_ONCE(*(ptr)); \ ++ barrier(); \ ++ ___p; \ ++}) ++ + #ifdef CONFIG_CGROUP_PERF + + static inline bool +@@ -719,34 +736,51 @@ static inline u64 perf_cgroup_event_time(struct perf_event *event) + return t->time; + } + +-static inline void __update_cgrp_time(struct perf_cgroup *cgrp) ++static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) + { +- struct perf_cgroup_info *info; +- u64 now; +- +- now = perf_clock(); ++ struct perf_cgroup_info *t; + +- info = this_cpu_ptr(cgrp->info); ++ t = per_cpu_ptr(event->cgrp->info, event->cpu); ++ if (!__load_acquire(&t->active)) ++ return t->time; ++ now += READ_ONCE(t->timeoffset); ++ return now; ++} + +- info->time += now - info->timestamp; ++static inline void __update_cgrp_time(struct perf_cgroup_info *info, u64 now, bool adv) ++{ ++ if (adv) ++ info->time += now - info->timestamp; + info->timestamp = now; ++ /* ++ * see update_context_time() ++ */ ++ WRITE_ONCE(info->timeoffset, info->time - info->timestamp); + } + +-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) ++static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, bool final) + { + struct perf_cgroup *cgrp = cpuctx->cgrp; + struct cgroup_subsys_state *css; ++ struct perf_cgroup_info *info; + + if (cgrp) { ++ u64 now = perf_clock(); ++ + for (css = &cgrp->css; css; css = 
css->parent) { + cgrp = container_of(css, struct perf_cgroup, css); +- __update_cgrp_time(cgrp); ++ info = this_cpu_ptr(cgrp->info); ++ ++ __update_cgrp_time(info, now, true); ++ if (final) ++ __store_release(&info->active, 0); + } + } + } + + static inline void update_cgrp_time_from_event(struct perf_event *event) + { ++ struct perf_cgroup_info *info; + struct perf_cgroup *cgrp; + + /* +@@ -760,8 +794,10 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) + /* + * Do not update time when cgroup is not active + */ +- if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) +- __update_cgrp_time(event->cgrp); ++ if (cgroup_is_descendant(cgrp->css.cgroup, event->cgrp->css.cgroup)) { ++ info = this_cpu_ptr(event->cgrp->info); ++ __update_cgrp_time(info, perf_clock(), true); ++ } + } + + static inline void +@@ -785,7 +821,8 @@ perf_cgroup_set_timestamp(struct task_struct *task, + for (css = &cgrp->css; css; css = css->parent) { + cgrp = container_of(css, struct perf_cgroup, css); + info = this_cpu_ptr(cgrp->info); +- info->timestamp = ctx->timestamp; ++ __update_cgrp_time(info, ctx->timestamp, false); ++ __store_release(&info->active, 1); + } + } + +@@ -981,14 +1018,6 @@ out: + return ret; + } + +-static inline void +-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) +-{ +- struct perf_cgroup_info *t; +- t = per_cpu_ptr(event->cgrp->info, event->cpu); +- event->shadow_ctx_time = now - t->timestamp; +-} +- + static inline void + perf_cgroup_event_enable(struct perf_event *event, struct perf_event_context *ctx) + { +@@ -1066,7 +1095,8 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) + { + } + +-static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx) ++static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx, ++ bool final) + { + } + +@@ -1098,12 +1128,12 @@ perf_cgroup_switch(struct task_struct *task, struct task_struct *next) + { + } + +-static inline void +-perf_cgroup_set_shadow_time(struct perf_event *event, u64 now) ++static inline u64 perf_cgroup_event_time(struct perf_event *event) + { ++ return 0; + } + +-static inline u64 perf_cgroup_event_time(struct perf_event *event) ++static inline u64 perf_cgroup_event_time_now(struct perf_event *event, u64 now) + { + return 0; + } +@@ -1525,22 +1555,59 @@ static void perf_unpin_context(struct perf_event_context *ctx) + /* + * Update the record of the current time in a context. + */ +-static void update_context_time(struct perf_event_context *ctx) ++static void __update_context_time(struct perf_event_context *ctx, bool adv) + { + u64 now = perf_clock(); + +- ctx->time += now - ctx->timestamp; ++ if (adv) ++ ctx->time += now - ctx->timestamp; + ctx->timestamp = now; ++ ++ /* ++ * The above: time' = time + (now - timestamp), can be re-arranged ++ * into: time` = now + (time - timestamp), which gives a single value ++ * offset to compute future time without locks on. ++ * ++ * See perf_event_time_now(), which can be used from NMI context where ++ * it's (obviously) not possible to acquire ctx->lock in order to read ++ * both the above values in a consistent manner. 
++ */ ++ WRITE_ONCE(ctx->timeoffset, ctx->time - ctx->timestamp); ++} ++ ++static void update_context_time(struct perf_event_context *ctx) ++{ ++ __update_context_time(ctx, true); + } + + static u64 perf_event_time(struct perf_event *event) + { + struct perf_event_context *ctx = event->ctx; + ++ if (unlikely(!ctx)) ++ return 0; ++ + if (is_cgroup_event(event)) + return perf_cgroup_event_time(event); + +- return ctx ? ctx->time : 0; ++ return ctx->time; ++} ++ ++static u64 perf_event_time_now(struct perf_event *event, u64 now) ++{ ++ struct perf_event_context *ctx = event->ctx; ++ ++ if (unlikely(!ctx)) ++ return 0; ++ ++ if (is_cgroup_event(event)) ++ return perf_cgroup_event_time_now(event, now); ++ ++ if (!(__load_acquire(&ctx->is_active) & EVENT_TIME)) ++ return ctx->time; ++ ++ now += READ_ONCE(ctx->timeoffset); ++ return now; + } + + static enum event_type_t get_event_type(struct perf_event *event) +@@ -2346,7 +2413,7 @@ __perf_remove_from_context(struct perf_event *event, + + if (ctx->is_active & EVENT_TIME) { + update_context_time(ctx); +- update_cgrp_time_from_cpuctx(cpuctx); ++ update_cgrp_time_from_cpuctx(cpuctx, false); + } + + event_sched_out(event, cpuctx, ctx); +@@ -2357,6 +2424,9 @@ __perf_remove_from_context(struct perf_event *event, + list_del_event(event, ctx); + + if (!ctx->nr_events && ctx->is_active) { ++ if (ctx == &cpuctx->ctx) ++ update_cgrp_time_from_cpuctx(cpuctx, true); ++ + ctx->is_active = 0; + ctx->rotate_necessary = 0; + if (ctx->task) { +@@ -2478,40 +2548,6 @@ void perf_event_disable_inatomic(struct perf_event *event) + irq_work_queue(&event->pending); + } + +-static void perf_set_shadow_time(struct perf_event *event, +- struct perf_event_context *ctx) +-{ +- /* +- * use the correct time source for the time snapshot +- * +- * We could get by without this by leveraging the +- * fact that to get to this function, the caller +- * has most likely already called update_context_time() +- * and update_cgrp_time_xx() and thus both timestamp +- * are identical (or very close). Given that tstamp is, +- * already adjusted for cgroup, we could say that: +- * tstamp - ctx->timestamp +- * is equivalent to +- * tstamp - cgrp->timestamp. +- * +- * Then, in perf_output_read(), the calculation would +- * work with no changes because: +- * - event is guaranteed scheduled in +- * - no scheduled out in between +- * - thus the timestamp would be the same +- * +- * But this is a bit hairy. +- * +- * So instead, we have an explicit cgroup call to remain +- * within the time source all along. We believe it +- * is cleaner and simpler to understand. +- */ +- if (is_cgroup_event(event)) +- perf_cgroup_set_shadow_time(event, event->tstamp); +- else +- event->shadow_ctx_time = event->tstamp - ctx->timestamp; +-} +- + #define MAX_INTERRUPTS (~0ULL) + + static void perf_log_throttle(struct perf_event *event, int enable); +@@ -2552,8 +2588,6 @@ event_sched_in(struct perf_event *event, + + perf_pmu_disable(event->pmu); + +- perf_set_shadow_time(event, ctx); +- + perf_log_itrace_start(event); + + if (event->pmu->add(event, PERF_EF_START)) { +@@ -3247,16 +3281,6 @@ static void ctx_sched_out(struct perf_event_context *ctx, + return; + } + +- ctx->is_active &= ~event_type; +- if (!(ctx->is_active & EVENT_ALL)) +- ctx->is_active = 0; +- +- if (ctx->task) { +- WARN_ON_ONCE(cpuctx->task_ctx != ctx); +- if (!ctx->is_active) +- cpuctx->task_ctx = NULL; +- } +- + /* + * Always update time if it was set; not only when it changes. 
+ * Otherwise we can 'forget' to update time for any but the last +@@ -3270,7 +3294,22 @@ static void ctx_sched_out(struct perf_event_context *ctx, + if (is_active & EVENT_TIME) { + /* update (and stop) ctx time */ + update_context_time(ctx); +- update_cgrp_time_from_cpuctx(cpuctx); ++ update_cgrp_time_from_cpuctx(cpuctx, ctx == &cpuctx->ctx); ++ /* ++ * CPU-release for the below ->is_active store, ++ * see __load_acquire() in perf_event_time_now() ++ */ ++ barrier(); ++ } ++ ++ ctx->is_active &= ~event_type; ++ if (!(ctx->is_active & EVENT_ALL)) ++ ctx->is_active = 0; ++ ++ if (ctx->task) { ++ WARN_ON_ONCE(cpuctx->task_ctx != ctx); ++ if (!ctx->is_active) ++ cpuctx->task_ctx = NULL; + } + + is_active ^= ctx->is_active; /* changed bits */ +@@ -3707,13 +3746,19 @@ static noinline int visit_groups_merge(struct perf_cpu_context *cpuctx, + return 0; + } + ++/* ++ * Because the userpage is strictly per-event (there is no concept of context, ++ * so there cannot be a context indirection), every userpage must be updated ++ * when context time starts :-( ++ * ++ * IOW, we must not miss EVENT_TIME edges. ++ */ + static inline bool event_update_userpage(struct perf_event *event) + { + if (likely(!atomic_read(&event->mmap_count))) + return false; + + perf_event_update_time(event); +- perf_set_shadow_time(event, event->ctx); + perf_event_update_userpage(event); + + return true; +@@ -3797,13 +3842,23 @@ ctx_sched_in(struct perf_event_context *ctx, + struct task_struct *task) + { + int is_active = ctx->is_active; +- u64 now; + + lockdep_assert_held(&ctx->lock); + + if (likely(!ctx->nr_events)) + return; + ++ if (is_active ^ EVENT_TIME) { ++ /* start ctx time */ ++ __update_context_time(ctx, false); ++ perf_cgroup_set_timestamp(task, ctx); ++ /* ++ * CPU-release for the below ->is_active store, ++ * see __load_acquire() in perf_event_time_now() ++ */ ++ barrier(); ++ } ++ + ctx->is_active |= (event_type | EVENT_TIME); + if (ctx->task) { + if (!is_active) +@@ -3814,13 +3869,6 @@ ctx_sched_in(struct perf_event_context *ctx, + + is_active ^= ctx->is_active; /* changed bits */ + +- if (is_active & EVENT_TIME) { +- /* start ctx time */ +- now = perf_clock(); +- ctx->timestamp = now; +- perf_cgroup_set_timestamp(task, ctx); +- } +- + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. 
+@@ -4414,6 +4462,18 @@ static inline u64 perf_event_count(struct perf_event *event) + return local64_read(&event->count) + atomic64_read(&event->child_count); + } + ++static void calc_timer_values(struct perf_event *event, ++ u64 *now, ++ u64 *enabled, ++ u64 *running) ++{ ++ u64 ctx_time; ++ ++ *now = perf_clock(); ++ ctx_time = perf_event_time_now(event, *now); ++ __perf_update_times(event, ctx_time, enabled, running); ++} ++ + /* + * NMI-safe method to read a local event, that is an event that + * is: +@@ -4473,10 +4533,9 @@ int perf_event_read_local(struct perf_event *event, u64 *value, + + *value = local64_read(&event->count); + if (enabled || running) { +- u64 now = event->shadow_ctx_time + perf_clock(); +- u64 __enabled, __running; ++ u64 __enabled, __running, __now;; + +- __perf_update_times(event, now, &__enabled, &__running); ++ calc_timer_values(event, &__now, &__enabled, &__running); + if (enabled) + *enabled = __enabled; + if (running) +@@ -5798,18 +5857,6 @@ static int perf_event_index(struct perf_event *event) + return event->pmu->event_idx(event); + } + +-static void calc_timer_values(struct perf_event *event, +- u64 *now, +- u64 *enabled, +- u64 *running) +-{ +- u64 ctx_time; +- +- *now = perf_clock(); +- ctx_time = event->shadow_ctx_time + *now; +- __perf_update_times(event, ctx_time, enabled, running); +-} +- + static void perf_event_init_userpage(struct perf_event *event) + { + struct perf_event_mmap_page *userpg; +@@ -6349,7 +6396,6 @@ accounting: + ring_buffer_attach(event, rb); + + perf_event_update_time(event); +- perf_set_shadow_time(event, event->ctx); + perf_event_init_userpage(event); + perf_event_update_userpage(event); + } else { +-- +2.34.1 + diff --git a/queue-5.15/phylib-fix-potential-use-after-free.patch b/queue-5.15/phylib-fix-potential-use-after-free.patch new file mode 100644 index 00000000000..569dcf23415 --- /dev/null +++ b/queue-5.15/phylib-fix-potential-use-after-free.patch @@ -0,0 +1,58 @@ +From c7a186d60eebdacf2d9454601c5929562826e865 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 19 Jan 2022 17:27:48 +0100 +Subject: phylib: fix potential use-after-free +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Marek Behún + +[ Upstream commit cbda1b16687580d5beee38273f6241ae3725960c ] + +Commit bafbdd527d56 ("phylib: Add device reset GPIO support") added call +to phy_device_reset(phydev) after the put_device() call in phy_detach(). + +The comment before the put_device() call says that the phydev might go +away with put_device(). + +Fix potential use-after-free by calling phy_device_reset() before +put_device(). + +Fixes: bafbdd527d56 ("phylib: Add device reset GPIO support") +Signed-off-by: Marek Behún +Reviewed-by: Andrew Lunn +Link: https://lore.kernel.org/r/20220119162748.32418-1-kabel@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/phy/phy_device.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c +index 4f9990b47a377..28f4a383aba72 100644 +--- a/drivers/net/phy/phy_device.c ++++ b/drivers/net/phy/phy_device.c +@@ -1746,6 +1746,9 @@ void phy_detach(struct phy_device *phydev) + phy_driver_is_genphy_10g(phydev)) + device_release_driver(&phydev->mdio.dev); + ++ /* Assert the reset signal */ ++ phy_device_reset(phydev, 1); ++ + /* + * The phydev might go away on the put_device() below, so avoid + * a use-after-free bug by reading the underlying bus first. 
+@@ -1757,9 +1760,6 @@ void phy_detach(struct phy_device *phydev) + ndev_owner = dev->dev.parent->driver->owner; + if (ndev_owner != bus->owner) + module_put(bus->owner); +- +- /* Assert the reset signal */ +- phy_device_reset(phydev, 1); + } + EXPORT_SYMBOL(phy_detach); + +-- +2.34.1 + diff --git a/queue-5.15/powerpc-64s-mask-srr0-before-checking-against-the-ma.patch b/queue-5.15/powerpc-64s-mask-srr0-before-checking-against-the-ma.patch new file mode 100644 index 00000000000..b188e736285 --- /dev/null +++ b/queue-5.15/powerpc-64s-mask-srr0-before-checking-against-the-ma.patch @@ -0,0 +1,59 @@ +From 7129d076879177c945633bc47433edbb47d7bcd6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 17 Jan 2022 23:44:03 +1000 +Subject: powerpc/64s: Mask SRR0 before checking against the masked NIP + +From: Nicholas Piggin + +[ Upstream commit aee101d7b95a03078945681dd7f7ea5e4a1e7686 ] + +Commit 314f6c23dd8d ("powerpc/64s: Mask NIP before checking against +SRR0") masked off the low 2 bits of the NIP value in the interrupt +stack frame in case they are non-zero and mis-compare against a SRR0 +register value of a CPU which always reads back 0 from the 2 low bits +which are reserved. + +This now causes the opposite problem that an implementation which does +implement those bits in SRR0 will mis-compare against the masked NIP +value in which they have been cleared. QEMU is one such implementation, +and this is allowed by the architecture. + +This can be triggered by sigfuz by setting low bits of PT_NIP in the +signal context. + +Fix this for now by masking the SRR0 bits as well. Cleaner is probably +to sanitise these values before putting them in registers or stack, but +this is the quick and backportable fix. + +Fixes: 314f6c23dd8d ("powerpc/64s: Mask NIP before checking against SRR0") +Signed-off-by: Nicholas Piggin +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/20220117134403.2995059-1-npiggin@gmail.com +Signed-off-by: Sasha Levin +--- + arch/powerpc/kernel/interrupt_64.S | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S +index 4b1ff94e67eb4..4c6d1a8dcefed 100644 +--- a/arch/powerpc/kernel/interrupt_64.S ++++ b/arch/powerpc/kernel/interrupt_64.S +@@ -30,6 +30,7 @@ COMPAT_SYS_CALL_TABLE: + .ifc \srr,srr + mfspr r11,SPRN_SRR0 + ld r12,_NIP(r1) ++ clrrdi r11,r11,2 + clrrdi r12,r12,2 + 100: tdne r11,r12 + EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) +@@ -40,6 +41,7 @@ COMPAT_SYS_CALL_TABLE: + .else + mfspr r11,SPRN_HSRR0 + ld r12,_NIP(r1) ++ clrrdi r11,r11,2 + clrrdi r12,r12,2 + 100: tdne r11,r12 + EMIT_WARN_ENTRY 100b,__FILE__,__LINE__,(BUGFLAG_WARNING | BUGFLAG_ONCE) +-- +2.34.1 + diff --git a/queue-5.15/powerpc64-bpf-limit-ldbrx-to-processors-compliant-wi.patch b/queue-5.15/powerpc64-bpf-limit-ldbrx-to-processors-compliant-wi.patch new file mode 100644 index 00000000000..2433ec21d91 --- /dev/null +++ b/queue-5.15/powerpc64-bpf-limit-ldbrx-to-processors-compliant-wi.patch @@ -0,0 +1,104 @@ +From c94c276e869eb8084d695854f48475aa0df1e018 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 6 Jan 2022 17:15:12 +0530 +Subject: powerpc64/bpf: Limit 'ldbrx' to processors compliant with ISA v2.06 + +From: Naveen N. 
Rao + +[ Upstream commit 3f5f766d5f7f95a69a630da3544a1a0cee1cdddf ] + +Johan reported the below crash with test_bpf on ppc64 e5500: + + test_bpf: #296 ALU_END_FROM_LE 64: 0x0123456789abcdef -> 0x67452301 jited:1 + Oops: Exception in kernel mode, sig: 4 [#1] + BE PAGE_SIZE=4K SMP NR_CPUS=24 QEMU e500 + Modules linked in: test_bpf(+) + CPU: 0 PID: 76 Comm: insmod Not tainted 5.14.0-03771-g98c2059e008a-dirty #1 + NIP: 8000000000061c3c LR: 80000000006dea64 CTR: 8000000000061c18 + REGS: c0000000032d3420 TRAP: 0700 Not tainted (5.14.0-03771-g98c2059e008a-dirty) + MSR: 0000000080089000 CR: 88002822 XER: 20000000 IRQMASK: 0 + <...> + NIP [8000000000061c3c] 0x8000000000061c3c + LR [80000000006dea64] .__run_one+0x104/0x17c [test_bpf] + Call Trace: + .__run_one+0x60/0x17c [test_bpf] (unreliable) + .test_bpf_init+0x6a8/0xdc8 [test_bpf] + .do_one_initcall+0x6c/0x28c + .do_init_module+0x68/0x28c + .load_module+0x2460/0x2abc + .__do_sys_init_module+0x120/0x18c + .system_call_exception+0x110/0x1b8 + system_call_common+0xf0/0x210 + --- interrupt: c00 at 0x101d0acc + <...> + ---[ end trace 47b2bf19090bb3d0 ]--- + + Illegal instruction + +The illegal instruction turned out to be 'ldbrx' emitted for +BPF_FROM_[L|B]E, which was only introduced in ISA v2.06. Guard use of +the same and implement an alternative approach for older processors. + +Fixes: 156d0e290e969c ("powerpc/ebpf/jit: Implement JIT compiler for extended BPF") +Reported-by: Johan Almbladh +Signed-off-by: Naveen N. Rao +Tested-by: Johan Almbladh +Acked-by: Johan Almbladh +Signed-off-by: Michael Ellerman +Link: https://lore.kernel.org/r/d1e51c6fdf572062cf3009a751c3406bda01b832.1641468127.git.naveen.n.rao@linux.vnet.ibm.com +Signed-off-by: Sasha Levin +--- + arch/powerpc/include/asm/ppc-opcode.h | 1 + + arch/powerpc/net/bpf_jit_comp64.c | 22 +++++++++++++--------- + 2 files changed, 14 insertions(+), 9 deletions(-) + +diff --git a/arch/powerpc/include/asm/ppc-opcode.h b/arch/powerpc/include/asm/ppc-opcode.h +index baea657bc8687..bca31a61e57f8 100644 +--- a/arch/powerpc/include/asm/ppc-opcode.h ++++ b/arch/powerpc/include/asm/ppc-opcode.h +@@ -498,6 +498,7 @@ + #define PPC_RAW_LDX(r, base, b) (0x7c00002a | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b)) + #define PPC_RAW_LHZ(r, base, i) (0xa0000000 | ___PPC_RT(r) | ___PPC_RA(base) | IMM_L(i)) + #define PPC_RAW_LHBRX(r, base, b) (0x7c00062c | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b)) ++#define PPC_RAW_LWBRX(r, base, b) (0x7c00042c | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b)) + #define PPC_RAW_LDBRX(r, base, b) (0x7c000428 | ___PPC_RT(r) | ___PPC_RA(base) | ___PPC_RB(b)) + #define PPC_RAW_STWCX(s, a, b) (0x7c00012d | ___PPC_RS(s) | ___PPC_RA(a) | ___PPC_RB(b)) + #define PPC_RAW_CMPWI(a, i) (0x2c000000 | ___PPC_RA(a) | IMM_L(i)) +diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c +index 95a337b5dc2b4..57e1b6680365c 100644 +--- a/arch/powerpc/net/bpf_jit_comp64.c ++++ b/arch/powerpc/net/bpf_jit_comp64.c +@@ -633,17 +633,21 @@ bpf_alu32_trunc: + EMIT(PPC_RAW_MR(dst_reg, b2p[TMP_REG_1])); + break; + case 64: +- /* +- * Way easier and faster(?) 
to store the value +- * into stack and then use ldbrx +- * +- * ctx->seen will be reliable in pass2, but +- * the instructions generated will remain the +- * same across all passes +- */ ++ /* Store the value to stack and then use byte-reverse loads */ + PPC_BPF_STL(dst_reg, 1, bpf_jit_stack_local(ctx)); + EMIT(PPC_RAW_ADDI(b2p[TMP_REG_1], 1, bpf_jit_stack_local(ctx))); +- EMIT(PPC_RAW_LDBRX(dst_reg, 0, b2p[TMP_REG_1])); ++ if (cpu_has_feature(CPU_FTR_ARCH_206)) { ++ EMIT(PPC_RAW_LDBRX(dst_reg, 0, b2p[TMP_REG_1])); ++ } else { ++ EMIT(PPC_RAW_LWBRX(dst_reg, 0, b2p[TMP_REG_1])); ++ if (IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN)) ++ EMIT(PPC_RAW_SLDI(dst_reg, dst_reg, 32)); ++ EMIT(PPC_RAW_LI(b2p[TMP_REG_2], 4)); ++ EMIT(PPC_RAW_LWBRX(b2p[TMP_REG_2], b2p[TMP_REG_2], b2p[TMP_REG_1])); ++ if (IS_ENABLED(CONFIG_CPU_BIG_ENDIAN)) ++ EMIT(PPC_RAW_SLDI(b2p[TMP_REG_2], b2p[TMP_REG_2], 32)); ++ EMIT(PPC_RAW_OR(dst_reg, dst_reg, b2p[TMP_REG_2])); ++ } + break; + } + break; +-- +2.34.1 + diff --git a/queue-5.15/rxrpc-adjust-retransmission-backoff.patch b/queue-5.15/rxrpc-adjust-retransmission-backoff.patch new file mode 100644 index 00000000000..57a7672c336 --- /dev/null +++ b/queue-5.15/rxrpc-adjust-retransmission-backoff.patch @@ -0,0 +1,93 @@ +From 1cbb4005d9ae8dbb89b79eff4f521438ac714bf6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 21 Jan 2022 23:12:58 +0000 +Subject: rxrpc: Adjust retransmission backoff + +From: David Howells + +[ Upstream commit 2c13c05c5ff4b9fc907b07f7311821910ebaaf8a ] + +Improve retransmission backoff by only backing off when we retransmit data +packets rather than when we set the lost ack timer. + +To this end: + + (1) In rxrpc_resend(), use rxrpc_get_rto_backoff() when setting the + retransmission timer and only tell it that we are retransmitting if we + actually have things to retransmit. + + Note that it's possible for the retransmission algorithm to race with + the processing of a received ACK, so we may see no packets needing + retransmission. + + (2) In rxrpc_send_data_packet(), don't bump the backoff when setting the + ack_lost_at timer, as it may then get bumped twice. + +With this, when looking at one particular packet, the retransmission +intervals were seen to be 1.5ms, 2ms, 3ms, 5ms, 9ms, 17ms, 33ms, 71ms, +136ms, 264ms, 544ms, 1.088s, 2.1s, 4.2s and 8.3s. + +Fixes: c410bf01933e ("rxrpc: Fix the excessive initial retransmission timeout") +Suggested-by: Marc Dionne +Signed-off-by: David Howells +Reviewed-by: Marc Dionne +Tested-by: Marc Dionne +cc: linux-afs@lists.infradead.org +Link: https://lore.kernel.org/r/164138117069.2023386.17446904856843997127.stgit@warthog.procyon.org.uk/ +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + net/rxrpc/call_event.c | 8 +++----- + net/rxrpc/output.c | 2 +- + 2 files changed, 4 insertions(+), 6 deletions(-) + +diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c +index 6be2672a65eab..df864e6922679 100644 +--- a/net/rxrpc/call_event.c ++++ b/net/rxrpc/call_event.c +@@ -157,7 +157,7 @@ static void rxrpc_congestion_timeout(struct rxrpc_call *call) + static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) + { + struct sk_buff *skb; +- unsigned long resend_at, rto_j; ++ unsigned long resend_at; + rxrpc_seq_t cursor, seq, top; + ktime_t now, max_age, oldest, ack_ts; + int ix; +@@ -165,10 +165,8 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) + + _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); + +- rto_j = call->peer->rto_j; +- + now = ktime_get_real(); +- max_age = ktime_sub(now, jiffies_to_usecs(rto_j)); ++ max_age = ktime_sub(now, jiffies_to_usecs(call->peer->rto_j)); + + spin_lock_bh(&call->lock); + +@@ -213,7 +211,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) + } + + resend_at = nsecs_to_jiffies(ktime_to_ns(ktime_sub(now, oldest))); +- resend_at += jiffies + rto_j; ++ resend_at += jiffies + rxrpc_get_rto_backoff(call->peer, retrans); + WRITE_ONCE(call->resend_at, resend_at); + + if (unacked) +diff --git a/net/rxrpc/output.c b/net/rxrpc/output.c +index 10f2bf2e9068a..a45c83f22236e 100644 +--- a/net/rxrpc/output.c ++++ b/net/rxrpc/output.c +@@ -468,7 +468,7 @@ done: + if (call->peer->rtt_count > 1) { + unsigned long nowj = jiffies, ack_lost_at; + +- ack_lost_at = rxrpc_get_rto_backoff(call->peer, retrans); ++ ack_lost_at = rxrpc_get_rto_backoff(call->peer, false); + ack_lost_at += nowj; + WRITE_ONCE(call->ack_lost_at, ack_lost_at); + rxrpc_reduce_call_timer(call, ack_lost_at, nowj, +-- +2.34.1 + diff --git a/queue-5.15/sched-pelt-relax-the-sync-of-util_sum-with-util_avg.patch b/queue-5.15/sched-pelt-relax-the-sync-of-util_sum-with-util_avg.patch new file mode 100644 index 00000000000..2b55450ef7b --- /dev/null +++ b/queue-5.15/sched-pelt-relax-the-sync-of-util_sum-with-util_avg.patch @@ -0,0 +1,105 @@ +From b19759818555424de7d26fde6024f936d388f140 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 11 Jan 2022 14:46:56 +0100 +Subject: sched/pelt: Relax the sync of util_sum with util_avg + +From: Vincent Guittot + +[ Upstream commit 98b0d890220d45418cfbc5157b3382e6da5a12ab ] + +Rick reported performance regressions in bugzilla because of cpu frequency +being lower than before: + https://bugzilla.kernel.org/show_bug.cgi?id=215045 + +He bisected the problem to: +commit 1c35b07e6d39 ("sched/fair: Ensure _sum and _avg values stay consistent") + +This commit forces util_sum to be synced with the new util_avg after +removing the contribution of a task and before the next periodic sync. By +doing so util_sum is rounded to its lower bound and might lost up to +LOAD_AVG_MAX-1 of accumulated contribution which has not yet been +reflected in util_avg. + +Instead of always setting util_sum to the low bound of util_avg, which can +significantly lower the utilization of root cfs_rq after propagating the +change down into the hierarchy, we revert the change of util_sum and +propagate the difference. + +In addition, we also check that cfs's util_sum always stays above the +lower bound for a given util_avg as it has been observed that +sched_entity's util_sum is sometimes above cfs one. 
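+
+The lower bound enforced below can be written as (with
+PELT_MIN_DIVIDER = LOAD_AVG_MAX - 1024, the smallest value
+get_pelt_divider() can return, i.e. period_contrib == 0):
+
+	sa->util_sum >= sa->util_avg * PELT_MIN_DIVIDER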
+
+Fixes: 1c35b07e6d39 ("sched/fair: Ensure _sum and _avg values stay consistent")
+Reported-by: Rick Yiu
+Signed-off-by: Vincent Guittot
+Signed-off-by: Peter Zijlstra (Intel)
+Reviewed-by: Dietmar Eggemann
+Tested-by: Sachin Sant
+Link: https://lkml.kernel.org/r/20220111134659.24961-2-vincent.guittot@linaro.org
+Signed-off-by: Sasha Levin
+---
+ kernel/sched/fair.c | 16 +++++++++++++---
+ kernel/sched/pelt.h | 4 +++-
+ 2 files changed, 16 insertions(+), 4 deletions(-)
+
+diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
+index d41f966f5866a..6420580f2730b 100644
+--- a/kernel/sched/fair.c
++++ b/kernel/sched/fair.c
+@@ -3422,7 +3422,6 @@ void set_task_rq_fair(struct sched_entity *se,
+ 	se->avg.last_update_time = n_last_update_time;
+ }
+
+-
+ /*
+  * When on migration a sched_entity joins/leaves the PELT hierarchy, we need to
+  * propagate its contribution. The key to this propagation is the invariant
+@@ -3490,7 +3489,6 @@ void set_task_rq_fair(struct sched_entity *se,
+  * XXX: only do this for the part of runnable > running ?
+  *
+  */
+-
+ static inline void
+ update_tg_cfs_util(struct cfs_rq *cfs_rq, struct sched_entity *se, struct cfs_rq *gcfs_rq)
+ {
+@@ -3722,7 +3720,19 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+
+ 		r = removed_util;
+ 		sub_positive(&sa->util_avg, r);
+-		sa->util_sum = sa->util_avg * divider;
++		sub_positive(&sa->util_sum, r * divider);
++		/*
++		 * Because of rounding, se->util_sum might end up being +1 more than
++		 * cfs->util_sum. Although this is not a problem by itself, detaching
++		 * a lot of tasks with the rounding problem between 2 updates of
++		 * util_avg (~1ms) can make cfs->util_sum become null whereas
++		 * cfs->util_avg is not.
++		 * Check that util_sum is still above its lower bound for the new
++		 * util_avg. Given that period_contrib might have moved since the last
++		 * sync, we are only sure that util_sum must be above or equal to
++		 * util_avg * minimum possible divider.
++		 */
++		sa->util_sum = max_t(u32, sa->util_sum, sa->util_avg * PELT_MIN_DIVIDER);
+
+ 		r = removed_runnable;
+ 		sub_positive(&sa->runnable_avg, r);
+diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
+index e06071bf3472c..c336f5f481bca 100644
+--- a/kernel/sched/pelt.h
++++ b/kernel/sched/pelt.h
+@@ -37,9 +37,11 @@ update_irq_load_avg(struct rq *rq, u64 running)
+ }
+ #endif
+
++#define PELT_MIN_DIVIDER (LOAD_AVG_MAX - 1024)
++
+ static inline u32 get_pelt_divider(struct sched_avg *avg)
+ {
+-	return LOAD_AVG_MAX - 1024 + avg->period_contrib;
++	return PELT_MIN_DIVIDER + avg->period_contrib;
+ }
+
+ static inline void cfs_se_util_change(struct sched_avg *avg)
+--
+2.34.1
+
diff --git a/queue-5.15/selftests-mptcp-fix-ipv6-routing-setup.patch b/queue-5.15/selftests-mptcp-fix-ipv6-routing-setup.patch
new file mode 100644
index 00000000000..e967596dc80
--- /dev/null
+++ b/queue-5.15/selftests-mptcp-fix-ipv6-routing-setup.patch
@@ -0,0 +1,60 @@
+From e88ab3d1eeb91c03f3d87442859ad933f44f0107 Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Thu, 20 Jan 2022 16:35:29 -0800
+Subject: selftests: mptcp: fix ipv6 routing setup
+
+From: Paolo Abeni
+
+[ Upstream commit 9846921dba4936d92f7608315b5d1e0a8ec3a538 ]
+
+MPJ ipv6 selftests currently lack a per-link route to the server
+net. Additionally, ipv6 subflow endpoints are created without any
+interface specified. The end result is that in ipv6 self-tests
+subflows are all created on the same link, leading to unexpected
+delays and sporadic self-test failures.
+
+Fix the issue by adding the missing setup bits.
+ +Fixes: 523514ed0a99 ("selftests: mptcp: add ADD_ADDR IPv6 test cases") +Reported-and-tested-by: Geliang Tang +Signed-off-by: Paolo Abeni +Signed-off-by: Mat Martineau +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/net/mptcp/mptcp_join.sh | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/tools/testing/selftests/net/mptcp/mptcp_join.sh b/tools/testing/selftests/net/mptcp/mptcp_join.sh +index 0c12602fa22e8..38777d1ef766f 100755 +--- a/tools/testing/selftests/net/mptcp/mptcp_join.sh ++++ b/tools/testing/selftests/net/mptcp/mptcp_join.sh +@@ -75,6 +75,7 @@ init() + + # let $ns2 reach any $ns1 address from any interface + ip -net "$ns2" route add default via 10.0.$i.1 dev ns2eth$i metric 10$i ++ ip -net "$ns2" route add default via dead:beef:$i::1 dev ns2eth$i metric 10$i + done + } + +@@ -1386,7 +1387,7 @@ ipv6_tests() + reset + ip netns exec $ns1 ./pm_nl_ctl limits 0 1 + ip netns exec $ns2 ./pm_nl_ctl limits 0 1 +- ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 flags subflow ++ ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 dev ns2eth3 flags subflow + run_tests $ns1 $ns2 dead:beef:1::1 0 0 0 slow + chk_join_nr "single subflow IPv6" 1 1 1 + +@@ -1421,7 +1422,7 @@ ipv6_tests() + ip netns exec $ns1 ./pm_nl_ctl limits 0 2 + ip netns exec $ns1 ./pm_nl_ctl add dead:beef:2::1 flags signal + ip netns exec $ns2 ./pm_nl_ctl limits 1 2 +- ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 flags subflow ++ ip netns exec $ns2 ./pm_nl_ctl add dead:beef:3::2 dev ns2eth3 flags subflow + run_tests $ns1 $ns2 dead:beef:1::1 0 -1 -1 slow + chk_join_nr "remove subflow and signal IPv6" 2 2 2 + chk_add_nr 1 1 +-- +2.34.1 + diff --git a/queue-5.15/series b/queue-5.15/series index 9bd6f0ee498..8fc7808ea44 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -83,3 +83,40 @@ rpmsg-char-fix-race-between-the-release-of-rpmsg_ctrldev-and-cdev.patch rpmsg-char-fix-race-between-the-release-of-rpmsg_eptdev-and-cdev.patch scsi-elx-efct-don-t-use-gfp_kernel-under-spin-lock.patch scsi-bnx2fc-flush-destroy_work-queue-before-calling-bnx2fc_interface_put.patch +nfs-ensure-the-server-has-an-up-to-date-ctime-before.patch +nfs-ensure-the-server-has-an-up-to-date-ctime-before.patch-14723 +nfsv4-handle-case-where-the-lookup-of-a-directory-fa.patch +nfsv4-nfs_atomic_open-can-race-when-looking-up-a-non.patch +kvm-arm64-pkvm-use-the-mm_ops-indirection-for-cache-.patch +sunrpc-use-bit-macro-in-rpc_show_xprt_state.patch +sunrpc-don-t-dereference-xprt-snd_task-if-it-s-a-coo.patch +powerpc64-bpf-limit-ldbrx-to-processors-compliant-wi.patch +netfilter-conntrack-don-t-increment-invalid-counter-.patch +powerpc-64s-mask-srr0-before-checking-against-the-ma.patch +perf-fix-perf_event_read_local-time.patch +sched-pelt-relax-the-sync-of-util_sum-with-util_avg.patch +arm-9170-1-fix-panic-when-kasan-and-kprobe-are-enabl.patch +net-fix-information-leakage-in-proc-net-ptype.patch +net-phy-broadcom-hook-up-soft_reset-for-bcm54616s.patch +ipv6_tunnel-rate-limit-warning-messages.patch +net-stmmac-dwmac-visconti-fix-bit-definitions-for-et.patch +net-stmmac-dwmac-visconti-fix-clock-configuration-fo.patch +phylib-fix-potential-use-after-free.patch +ipv6-annotate-accesses-to-fn-fn_sernum.patch +mptcp-allow-changing-the-backup-bit-by-endpoint-id.patch +mptcp-clean-up-harmless-false-expressions.patch +mptcp-keep-track-of-local-endpoint-still-available-f.patch +mptcp-fix-msk-traversal-in-mptcp_nl_cmd_set_flags.patch +mptcp-fix-removing-ids-bitmap-setting.patch 
+selftests-mptcp-fix-ipv6-routing-setup.patch +octeontx2-af-do-not-fixup-all-vf-action-entries.patch +octeontx2-af-fix-lbk-backpressure-id-count.patch +octeontx2-af-retry-until-rvu-block-reset-complete.patch +octeontx2-pf-cn10k-ensure-valid-pointers-are-freed-t.patch +octeontx2-af-verify-cq-context-updates.patch +octeontx2-af-increase-link-credit-restore-polling-ti.patch +octeontx2-af-cn10k-do-not-enable-rpm-loopback-for-lp.patch +octeontx2-pf-forward-error-codes-to-vf.patch +rxrpc-adjust-retransmission-backoff.patch +efi-libstub-arm64-fix-image-check-alignment-at-entry.patch +io_uring-fix-bug-in-slow-unregistering-of-nodes.patch diff --git a/queue-5.15/sunrpc-don-t-dereference-xprt-snd_task-if-it-s-a-coo.patch b/queue-5.15/sunrpc-don-t-dereference-xprt-snd_task-if-it-s-a-coo.patch new file mode 100644 index 00000000000..1d3310ad876 --- /dev/null +++ b/queue-5.15/sunrpc-don-t-dereference-xprt-snd_task-if-it-s-a-coo.patch @@ -0,0 +1,63 @@ +From 163d8c8237eecaebd61404f0f4f6b7ae73e969bd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 13 Jan 2022 12:20:36 -0500 +Subject: SUNRPC: Don't dereference xprt->snd_task if it's a cookie + +From: Chuck Lever + +[ Upstream commit aed28b7a2d620cb5cd0c554cb889075c02e25e8e ] + +Fixes: e26d9972720e ("SUNRPC: Clean up scheduling of autoclose") +Signed-off-by: Chuck Lever +Signed-off-by: Anna Schumaker +Signed-off-by: Sasha Levin +--- + include/trace/events/sunrpc.h | 18 +++++++++++++----- + 1 file changed, 13 insertions(+), 5 deletions(-) + +diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h +index 312507cb341f4..daaf407e9e494 100644 +--- a/include/trace/events/sunrpc.h ++++ b/include/trace/events/sunrpc.h +@@ -936,7 +936,8 @@ TRACE_EVENT(rpc_socket_nospace, + { BIT(XPRT_REMOVE), "REMOVE" }, \ + { BIT(XPRT_CONGESTED), "CONGESTED" }, \ + { BIT(XPRT_CWND_WAIT), "CWND_WAIT" }, \ +- { BIT(XPRT_WRITE_SPACE), "WRITE_SPACE" }) ++ { BIT(XPRT_WRITE_SPACE), "WRITE_SPACE" }, \ ++ { BIT(XPRT_SND_IS_COOKIE), "SND_IS_COOKIE" }) + + DECLARE_EVENT_CLASS(rpc_xprt_lifetime_class, + TP_PROTO( +@@ -1133,8 +1134,11 @@ DECLARE_EVENT_CLASS(xprt_writelock_event, + __entry->task_id = -1; + __entry->client_id = -1; + } +- __entry->snd_task_id = xprt->snd_task ? +- xprt->snd_task->tk_pid : -1; ++ if (xprt->snd_task && ++ !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) ++ __entry->snd_task_id = xprt->snd_task->tk_pid; ++ else ++ __entry->snd_task_id = -1; + ), + + TP_printk("task:%u@%u snd_task:%u", +@@ -1178,8 +1182,12 @@ DECLARE_EVENT_CLASS(xprt_cong_event, + __entry->task_id = -1; + __entry->client_id = -1; + } +- __entry->snd_task_id = xprt->snd_task ? 
+- xprt->snd_task->tk_pid : -1; ++ if (xprt->snd_task && ++ !test_bit(XPRT_SND_IS_COOKIE, &xprt->state)) ++ __entry->snd_task_id = xprt->snd_task->tk_pid; ++ else ++ __entry->snd_task_id = -1; ++ + __entry->cong = xprt->cong; + __entry->cwnd = xprt->cwnd; + __entry->wait = test_bit(XPRT_CWND_WAIT, &xprt->state); +-- +2.34.1 + diff --git a/queue-5.15/sunrpc-use-bit-macro-in-rpc_show_xprt_state.patch b/queue-5.15/sunrpc-use-bit-macro-in-rpc_show_xprt_state.patch new file mode 100644 index 00000000000..cda183259be --- /dev/null +++ b/queue-5.15/sunrpc-use-bit-macro-in-rpc_show_xprt_state.patch @@ -0,0 +1,56 @@ +From e4e0ab17ef8c18fafa1713b3aeb1163368e612dc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 16 Oct 2021 18:02:38 -0400 +Subject: SUNRPC: Use BIT() macro in rpc_show_xprt_state() + +From: Chuck Lever + +[ Upstream commit 76497b1adb89175eee85afc437f08a68247314b3 ] + +Clean up: BIT() is preferred over open-coding the shift. + +Signed-off-by: Chuck Lever +Signed-off-by: Trond Myklebust +Signed-off-by: Sasha Levin +--- + include/trace/events/sunrpc.h | 24 ++++++++++++------------ + 1 file changed, 12 insertions(+), 12 deletions(-) + +diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h +index 2d04eb96d4183..312507cb341f4 100644 +--- a/include/trace/events/sunrpc.h ++++ b/include/trace/events/sunrpc.h +@@ -925,18 +925,18 @@ TRACE_EVENT(rpc_socket_nospace, + + #define rpc_show_xprt_state(x) \ + __print_flags(x, "|", \ +- { (1UL << XPRT_LOCKED), "LOCKED"}, \ +- { (1UL << XPRT_CONNECTED), "CONNECTED"}, \ +- { (1UL << XPRT_CONNECTING), "CONNECTING"}, \ +- { (1UL << XPRT_CLOSE_WAIT), "CLOSE_WAIT"}, \ +- { (1UL << XPRT_BOUND), "BOUND"}, \ +- { (1UL << XPRT_BINDING), "BINDING"}, \ +- { (1UL << XPRT_CLOSING), "CLOSING"}, \ +- { (1UL << XPRT_OFFLINE), "OFFLINE"}, \ +- { (1UL << XPRT_REMOVE), "REMOVE"}, \ +- { (1UL << XPRT_CONGESTED), "CONGESTED"}, \ +- { (1UL << XPRT_CWND_WAIT), "CWND_WAIT"}, \ +- { (1UL << XPRT_WRITE_SPACE), "WRITE_SPACE"}) ++ { BIT(XPRT_LOCKED), "LOCKED" }, \ ++ { BIT(XPRT_CONNECTED), "CONNECTED" }, \ ++ { BIT(XPRT_CONNECTING), "CONNECTING" }, \ ++ { BIT(XPRT_CLOSE_WAIT), "CLOSE_WAIT" }, \ ++ { BIT(XPRT_BOUND), "BOUND" }, \ ++ { BIT(XPRT_BINDING), "BINDING" }, \ ++ { BIT(XPRT_CLOSING), "CLOSING" }, \ ++ { BIT(XPRT_OFFLINE), "OFFLINE" }, \ ++ { BIT(XPRT_REMOVE), "REMOVE" }, \ ++ { BIT(XPRT_CONGESTED), "CONGESTED" }, \ ++ { BIT(XPRT_CWND_WAIT), "CWND_WAIT" }, \ ++ { BIT(XPRT_WRITE_SPACE), "WRITE_SPACE" }) + + DECLARE_EVENT_CLASS(rpc_xprt_lifetime_class, + TP_PROTO( +-- +2.34.1 +
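
A note on powerpc64-bpf-limit-ldbrx-to-processors-compliant-wi.patch above: LDBRX, the single-instruction 64-bit byte-reversed load, only exists on ISA v2.06 (POWER7) and later, so on older CPUs the JIT now composes the same result from two LWBRX (32-bit byte-reversed) loads plus a shift and an OR. Below is a minimal userspace C sketch of that composition; the helper names are ours, not the kernel's, and the preprocessor test plays the role of the conditional SLDI the JIT emits for little- vs big-endian kernels.

#include <stdint.h>
#include <string.h>

/* Userspace stand-in for lwbrx: a 32-bit load with the bytes reversed. */
static uint32_t load32_byterev(const void *p)
{
	uint32_t v;

	memcpy(&v, p, sizeof(v));
	return __builtin_bswap32(v);
}

/* Compose what a single ldbrx would return from two 32-bit halves. */
static uint64_t load64_byterev(const void *p)
{
	uint64_t lo = load32_byterev(p);
	uint64_t hi = load32_byterev((const unsigned char *)p + 4);

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
	return (lo << 32) | hi;		/* first word becomes the high half */
#else
	return lo | (hi << 32);		/* second word becomes the high half */
#endif
}

Either path leaves the destination register with the value a native ldbrx would have produced, which is why the JIT can pick per-CPU at code-generation time via cpu_has_feature(CPU_FTR_ARCH_206).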
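
On rxrpc-adjust-retransmission-backoff.patch: the fix is entirely about when the exponential backoff counter may grow. Here is a toy C model of the rule, with simplified fields and an assumed cap; it is not the kernel's actual rxrpc_get_rto_backoff() implementation.

#include <stdbool.h>

struct peer_model {
	unsigned long rto_j;	/* base retransmission timeout, jiffies */
	unsigned int backoff;	/* current backoff exponent */
};

#define BACKOFF_MAX 13		/* assumed cap on the exponent */

/* The returned timeout doubles only when data was really resent. */
static unsigned long get_rto_backoff(struct peer_model *p, bool retrans)
{
	unsigned long timo = p->rto_j << p->backoff;

	if (retrans && p->backoff < BACKOFF_MAX)
		p->backoff++;
	return timo;
}

The resend path passes retrans only when it actually queued retransmissions, and the ack_lost_at path now always passes false, so arming the lost-ACK timer can no longer bump the exponent a second time; that is what yields the roughly doubling interval series quoted in the commit message rather than one that grows twice as fast.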
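
On sched-pelt-relax-the-sync-of-util_sum-with-util_avg.patch: the invariant being enforced is util_sum >= util_avg * PELT_MIN_DIVIDER, the lower bound that holds for any value of period_contrib. A self-contained C sketch of the new removal path follows; the struct is a simplified stand-in for struct sched_avg, and LOAD_AVG_MAX is the value used by the kernel's PELT tables.

#include <stdint.h>

#define LOAD_AVG_MAX		47742	/* maximum possible PELT sum */
#define PELT_MIN_DIVIDER	(LOAD_AVG_MAX - 1024)

struct avg_model {
	unsigned long util_avg;
	uint32_t util_sum;
	uint32_t period_contrib;	/* 0..1023 */
};

static uint32_t get_pelt_divider(const struct avg_model *a)
{
	return PELT_MIN_DIVIDER + a->period_contrib;
}

/*
 * Subtract a detached task's contribution from both _avg and _sum,
 * then clamp _sum to the lower bound implied by the smallest possible
 * divider, instead of flooring it to util_avg * divider as the
 * reverted code did.
 */
static void remove_util(struct avg_model *a, unsigned long removed)
{
	uint32_t sub = removed * get_pelt_divider(a);
	uint32_t floor;

	a->util_avg -= (removed < a->util_avg) ? removed : a->util_avg;
	a->util_sum -= (sub < a->util_sum) ? sub : a->util_sum;

	floor = a->util_avg * PELT_MIN_DIVIDER;
	if (a->util_sum < floor)
		a->util_sum = floor;
}

The subtraction keeps up to LOAD_AVG_MAX-1 of contribution that has not yet been folded into util_avg, which the old unconditional sync threw away, while the final clamp still guarantees util_sum can never fall below what any legal divider could explain.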
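
Finally, on sunrpc-don-t-dereference-xprt-snd_task-if-it-s-a-coo.patch: since the commit named in its Fixes tag, xprt->snd_task may hold a pointer-sized cookie rather than a real task pointer, with XPRT_SND_IS_COOKIE set in xprt->state to say so, and the tracepoints must test that flag before chasing the pointer. A reduced C model of the guard; the types and the bit number are placeholders, not the real SUNRPC definitions.

struct task_model {
	int tk_pid;
};

struct xprt_model {
	unsigned long state;
	struct task_model *snd_task;	/* real task pointer or cookie */
};

#define XPRT_SND_IS_COOKIE 10		/* assumed bit number */

/* Only dereference snd_task when the state bit says it is a task. */
static int snd_task_id(const struct xprt_model *xprt)
{
	if (xprt->snd_task &&
	    !(xprt->state & (1UL << XPRT_SND_IS_COOKIE)))
		return xprt->snd_task->tk_pid;
	return -1;			/* unset, or an opaque cookie */
}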