From f63aadd089f5215d2dfebf02fedca72d1b4d8d6a Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Sat, 27 Jan 2024 07:47:05 -0500 Subject: [PATCH] Fixes for 6.6 Signed-off-by: Sasha Levin --- ...de-silly-rename-files-from-userspace.patch | 54 +++ ...kernel-warning-when-running-offline-.patch | 113 +++++ ...ait-for-flr-to-complete-during-probe.patch | 43 ++ ..._addr_set_sun_path-to-allow-writing-.patch | 116 +++++ ...dified-uaddrlen-from-cgroup-sockaddr.patch | 449 ++++++++++++++++++ ...d-use-after-free-when-chunk-length-i.patch | 161 +++++++ .../fjes-fix-memleaks-in-fjes_hw_setup.patch | 109 +++++ ...i-buffer-packets-that-are-shrunk-by-.patch | 170 +++++++ .../i40e-set-xdp_rxq_info-frag_size.patch | 130 +++++ ...rxq_info-frag_size-for-zc-enabled-rx.patch | 46 ++ ...-redundant-xdp_rxq_info-registration.patch | 58 +++ ...xq_info-frag_size-for-zc-enabled-rx-.patch | 91 ++++ .../ice-work-on-pre-xdp-prog-frag-count.patch | 170 +++++++ ...lize-skb_frag_t-bv_offset-in-zc-driv.patch | 60 +++ ...cept_queue-s-spinlocks-in-inet6_crea.patch | 70 +++ .../llc-drop-support-for-eth_p_tr_802_2.patch | 130 +++++ ...sendmsg-more-robust-against-bonding-.patch | 154 ++++++ ...he-unhandled-context-fault-from-smmu.patch | 58 +++ ...-a-namespace-with-conflicting-altnam.patch | 81 ++++ ...el-fix-ptp-frame-parsing-for-lan8814.patch | 61 +++ ...e-enable-mcast-in-smfs-steering-mode.patch | 77 +++ ...fix-multicast-packets-sent-to-uplink.patch | 94 ++++ ...-can-t-go-to-uplink-vport-on-rx-rule.patch | 51 ++ ...the-right-gvmi-number-for-drop-actio.patch | 39 ++ ...warn-upon-a-callback-command-failure.patch | 149 ++++++ ...5-device-constant-for-selecting-cq-p.patch | 39 ++ ...software-parsing-when-ipsec-crypto-i.patch | 39 ++ ...-a-double-free-in-arfs_create_groups.patch | 100 ++++ ...potential-double-free-in-fs_any_crea.patch | 40 ++ ...eration-precedence-bug-in-port-times.patch | 41 ++ ...t-mlx5e-fix-peer-flow-lists-handling.patch | 126 +++++ ...-ipsec-replay-window-values-on-sende.patch | 68 +++ 
...-clear-bm-pool-before-initialization.patch | 77 +++ ...n-array-index-out-of-bounds-in-rds_c.patch | 71 +++ ...ed-flower-fix-chain-template-offload.patch | 190 ++++++++ ...gal-rmb_desc-access-in-smc-d-connect.patch | 87 ++++ ...t-a-bit-for-the-reset-to-take-effect.patch | 63 +++ ...les-restrict-anonymous-set-and-map-n.patch | 60 +++ ...r-nf_tables-validate-nfproto_-family.patch | 196 ++++++++ ...mit-reject-configurations-that-cause.patch | 83 ++++ ...he-prevent-oops-in-fscache_put_cache.patch | 44 ++ ...ntial-sleeping-issue-in-mqueue_flush.patch | 76 +++ ...cu-kthreads-wakeup-when-cpu-is-dying.patch | 141 ++++++ ...-reuse-port-for-so_incoming_cpu-test.patch | 231 +++++++++ ...g-do-not-test-arp-ns-target-with-mod.patch | 63 +++ ...ts-bonding-increase-timeout-to-1200s.patch | 56 +++ ...fill-in-some-missing-configs-for-net.patch | 117 +++++ ...et-fix-rps_default_mask-with-32-cpus.patch | 51 ++ ...etdevsim-fix-the-udp_tunnel_nic-test.patch | 102 ++++ queue-6.6/series | 64 +++ ...st-size-to-initialize-bio_vec-in-svc.patch | 42 ++ .../tcp-add-memory-barrier-to-tcp_push.patch | 101 ++++ ...it-the-accept_queue-s-spinlocks-once.patch | 170 +++++++ ...isibility-when-inserting-an-element-.patch | 129 +++++ ...ring_need_wakeup-for-empty-fill-ring.patch | 52 ++ .../tsnep-remove-fcs-for-xdp-data-path.patch | 49 ++ ...g-rx-stats-accounting-in-tun_xdp_act.patch | 49 ++ ...ssing-dropped-counter-in-tun_xdp_act.patch | 52 ++ queue-6.6/udp-fix-busy-polling.patch | 134 ++++++ ...-type-that-is-not-ifla_vlan_qos_mapp.patch | 58 +++ ...mac80211-fix-potential-sta-link-leak.patch | 44 ++ ...-increase-for-mem_type_xsk_buff_pool.patch | 42 ++ ...-multi-buffer-bpf-helpers-for-zc-xdp.patch | 195 ++++++++ ...f_pool-responsible-for-clearing-xdp_.patch | 107 +++++ ...cle-buffer-in-case-rx-queue-was-full.patch | 58 +++ 65 files changed, 6241 insertions(+) create mode 100644 queue-6.6/afs-hide-silly-rename-files-from-userspace.patch create mode 100644 
queue-6.6/bnxt_en-prevent-kernel-warning-when-running-offline-.patch create mode 100644 queue-6.6/bnxt_en-wait-for-flr-to-complete-during-probe.patch create mode 100644 queue-6.6/bpf-add-bpf_sock_addr_set_sun_path-to-allow-writing-.patch create mode 100644 queue-6.6/bpf-propagate-modified-uaddrlen-from-cgroup-sockaddr.patch create mode 100644 queue-6.6/btrfs-scrub-avoid-use-after-free-when-chunk-length-i.patch create mode 100644 queue-6.6/fjes-fix-memleaks-in-fjes_hw_setup.patch create mode 100644 queue-6.6/i40e-handle-multi-buffer-packets-that-are-shrunk-by-.patch create mode 100644 queue-6.6/i40e-set-xdp_rxq_info-frag_size.patch create mode 100644 queue-6.6/i40e-update-xdp_rxq_info-frag_size-for-zc-enabled-rx.patch create mode 100644 queue-6.6/ice-remove-redundant-xdp_rxq_info-registration.patch create mode 100644 queue-6.6/ice-update-xdp_rxq_info-frag_size-for-zc-enabled-rx-.patch create mode 100644 queue-6.6/ice-work-on-pre-xdp-prog-frag-count.patch create mode 100644 queue-6.6/intel-xsk-initialize-skb_frag_t-bv_offset-in-zc-driv.patch create mode 100644 queue-6.6/ipv6-init-the-accept_queue-s-spinlocks-in-inet6_crea.patch create mode 100644 queue-6.6/llc-drop-support-for-eth_p_tr_802_2.patch create mode 100644 queue-6.6/llc-make-llc_ui_sendmsg-more-robust-against-bonding-.patch create mode 100644 queue-6.6/net-fec-fix-the-unhandled-context-fault-from-smmu.patch create mode 100644 queue-6.6/net-fix-removing-a-namespace-with-conflicting-altnam.patch create mode 100644 queue-6.6/net-micrel-fix-ptp-frame-parsing-for-lan8814.patch create mode 100644 queue-6.6/net-mlx5-bridge-enable-mcast-in-smfs-steering-mode.patch create mode 100644 queue-6.6/net-mlx5-bridge-fix-multicast-packets-sent-to-uplink.patch create mode 100644 queue-6.6/net-mlx5-dr-can-t-go-to-uplink-vport-on-rx-rule.patch create mode 100644 queue-6.6/net-mlx5-dr-use-the-right-gvmi-number-for-drop-actio.patch create mode 100644 queue-6.6/net-mlx5-fix-a-warn-upon-a-callback-command-failure.patch create mode 
100644 queue-6.6/net-mlx5-use-mlx5-device-constant-for-selecting-cq-p.patch create mode 100644 queue-6.6/net-mlx5e-allow-software-parsing-when-ipsec-crypto-i.patch create mode 100644 queue-6.6/net-mlx5e-fix-a-double-free-in-arfs_create_groups.patch create mode 100644 queue-6.6/net-mlx5e-fix-a-potential-double-free-in-fs_any_crea.patch create mode 100644 queue-6.6/net-mlx5e-fix-operation-precedence-bug-in-port-times.patch create mode 100644 queue-6.6/net-mlx5e-fix-peer-flow-lists-handling.patch create mode 100644 queue-6.6/net-mlx5e-ignore-ipsec-replay-window-values-on-sende.patch create mode 100644 queue-6.6/net-mvpp2-clear-bm-pool-before-initialization.patch create mode 100644 queue-6.6/net-rds-fix-ubsan-array-index-out-of-bounds-in-rds_c.patch create mode 100644 queue-6.6/net-sched-flower-fix-chain-template-offload.patch create mode 100644 queue-6.6/net-smc-fix-illegal-rmb_desc-access-in-smc-d-connect.patch create mode 100644 queue-6.6/net-stmmac-wait-a-bit-for-the-reset-to-take-effect.patch create mode 100644 queue-6.6/netfilter-nf_tables-restrict-anonymous-set-and-map-n.patch create mode 100644 queue-6.6/netfilter-nf_tables-validate-nfproto_-family.patch create mode 100644 queue-6.6/netfilter-nft_limit-reject-configurations-that-cause.patch create mode 100644 queue-6.6/netfs-fscache-prevent-oops-in-fscache_put_cache.patch create mode 100644 queue-6.6/netlink-fix-potential-sleeping-issue-in-mqueue_flush.patch create mode 100644 queue-6.6/rcu-defer-rcu-kthreads-wakeup-when-cpu-is-dying.patch create mode 100644 queue-6.6/selftest-don-t-reuse-port-for-so_incoming_cpu-test.patch create mode 100644 queue-6.6/selftests-bonding-do-not-test-arp-ns-target-with-mod.patch create mode 100644 queue-6.6/selftests-bonding-increase-timeout-to-1200s.patch create mode 100644 queue-6.6/selftests-fill-in-some-missing-configs-for-net.patch create mode 100644 queue-6.6/selftests-net-fix-rps_default_mask-with-32-cpus.patch create mode 100644 
queue-6.6/selftests-netdevsim-fix-the-udp_tunnel_nic-test.patch create mode 100644 queue-6.6/sunrpc-use-request-size-to-initialize-bio_vec-in-svc.patch create mode 100644 queue-6.6/tcp-add-memory-barrier-to-tcp_push.patch create mode 100644 queue-6.6/tcp-make-sure-init-the-accept_queue-s-spinlocks-once.patch create mode 100644 queue-6.6/tracing-ensure-visibility-when-inserting-an-element-.patch create mode 100644 queue-6.6/tsnep-fix-xdp_ring_need_wakeup-for-empty-fill-ring.patch create mode 100644 queue-6.6/tsnep-remove-fcs-for-xdp-data-path.patch create mode 100644 queue-6.6/tun-add-missing-rx-stats-accounting-in-tun_xdp_act.patch create mode 100644 queue-6.6/tun-fix-missing-dropped-counter-in-tun_xdp_act.patch create mode 100644 queue-6.6/udp-fix-busy-polling.patch create mode 100644 queue-6.6/vlan-skip-nested-type-that-is-not-ifla_vlan_qos_mapp.patch create mode 100644 queue-6.6/wifi-mac80211-fix-potential-sta-link-leak.patch create mode 100644 queue-6.6/xdp-reflect-tail-increase-for-mem_type_xsk_buff_pool.patch create mode 100644 queue-6.6/xsk-fix-usage-of-multi-buffer-bpf-helpers-for-zc-xdp.patch create mode 100644 queue-6.6/xsk-make-xsk_buff_pool-responsible-for-clearing-xdp_.patch create mode 100644 queue-6.6/xsk-recycle-buffer-in-case-rx-queue-was-full.patch diff --git a/queue-6.6/afs-hide-silly-rename-files-from-userspace.patch b/queue-6.6/afs-hide-silly-rename-files-from-userspace.patch new file mode 100644 index 00000000000..f11e8df89ae --- /dev/null +++ b/queue-6.6/afs-hide-silly-rename-files-from-userspace.patch @@ -0,0 +1,54 @@ +From 0c016e7bd1183e0ed5b09bc1d81f8f8e0779831f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 8 Jan 2024 17:22:36 +0000 +Subject: afs: Hide silly-rename files from userspace + +From: David Howells + +[ Upstream commit 57e9d49c54528c49b8bffe6d99d782ea051ea534 ] + +There appears to be a race between silly-rename files being created/removed +and various userspace tools iterating over the contents of a directory, +leading 
to such errors as: + + find: './kernel/.tmp_cpio_dir/include/dt-bindings/reset/.__afs2080': No such file or directory + tar: ./include/linux/greybus/.__afs3C95: File removed before we read it + +when building a kernel. + +Fix afs_readdir() so that it doesn't return .__afsXXXX silly-rename files +to userspace. This doesn't stop them being looked up directly by name as +we need to be able to look them up from within the kernel as part of the +silly-rename algorithm. + +Fixes: 79ddbfa500b3 ("afs: Implement sillyrename for unlink and rename") +Signed-off-by: David Howells +cc: Marc Dionne +cc: linux-afs@lists.infradead.org +Signed-off-by: Sasha Levin +--- + fs/afs/dir.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/fs/afs/dir.c b/fs/afs/dir.c +index 5219182e52e1..2df2e9ee130d 100644 +--- a/fs/afs/dir.c ++++ b/fs/afs/dir.c +@@ -474,6 +474,14 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, + continue; + } + ++ /* Don't expose silly rename entries to userspace. */ ++ if (nlen > 6 && ++ dire->u.name[0] == '.' && ++ ctx->actor != afs_lookup_filldir && ++ ctx->actor != afs_lookup_one_filldir && ++ memcmp(dire->u.name, ".__afs", 6) == 0) ++ continue; ++ + /* found the next entry */ + if (!dir_emit(ctx, dire->u.name, nlen, + ntohl(dire->u.vnode), +-- +2.43.0 + diff --git a/queue-6.6/bnxt_en-prevent-kernel-warning-when-running-offline-.patch b/queue-6.6/bnxt_en-prevent-kernel-warning-when-running-offline-.patch new file mode 100644 index 00000000000..0775e05429e --- /dev/null +++ b/queue-6.6/bnxt_en-prevent-kernel-warning-when-running-offline-.patch @@ -0,0 +1,113 @@ +From 198096f917cac879e5cd7f5f65711b6ac2011576 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Jan 2024 15:45:14 -0800 +Subject: bnxt_en: Prevent kernel warning when running offline self test + +From: Michael Chan + +[ Upstream commit c20f482129a582455f02eb9a6dcb2a4215274599 ] + +We call bnxt_half_open_nic() to setup the chip partially to run +loopback tests. 
The rings and buffers are initialized normally +so that we can transmit and receive packets in loopback mode. +That means page pool buffers are allocated for the aggregation ring +just like the normal case. NAPI is not needed because we are just +polling for the loopback packets. + +When we're done with the loopback tests, we call bnxt_half_close_nic() +to clean up. When freeing the page pools, we hit a WARN_ON() +in page_pool_unlink_napi() because the NAPI state linked to the +page pool is uninitialized. + +The simplest way to avoid this warning is just to initialize the +NAPIs during half open and delete the NAPIs during half close. +Trying to skip the page pool initialization or skip linking of +NAPI during half open will be more complicated. + +This fix avoids this warning: + +WARNING: CPU: 4 PID: 46967 at net/core/page_pool.c:946 page_pool_unlink_napi+0x1f/0x30 +CPU: 4 PID: 46967 Comm: ethtool Tainted: G S W 6.7.0-rc5+ #22 +Hardware name: Dell Inc. PowerEdge R750/06V45N, BIOS 1.3.8 08/31/2021 +RIP: 0010:page_pool_unlink_napi+0x1f/0x30 +Code: 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 48 8b 47 18 48 85 c0 74 1b 48 8b 50 10 83 e2 01 74 08 8b 40 34 83 f8 ff 74 02 <0f> 0b 48 c7 47 18 00 00 00 00 c3 cc cc cc cc 66 90 90 90 90 90 90 +RSP: 0018:ffa000003d0dfbe8 EFLAGS: 00010246 +RAX: ff110003607ce640 RBX: ff110010baf5d000 RCX: 0000000000000008 +RDX: 0000000000000000 RSI: ff110001e5e522c0 RDI: ff110010baf5d000 +RBP: ff11000145539b40 R08: 0000000000000001 R09: ffffffffc063f641 +R10: ff110001361eddb8 R11: 000000000040000f R12: 0000000000000001 +R13: 000000000000001c R14: ff1100014553a080 R15: 0000000000003fc0 +FS: 00007f9301c4f740(0000) GS:ff1100103fd00000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00007f91344fa8f0 CR3: 00000003527cc005 CR4: 0000000000771ef0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +PKRU: 55555554 +Call Trace: + + ? 
__warn+0x81/0x140 + ? page_pool_unlink_napi+0x1f/0x30 + ? report_bug+0x102/0x200 + ? handle_bug+0x44/0x70 + ? exc_invalid_op+0x13/0x60 + ? asm_exc_invalid_op+0x16/0x20 + ? bnxt_free_ring.isra.123+0xb1/0xd0 [bnxt_en] + ? page_pool_unlink_napi+0x1f/0x30 + page_pool_destroy+0x3e/0x150 + bnxt_free_mem+0x441/0x5e0 [bnxt_en] + bnxt_half_close_nic+0x2a/0x40 [bnxt_en] + bnxt_self_test+0x21d/0x450 [bnxt_en] + __dev_ethtool+0xeda/0x2e30 + ? native_queued_spin_lock_slowpath+0x17f/0x2b0 + ? __link_object+0xa1/0x160 + ? _raw_spin_unlock_irqrestore+0x23/0x40 + ? __create_object+0x5f/0x90 + ? __kmem_cache_alloc_node+0x317/0x3c0 + ? dev_ethtool+0x59/0x170 + dev_ethtool+0xa7/0x170 + dev_ioctl+0xc3/0x530 + sock_do_ioctl+0xa8/0xf0 + sock_ioctl+0x270/0x310 + __x64_sys_ioctl+0x8c/0xc0 + do_syscall_64+0x3e/0xf0 + entry_SYSCALL_64_after_hwframe+0x6e/0x76 + +Fixes: 294e39e0d034 ("bnxt: hook NAPIs to page pools") +Reviewed-by: Andy Gospodarek +Reviewed-by: Ajit Khaparde +Signed-off-by: Michael Chan +Link: https://lore.kernel.org/r/20240117234515.226944-5-michael.chan@broadcom.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +index 9e04db1273a5..dac4f9510c17 100644 +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +@@ -10598,10 +10598,12 @@ int bnxt_half_open_nic(struct bnxt *bp) + netdev_err(bp->dev, "bnxt_alloc_mem err: %x\n", rc); + goto half_open_err; + } ++ bnxt_init_napi(bp); + set_bit(BNXT_STATE_HALF_OPEN, &bp->state); + rc = bnxt_init_nic(bp, true); + if (rc) { + clear_bit(BNXT_STATE_HALF_OPEN, &bp->state); ++ bnxt_del_napi(bp); + netdev_err(bp->dev, "bnxt_init_nic err: %x\n", rc); + goto half_open_err; + } +@@ -10620,6 +10622,7 @@ int bnxt_half_open_nic(struct bnxt *bp) + void bnxt_half_close_nic(struct bnxt *bp) + { + 
bnxt_hwrm_resource_free(bp, false, true); ++ bnxt_del_napi(bp); + bnxt_free_skbs(bp); + bnxt_free_mem(bp, true); + clear_bit(BNXT_STATE_HALF_OPEN, &bp->state); +-- +2.43.0 + diff --git a/queue-6.6/bnxt_en-wait-for-flr-to-complete-during-probe.patch b/queue-6.6/bnxt_en-wait-for-flr-to-complete-during-probe.patch new file mode 100644 index 00000000000..39ddaeb512d --- /dev/null +++ b/queue-6.6/bnxt_en-wait-for-flr-to-complete-during-probe.patch @@ -0,0 +1,43 @@ +From 9736c9e26022a20d1c33e799d7b117ca0d1b5f6f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Jan 2024 15:45:11 -0800 +Subject: bnxt_en: Wait for FLR to complete during probe + +From: Michael Chan + +[ Upstream commit 3c1069fa42872f95cf3c6fedf80723d391e12d57 ] + +The first message to firmware may fail if the device is undergoing FLR. +The driver has some recovery logic for this failure scenario but we must +wait 100 msec for FLR to complete before proceeding. Otherwise the +recovery will always fail. + +Fixes: ba02629ff6cb ("bnxt_en: log firmware status on firmware init failure") +Reviewed-by: Damodharam Ammepalli +Signed-off-by: Michael Chan +Link: https://lore.kernel.org/r/20240117234515.226944-2-michael.chan@broadcom.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +index 6039886a8544..9e04db1273a5 100644 +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +@@ -12261,6 +12261,11 @@ static int bnxt_fw_init_one_p1(struct bnxt *bp) + + bp->fw_cap = 0; + rc = bnxt_hwrm_ver_get(bp); ++ /* FW may be unresponsive after FLR. FLR must complete within 100 msec ++ * so wait before continuing with recovery. 
++ */ ++ if (rc) ++ msleep(100); + bnxt_try_map_fw_health_reg(bp); + if (rc) { + rc = bnxt_try_recover_fw(bp); +-- +2.43.0 + diff --git a/queue-6.6/bpf-add-bpf_sock_addr_set_sun_path-to-allow-writing-.patch b/queue-6.6/bpf-add-bpf_sock_addr_set_sun_path-to-allow-writing-.patch new file mode 100644 index 00000000000..248346c38df --- /dev/null +++ b/queue-6.6/bpf-add-bpf_sock_addr_set_sun_path-to-allow-writing-.patch @@ -0,0 +1,116 @@ +From a2bf332849250cb0b7a5a168a0675fc16ac0ffb8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 11 Oct 2023 20:51:05 +0200 +Subject: bpf: Add bpf_sock_addr_set_sun_path() to allow writing unix sockaddr + from bpf + +From: Daan De Meyer + +[ Upstream commit 53e380d21441909b12b6e0782b77187ae4b971c4 ] + +As prep for adding unix socket support to the cgroup sockaddr hooks, +let's add a kfunc bpf_sock_addr_set_sun_path() that allows modifying a unix +sockaddr from bpf. While this is already possible for AF_INET and AF_INET6, +we'll need this kfunc when we add unix socket support since modifying the +address for those requires modifying both the address and the sockaddr +length. 
+ +Signed-off-by: Daan De Meyer +Link: https://lore.kernel.org/r/20231011185113.140426-4-daan.j.demeyer@gmail.com +Signed-off-by: Martin KaFai Lau +Stable-dep-of: c5114710c8ce ("xsk: fix usage of multi-buffer BPF helpers for ZC XDP") +Signed-off-by: Sasha Levin +--- + kernel/bpf/btf.c | 1 + + net/core/filter.c | 35 ++++++++++++++++++++++++++++++++++- + 2 files changed, 35 insertions(+), 1 deletion(-) + +diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c +index 8090d7fb11ef..a31704a6bb61 100644 +--- a/kernel/bpf/btf.c ++++ b/kernel/bpf/btf.c +@@ -7832,6 +7832,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type) + case BPF_PROG_TYPE_SYSCALL: + return BTF_KFUNC_HOOK_SYSCALL; + case BPF_PROG_TYPE_CGROUP_SKB: ++ case BPF_PROG_TYPE_CGROUP_SOCK_ADDR: + return BTF_KFUNC_HOOK_CGROUP_SKB; + case BPF_PROG_TYPE_SCHED_ACT: + return BTF_KFUNC_HOOK_SCHED_ACT; +diff --git a/net/core/filter.c b/net/core/filter.c +index 90fe3e754383..cbc395d96479 100644 +--- a/net/core/filter.c ++++ b/net/core/filter.c +@@ -81,6 +81,7 @@ + #include + #include + #include ++#include + + static const struct bpf_func_proto * + bpf_sk_base_func_proto(enum bpf_func_id func_id); +@@ -11772,6 +11773,27 @@ __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_buff *xdp, u64 flags, + + return 0; + } ++ ++__bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern, ++ const u8 *sun_path, u32 sun_path__sz) ++{ ++ struct sockaddr_un *un; ++ ++ if (sa_kern->sk->sk_family != AF_UNIX) ++ return -EINVAL; ++ ++ /* We do not allow changing the address to unnamed or larger than the ++ * maximum allowed address size for a unix sockaddr. 
++ */ ++ if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX) ++ return -EINVAL; ++ ++ un = (struct sockaddr_un *)sa_kern->uaddr; ++ memcpy(un->sun_path, sun_path, sun_path__sz); ++ sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz; ++ ++ return 0; ++} + __diag_pop(); + + int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags, +@@ -11796,6 +11818,10 @@ BTF_SET8_START(bpf_kfunc_check_set_xdp) + BTF_ID_FLAGS(func, bpf_dynptr_from_xdp) + BTF_SET8_END(bpf_kfunc_check_set_xdp) + ++BTF_SET8_START(bpf_kfunc_check_set_sock_addr) ++BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path) ++BTF_SET8_END(bpf_kfunc_check_set_sock_addr) ++ + static const struct btf_kfunc_id_set bpf_kfunc_set_skb = { + .owner = THIS_MODULE, + .set = &bpf_kfunc_check_set_skb, +@@ -11806,6 +11832,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = { + .set = &bpf_kfunc_check_set_xdp, + }; + ++static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = { ++ .owner = THIS_MODULE, ++ .set = &bpf_kfunc_check_set_sock_addr, ++}; ++ + static int __init bpf_kfunc_init(void) + { + int ret; +@@ -11820,7 +11851,9 @@ static int __init bpf_kfunc_init(void) + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb); +- return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ++ ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp); ++ return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR, ++ &bpf_kfunc_set_sock_addr); + } + late_initcall(bpf_kfunc_init); + +-- +2.43.0 + diff --git a/queue-6.6/bpf-propagate-modified-uaddrlen-from-cgroup-sockaddr.patch b/queue-6.6/bpf-propagate-modified-uaddrlen-from-cgroup-sockaddr.patch new file mode 100644 index 00000000000..43f9283b06d --- /dev/null +++ 
b/queue-6.6/bpf-propagate-modified-uaddrlen-from-cgroup-sockaddr.patch @@ -0,0 +1,449 @@ +From 04d0f77b90682fb29a22a4567a54e12bedbe4f13 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 11 Oct 2023 20:51:04 +0200 +Subject: bpf: Propagate modified uaddrlen from cgroup sockaddr programs + +From: Daan De Meyer + +[ Upstream commit fefba7d1ae198dcbf8b3b432de46a4e29f8dbd8c ] + +As prep for adding unix socket support to the cgroup sockaddr hooks, +let's propagate the sockaddr length back to the caller after running +a bpf cgroup sockaddr hook program. While not important for AF_INET or +AF_INET6, the sockaddr length is important when working with AF_UNIX +sockaddrs as the size of the sockaddr cannot be determined just from the +address family or the sockaddr's contents. + +__cgroup_bpf_run_filter_sock_addr() is modified to take the uaddrlen as +an input/output argument. After running the program, the modified sockaddr +length is stored in the uaddrlen pointer. + +Signed-off-by: Daan De Meyer +Link: https://lore.kernel.org/r/20231011185113.140426-3-daan.j.demeyer@gmail.com +Signed-off-by: Martin KaFai Lau +Stable-dep-of: c5114710c8ce ("xsk: fix usage of multi-buffer BPF helpers for ZC XDP") +Signed-off-by: Sasha Levin +--- + include/linux/bpf-cgroup.h | 73 +++++++++++++++++++------------------- + include/linux/filter.h | 1 + + kernel/bpf/cgroup.c | 17 +++++++-- + net/ipv4/af_inet.c | 7 ++-- + net/ipv4/ping.c | 2 +- + net/ipv4/tcp_ipv4.c | 2 +- + net/ipv4/udp.c | 9 +++-- + net/ipv6/af_inet6.c | 9 ++--- + net/ipv6/ping.c | 2 +- + net/ipv6/tcp_ipv6.c | 2 +- + net/ipv6/udp.c | 6 ++-- + 11 files changed, 76 insertions(+), 54 deletions(-) + +diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h +index 8506690dbb9c..31561e789715 100644 +--- a/include/linux/bpf-cgroup.h ++++ b/include/linux/bpf-cgroup.h +@@ -120,6 +120,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk, + + int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + struct sockaddr *uaddr, ++ 
int *uaddrlen, + enum cgroup_bpf_attach_type atype, + void *t_ctx, + u32 *flags); +@@ -230,22 +231,22 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, + #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) \ + BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND) + +-#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) \ ++#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype) \ + ({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled(atype)) \ +- __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ +- NULL, NULL); \ ++ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \ ++ atype, NULL, NULL); \ + __ret; \ + }) + +-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) \ ++#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx) \ + ({ \ + int __ret = 0; \ + if (cgroup_bpf_enabled(atype)) { \ + lock_sock(sk); \ +- __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ +- t_ctx, NULL); \ ++ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \ ++ atype, t_ctx, NULL); \ + release_sock(sk); \ + } \ + __ret; \ +@@ -256,14 +257,14 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, + * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check + * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE). 
+ */ +-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, bind_flags) \ ++#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, bind_flags) \ + ({ \ + u32 __flags = 0; \ + int __ret = 0; \ + if (cgroup_bpf_enabled(atype)) { \ + lock_sock(sk); \ +- __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype, \ +- NULL, &__flags); \ ++ __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \ ++ atype, NULL, &__flags); \ + release_sock(sk); \ + if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE) \ + *bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE; \ +@@ -276,29 +277,29 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk, + cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) && \ + (sk)->sk_prot->pre_connect) + +-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) \ +- BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET4_CONNECT) ++#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen) \ ++ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT) + +-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) \ +- BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET6_CONNECT) ++#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) \ ++ BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT) + +-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) \ +- BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_CONNECT, NULL) ++#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) \ ++ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT, NULL) + +-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) \ +- BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_CONNECT, NULL) ++#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) \ ++ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT, NULL) + +-#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) \ +- BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_SENDMSG, t_ctx) ++#define 
BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) \ ++ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_SENDMSG, t_ctx) + +-#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) \ +- BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_SENDMSG, t_ctx) ++#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) \ ++ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_SENDMSG, t_ctx) + +-#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) \ +- BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_RECVMSG, NULL) ++#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) \ ++ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_RECVMSG, NULL) + +-#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) \ +- BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_RECVMSG, NULL) ++#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) \ ++ BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_RECVMSG, NULL) + + /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a + * fullsock and its parent fullsock cannot be traced by +@@ -477,24 +478,24 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map, + } + + #define cgroup_bpf_enabled(atype) (0) +-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; }) +-#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; }) ++#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx) ({ 0; }) ++#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype) ({ 0; }) + #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0) + #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; }) + #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; }) + #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; }) + #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; }) +-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, flags) ({ 0; }) ++#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, 
flags) ({ 0; }) + #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; }) + #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; }) +-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; }) +-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; }) +-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; }) +-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; }) +-#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) +-#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; }) +-#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; }) +-#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; }) ++#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen) ({ 0; }) ++#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; }) ++#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) ({ 0; }) ++#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; }) ++#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; }) ++#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; }) ++#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; }) ++#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; }) + #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; }) + #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; }) + #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; }) +diff --git a/include/linux/filter.h b/include/linux/filter.h +index 761af6b3cf2b..77db4263d68d 100644 +--- a/include/linux/filter.h ++++ b/include/linux/filter.h +@@ -1285,6 +1285,7 @@ struct bpf_sock_addr_kern { + */ + u64 tmp_reg; + void *t_ctx; /* Attach type specific context. 
*/ ++ u32 uaddrlen; + }; + + struct bpf_sock_ops_kern { +diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c +index 03b3d4492980..ac37bd53aee0 100644 +--- a/kernel/bpf/cgroup.c ++++ b/kernel/bpf/cgroup.c +@@ -1450,6 +1450,9 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); + * provided by user sockaddr + * @sk: sock struct that will use sockaddr + * @uaddr: sockaddr struct provided by user ++ * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is ++ * read-only for AF_INET[6] uaddr but can be modified for AF_UNIX ++ * uaddr. + * @atype: The type of program to be executed + * @t_ctx: Pointer to attach type specific context + * @flags: Pointer to u32 which contains higher bits of BPF program +@@ -1462,6 +1465,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk); + */ + int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + struct sockaddr *uaddr, ++ int *uaddrlen, + enum cgroup_bpf_attach_type atype, + void *t_ctx, + u32 *flags) +@@ -1473,6 +1477,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + }; + struct sockaddr_storage unspec; + struct cgroup *cgrp; ++ int ret; + + /* Check socket family since not all sockets represent network + * endpoint (e.g. AF_UNIX). 
+@@ -1483,11 +1488,19 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk, + if (!ctx.uaddr) { + memset(&unspec, 0, sizeof(unspec)); + ctx.uaddr = (struct sockaddr *)&unspec; ++ ctx.uaddrlen = 0; ++ } else { ++ ctx.uaddrlen = *uaddrlen; + } + + cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); +- return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, +- 0, flags); ++ ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run, ++ 0, flags); ++ ++ if (!ret && uaddr) ++ *uaddrlen = ctx.uaddrlen; ++ ++ return ret; + } + EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr); + +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c +index b739ddbef0f0..7d4d625471f7 100644 +--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -455,7 +455,7 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. + */ +- err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, ++ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len, + CGROUP_INET4_BIND, &flags); + if (err) + return err; +@@ -797,6 +797,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr); ++ int sin_addr_len = sizeof(*sin); + + sin->sin_family = AF_INET; + lock_sock(sk); +@@ -809,7 +810,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, + } + sin->sin_port = inet->inet_dport; + sin->sin_addr.s_addr = inet->inet_daddr; +- BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, ++ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, + CGROUP_INET4_GETPEERNAME); + } else { + __be32 addr = inet->inet_rcv_saddr; +@@ -817,7 +818,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr, + addr = inet->inet_saddr; + sin->sin_port = inet->inet_sport; + sin->sin_addr.s_addr = addr; +- BPF_CGROUP_RUN_SA_PROG(sk, 
(struct sockaddr *)sin, ++ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, + CGROUP_INET4_GETSOCKNAME); + } + release_sock(sk); +diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c +index 75e0aee35eb7..4cb0c896caf9 100644 +--- a/net/ipv4/ping.c ++++ b/net/ipv4/ping.c +@@ -301,7 +301,7 @@ static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr, + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + +- return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr); ++ return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len); + } + + /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */ +diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c +index 4167e8a48b60..c7ffab37a34c 100644 +--- a/net/ipv4/tcp_ipv4.c ++++ b/net/ipv4/tcp_ipv4.c +@@ -194,7 +194,7 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr, + + sock_owned_by_me(sk); + +- return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr); ++ return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len); + } + + /* This will initiate an outgoing connection. 
*/ +diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c +index 9cb22a6ae1dc..7be4ddc80d95 100644 +--- a/net/ipv4/udp.c ++++ b/net/ipv4/udp.c +@@ -1143,7 +1143,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) + + if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) { + err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, +- (struct sockaddr *)usin, &ipc.addr); ++ (struct sockaddr *)usin, ++ &msg->msg_namelen, ++ &ipc.addr); + if (err) + goto out_free; + if (usin) { +@@ -1865,7 +1867,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags, + *addr_len = sizeof(*sin); + + BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, +- (struct sockaddr *)sin); ++ (struct sockaddr *)sin, ++ addr_len); + } + + if (udp_test_bit(GRO_ENABLED, sk)) +@@ -1904,7 +1907,7 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + +- return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr); ++ return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len); + } + EXPORT_SYMBOL(udp_pre_connect); + +diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c +index b6c5b5e25a2f..4375bfa4f608 100644 +--- a/net/ipv6/af_inet6.c ++++ b/net/ipv6/af_inet6.c +@@ -456,7 +456,7 @@ int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len) + /* BPF prog is run before any checks are done so that if the prog + * changes context in a wrong way it will be caught. 
+ */ +- err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, ++ err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len, + CGROUP_INET6_BIND, &flags); + if (err) + return err; +@@ -522,6 +522,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, + int peer) + { + struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr; ++ int sin_addr_len = sizeof(*sin); + struct sock *sk = sock->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); +@@ -541,7 +542,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, + sin->sin6_addr = sk->sk_v6_daddr; + if (np->sndflow) + sin->sin6_flowinfo = np->flow_label; +- BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, ++ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, + CGROUP_INET6_GETPEERNAME); + } else { + if (ipv6_addr_any(&sk->sk_v6_rcv_saddr)) +@@ -549,13 +550,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, + else + sin->sin6_addr = sk->sk_v6_rcv_saddr; + sin->sin6_port = inet->inet_sport; +- BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, ++ BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len, + CGROUP_INET6_GETSOCKNAME); + } + sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr, + sk->sk_bound_dev_if); + release_sock(sk); +- return sizeof(*sin); ++ return sin_addr_len; + } + EXPORT_SYMBOL(inet6_getname); + +diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c +index 5831aaa53d75..25243737fbc4 100644 +--- a/net/ipv6/ping.c ++++ b/net/ipv6/ping.c +@@ -56,7 +56,7 @@ static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + +- return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr); ++ return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len); + } + + static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) +diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c +index 44b6949d72b2..3783334ef233 100644 +--- 
a/net/ipv6/tcp_ipv6.c ++++ b/net/ipv6/tcp_ipv6.c +@@ -135,7 +135,7 @@ static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + + sock_owned_by_me(sk); + +- return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr); ++ return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len); + } + + static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, +diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c +index f1170dcc21d9..438476a31313 100644 +--- a/net/ipv6/udp.c ++++ b/net/ipv6/udp.c +@@ -410,7 +410,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, + *addr_len = sizeof(*sin6); + + BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, +- (struct sockaddr *)sin6); ++ (struct sockaddr *)sin6, ++ addr_len); + } + + if (udp_test_bit(GRO_ENABLED, sk)) +@@ -1157,7 +1158,7 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + +- return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr); ++ return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len); + } + + /** +@@ -1510,6 +1511,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) + if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) { + err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, + (struct sockaddr *)sin6, ++ &addr_len, + &fl6->saddr); + if (err) + goto out_no_dst; +-- +2.43.0 + diff --git a/queue-6.6/btrfs-scrub-avoid-use-after-free-when-chunk-length-i.patch b/queue-6.6/btrfs-scrub-avoid-use-after-free-when-chunk-length-i.patch new file mode 100644 index 00000000000..0b985adb435 --- /dev/null +++ b/queue-6.6/btrfs-scrub-avoid-use-after-free-when-chunk-length-i.patch @@ -0,0 +1,161 @@ +From 4ef6700db3661b10beae452d02a6fc8b0e7570d4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Jan 2024 11:02:25 +1030 +Subject: btrfs: scrub: avoid use-after-free when chunk length is not 64K + aligned + +From: Qu Wenruo + +[ Upstream commit f546c4282673497a06ecb6190b50ae7f6c85b02f ] + +[BUG] +There is 
a bug report that, on an ext4-converted btrfs, scrub leads to +various problems, including: + +- "unable to find chunk map" errors + BTRFS info (device vdb): scrub: started on devid 1 + BTRFS critical (device vdb): unable to find chunk map for logical 2214744064 length 4096 + BTRFS critical (device vdb): unable to find chunk map for logical 2214744064 length 45056 + + This would lead to unrepairable errors. + +- Use-after-free KASAN reports: + ================================================================== + BUG: KASAN: slab-use-after-free in __blk_rq_map_sg+0x18f/0x7c0 + Read of size 8 at addr ffff8881013c9040 by task btrfs/909 + CPU: 0 PID: 909 Comm: btrfs Not tainted 6.7.0-x64v3-dbg #11 c50636e9419a8354555555245df535e380563b2b + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 2023.11-2 12/24/2023 + Call Trace: + + dump_stack_lvl+0x43/0x60 + print_report+0xcf/0x640 + kasan_report+0xa6/0xd0 + __blk_rq_map_sg+0x18f/0x7c0 + virtblk_prep_rq.isra.0+0x215/0x6a0 [virtio_blk 19a65eeee9ae6fcf02edfad39bb9ddee07dcdaff] + virtio_queue_rqs+0xc4/0x310 [virtio_blk 19a65eeee9ae6fcf02edfad39bb9ddee07dcdaff] + blk_mq_flush_plug_list.part.0+0x780/0x860 + __blk_flush_plug+0x1ba/0x220 + blk_finish_plug+0x3b/0x60 + submit_initial_group_read+0x10a/0x290 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] + flush_scrub_stripes+0x38e/0x430 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] + scrub_stripe+0x82a/0xae0 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] + scrub_chunk+0x178/0x200 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] + scrub_enumerate_chunks+0x4bc/0xa30 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] + btrfs_scrub_dev+0x398/0x810 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] + btrfs_ioctl+0x4b9/0x3020 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] + __x64_sys_ioctl+0xbd/0x100 + do_syscall_64+0x5d/0xe0 + entry_SYSCALL_64_after_hwframe+0x63/0x6b + RIP: 0033:0x7f47e5e0952b + +- Crash, mostly due to above use-after-free + +[CAUSE] +The converted fs has the 
following data chunk layout: + + item 2 key (FIRST_CHUNK_TREE CHUNK_ITEM 2214658048) itemoff 16025 itemsize 80 + length 86016 owner 2 stripe_len 65536 type DATA|single + +For above logical bytenr 2214744064, it's at the chunk end +(2214658048 + 86016 = 2214744064). + +This means btrfs_submit_bio() would split the bio, and trigger endio +function for both of the two halves. + +However scrub_submit_initial_read() would only expect the endio function +to be called once, not any more. +This means the first endio function would already free the bbio::bio, +leaving the bvec freed, thus the 2nd endio call would lead to +use-after-free. + +[FIX] +- Make sure scrub_read_endio() only updates bits in its range + Since we may read less than 64K at the end of the chunk, we should not + touch the bits beyond chunk boundary. + +- Make sure scrub_submit_initial_read() only to read the chunk range + This is done by calculating the real number of sectors we need to + read, and add sector-by-sector to the bio. + +Thankfully the scrub read repair path won't need extra fixes: + +- scrub_stripe_submit_repair_read() + With above fixes, we won't update error bit for range beyond chunk, + thus scrub_stripe_submit_repair_read() should never submit any read + beyond the chunk. 
+ +Reported-by: Rongrong +Fixes: e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure") +Tested-by: Rongrong +Reviewed-by: Johannes Thumshirn +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/scrub.c | 29 ++++++++++++++++++++++------- + 1 file changed, 22 insertions(+), 7 deletions(-) + +diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c +index 4445a52a0707..12147d0f2805 100644 +--- a/fs/btrfs/scrub.c ++++ b/fs/btrfs/scrub.c +@@ -1099,12 +1099,22 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) + static void scrub_read_endio(struct btrfs_bio *bbio) + { + struct scrub_stripe *stripe = bbio->private; ++ struct bio_vec *bvec; ++ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); ++ int num_sectors; ++ u32 bio_size = 0; ++ int i; ++ ++ ASSERT(sector_nr < stripe->nr_sectors); ++ bio_for_each_bvec_all(bvec, &bbio->bio, i) ++ bio_size += bvec->bv_len; ++ num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; + + if (bbio->bio.bi_status) { +- bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors); +- bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors); ++ bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors); ++ bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors); + } else { +- bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors); ++ bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors); + } + bio_put(&bbio->bio); + if (atomic_dec_and_test(&stripe->pending_io)) { +@@ -1640,6 +1650,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, + { + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_bio *bbio; ++ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + ++ stripe->bg->length - stripe->logical) >> ++ fs_info->sectorsize_bits; + int mirror = stripe->mirror_num; + + ASSERT(stripe->bg); +@@ -1649,14 +1662,16 @@ static void scrub_submit_initial_read(struct 
scrub_ctx *sctx, + bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, + scrub_read_endio, stripe); + +- /* Read the whole stripe. */ + bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; +- for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) { ++ /* Read the whole range inside the chunk boundary. */ ++ for (unsigned int cur = 0; cur < nr_sectors; cur++) { ++ struct page *page = scrub_stripe_get_page(stripe, cur); ++ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur); + int ret; + +- ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0); ++ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + /* We should have allocated enough bio vectors. */ +- ASSERT(ret == PAGE_SIZE); ++ ASSERT(ret == fs_info->sectorsize); + } + atomic_inc(&stripe->pending_io); + +-- +2.43.0 + diff --git a/queue-6.6/fjes-fix-memleaks-in-fjes_hw_setup.patch b/queue-6.6/fjes-fix-memleaks-in-fjes_hw_setup.patch new file mode 100644 index 00000000000..9d7928fef8d --- /dev/null +++ b/queue-6.6/fjes-fix-memleaks-in-fjes_hw_setup.patch @@ -0,0 +1,109 @@ +From 026713060d1f9306327ab4782d38b8cc303dfdbd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Jan 2024 01:24:42 +0800 +Subject: fjes: fix memleaks in fjes_hw_setup + +From: Zhipeng Lu + +[ Upstream commit f6cc4b6a3ae53df425771000e9c9540cce9b7bb1 ] + +In fjes_hw_setup, it allocates several memory and delay the deallocation +to the fjes_hw_exit in fjes_probe through the following call chain: + +fjes_probe + |-> fjes_hw_init + |-> fjes_hw_setup + |-> fjes_hw_exit + +However, when fjes_hw_setup fails, fjes_hw_exit won't be called and thus +all the resources allocated in fjes_hw_setup will be leaked. In this +patch, we free those resources in fjes_hw_setup and prevents such leaks. 
+ +Fixes: 2fcbca687702 ("fjes: platform_driver's .probe and .remove routine") +Signed-off-by: Zhipeng Lu +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20240122172445.3841883-1-alexious@zju.edu.cn +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/fjes/fjes_hw.c | 37 ++++++++++++++++++++++++++++++------- + 1 file changed, 30 insertions(+), 7 deletions(-) + +diff --git a/drivers/net/fjes/fjes_hw.c b/drivers/net/fjes/fjes_hw.c +index 704e949484d0..b9b5554ea862 100644 +--- a/drivers/net/fjes/fjes_hw.c ++++ b/drivers/net/fjes/fjes_hw.c +@@ -221,21 +221,25 @@ static int fjes_hw_setup(struct fjes_hw *hw) + + mem_size = FJES_DEV_REQ_BUF_SIZE(hw->max_epid); + hw->hw_info.req_buf = kzalloc(mem_size, GFP_KERNEL); +- if (!(hw->hw_info.req_buf)) +- return -ENOMEM; ++ if (!(hw->hw_info.req_buf)) { ++ result = -ENOMEM; ++ goto free_ep_info; ++ } + + hw->hw_info.req_buf_size = mem_size; + + mem_size = FJES_DEV_RES_BUF_SIZE(hw->max_epid); + hw->hw_info.res_buf = kzalloc(mem_size, GFP_KERNEL); +- if (!(hw->hw_info.res_buf)) +- return -ENOMEM; ++ if (!(hw->hw_info.res_buf)) { ++ result = -ENOMEM; ++ goto free_req_buf; ++ } + + hw->hw_info.res_buf_size = mem_size; + + result = fjes_hw_alloc_shared_status_region(hw); + if (result) +- return result; ++ goto free_res_buf; + + hw->hw_info.buffer_share_bit = 0; + hw->hw_info.buffer_unshare_reserve_bit = 0; +@@ -246,11 +250,11 @@ static int fjes_hw_setup(struct fjes_hw *hw) + + result = fjes_hw_alloc_epbuf(&buf_pair->tx); + if (result) +- return result; ++ goto free_epbuf; + + result = fjes_hw_alloc_epbuf(&buf_pair->rx); + if (result) +- return result; ++ goto free_epbuf; + + spin_lock_irqsave(&hw->rx_status_lock, flags); + fjes_hw_setup_epbuf(&buf_pair->tx, mac, +@@ -273,6 +277,25 @@ static int fjes_hw_setup(struct fjes_hw *hw) + fjes_hw_init_command_registers(hw, &param); + + return 0; ++ ++free_epbuf: ++ for (epidx = 0; epidx < hw->max_epid ; epidx++) { ++ if (epidx == hw->my_epid) ++ continue; ++ 
fjes_hw_free_epbuf(&hw->ep_shm_info[epidx].tx); ++ fjes_hw_free_epbuf(&hw->ep_shm_info[epidx].rx); ++ } ++ fjes_hw_free_shared_status_region(hw); ++free_res_buf: ++ kfree(hw->hw_info.res_buf); ++ hw->hw_info.res_buf = NULL; ++free_req_buf: ++ kfree(hw->hw_info.req_buf); ++ hw->hw_info.req_buf = NULL; ++free_ep_info: ++ kfree(hw->ep_shm_info); ++ hw->ep_shm_info = NULL; ++ return result; + } + + static void fjes_hw_cleanup(struct fjes_hw *hw) +-- +2.43.0 + diff --git a/queue-6.6/i40e-handle-multi-buffer-packets-that-are-shrunk-by-.patch b/queue-6.6/i40e-handle-multi-buffer-packets-that-are-shrunk-by-.patch new file mode 100644 index 00000000000..bfb51a97493 --- /dev/null +++ b/queue-6.6/i40e-handle-multi-buffer-packets-that-are-shrunk-by-.patch @@ -0,0 +1,170 @@ +From 815b14ad14c128be5bd96027884270db7cf46ee7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:15:56 +0100 +Subject: i40e: handle multi-buffer packets that are shrunk by xdp prog + +From: Tirthendu Sarkar + +[ Upstream commit 83014323c642b8faa2d64a5f303b41c019322478 ] + +XDP programs can shrink packets by calling the bpf_xdp_adjust_tail() +helper function. For multi-buffer packets this may lead to reduction of +frag count stored in skb_shared_info area of the xdp_buff struct. This +results in issues with the current handling of XDP_PASS and XDP_DROP +cases. + +For XDP_PASS, currently skb is being built using frag count of +xdp_buffer before it was processed by XDP prog and thus will result in +an inconsistent skb when frag count gets reduced by XDP prog. To fix +this, get correct frag count while building the skb instead of using +pre-obtained frag count. + +For XDP_DROP, current page recycling logic will not reuse the page but +instead will adjust the pagecnt_bias so that the page can be freed. This +again results in inconsistent behavior as the page refcnt has already +been changed by the helper while freeing the frag(s) as part of +shrinking the packet. 
To fix this, only adjust pagecnt_bias for buffers +that are still part of the packet post-xdp prog run. + +Fixes: e213ced19bef ("i40e: add support for XDP multi-buffer Rx") +Reported-by: Maciej Fijalkowski +Signed-off-by: Tirthendu Sarkar +Link: https://lore.kernel.org/r/20240124191602.566724-6-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 40 ++++++++++++--------- + 1 file changed, 23 insertions(+), 17 deletions(-) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c +index b047c587629b..2e5546e549d9 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c +@@ -2100,7 +2100,8 @@ static void i40e_put_rx_buffer(struct i40e_ring *rx_ring, + static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res, + struct xdp_buff *xdp) + { +- u32 next = rx_ring->next_to_clean; ++ u32 nr_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags; ++ u32 next = rx_ring->next_to_clean, i = 0; + struct i40e_rx_buffer *rx_buffer; + + xdp->flags = 0; +@@ -2113,10 +2114,10 @@ static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res, + if (!rx_buffer->page) + continue; + +- if (xdp_res == I40E_XDP_CONSUMED) +- rx_buffer->pagecnt_bias++; +- else ++ if (xdp_res != I40E_XDP_CONSUMED) + i40e_rx_buffer_flip(rx_buffer, xdp->frame_sz); ++ else if (i++ <= nr_frags) ++ rx_buffer->pagecnt_bias++; + + /* EOP buffer will be put in i40e_clean_rx_irq() */ + if (next == rx_ring->next_to_process) +@@ -2130,20 +2131,20 @@ static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res, + * i40e_construct_skb - Allocate skb and populate it + * @rx_ring: rx descriptor ring to transact packets on + * @xdp: xdp_buff pointing to the data +- * @nr_frags: number of buffers for the packet + * + * This function allocates an skb. 
It then populates it with the page + * data from the current receive descriptor, taking care to set up the + * skb correctly. + */ + static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, +- struct xdp_buff *xdp, +- u32 nr_frags) ++ struct xdp_buff *xdp) + { + unsigned int size = xdp->data_end - xdp->data; + struct i40e_rx_buffer *rx_buffer; ++ struct skb_shared_info *sinfo; + unsigned int headlen; + struct sk_buff *skb; ++ u32 nr_frags = 0; + + /* prefetch first cache line of first page */ + net_prefetch(xdp->data); +@@ -2181,6 +2182,10 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, + memcpy(__skb_put(skb, headlen), xdp->data, + ALIGN(headlen, sizeof(long))); + ++ if (unlikely(xdp_buff_has_frags(xdp))) { ++ sinfo = xdp_get_shared_info_from_buff(xdp); ++ nr_frags = sinfo->nr_frags; ++ } + rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); + /* update all of the pointers */ + size -= headlen; +@@ -2200,9 +2205,8 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, + } + + if (unlikely(xdp_buff_has_frags(xdp))) { +- struct skb_shared_info *sinfo, *skinfo = skb_shinfo(skb); ++ struct skb_shared_info *skinfo = skb_shinfo(skb); + +- sinfo = xdp_get_shared_info_from_buff(xdp); + memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0], + sizeof(skb_frag_t) * nr_frags); + +@@ -2225,17 +2229,17 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, + * i40e_build_skb - Build skb around an existing buffer + * @rx_ring: Rx descriptor ring to transact packets on + * @xdp: xdp_buff pointing to the data +- * @nr_frags: number of buffers for the packet + * + * This function builds an skb around an existing Rx buffer, taking care + * to set up the skb correctly and avoid any memcpy overhead. 
+ */ + static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, +- struct xdp_buff *xdp, +- u32 nr_frags) ++ struct xdp_buff *xdp) + { + unsigned int metasize = xdp->data - xdp->data_meta; ++ struct skb_shared_info *sinfo; + struct sk_buff *skb; ++ u32 nr_frags; + + /* Prefetch first cache line of first page. If xdp->data_meta + * is unused, this points exactly as xdp->data, otherwise we +@@ -2244,6 +2248,11 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, + */ + net_prefetch(xdp->data_meta); + ++ if (unlikely(xdp_buff_has_frags(xdp))) { ++ sinfo = xdp_get_shared_info_from_buff(xdp); ++ nr_frags = sinfo->nr_frags; ++ } ++ + /* build an skb around the page buffer */ + skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz); + if (unlikely(!skb)) +@@ -2256,9 +2265,6 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, + skb_metadata_set(skb, metasize); + + if (unlikely(xdp_buff_has_frags(xdp))) { +- struct skb_shared_info *sinfo; +- +- sinfo = xdp_get_shared_info_from_buff(xdp); + xdp_update_skb_shared_info(skb, nr_frags, + sinfo->xdp_frags_size, + nr_frags * xdp->frame_sz, +@@ -2603,9 +2609,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget, + total_rx_bytes += size; + } else { + if (ring_uses_build_skb(rx_ring)) +- skb = i40e_build_skb(rx_ring, xdp, nfrags); ++ skb = i40e_build_skb(rx_ring, xdp); + else +- skb = i40e_construct_skb(rx_ring, xdp, nfrags); ++ skb = i40e_construct_skb(rx_ring, xdp); + + /* drop if we failed to retrieve a buffer */ + if (!skb) { +-- +2.43.0 + diff --git a/queue-6.6/i40e-set-xdp_rxq_info-frag_size.patch b/queue-6.6/i40e-set-xdp_rxq_info-frag_size.patch new file mode 100644 index 00000000000..e2118c1e7ab --- /dev/null +++ b/queue-6.6/i40e-set-xdp_rxq_info-frag_size.patch @@ -0,0 +1,130 @@ +From 1ea5026b3fb307201dd905d17d22c043163bd1fc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:16:01 +0100 +Subject: i40e: set xdp_rxq_info::frag_size + +From: Maciej 
Fijalkowski + +[ Upstream commit a045d2f2d03d23e7db6772dd83e0ba2705dfad93 ] + +i40e support XDP multi-buffer so it is supposed to use +__xdp_rxq_info_reg() instead of xdp_rxq_info_reg() and set the +frag_size. It can not be simply converted at existing callsite because +rx_buf_len could be un-initialized, so let us register xdp_rxq_info +within i40e_configure_rx_ring(), which happen to be called with already +initialized rx_buf_len value. + +Commit 5180ff1364bc ("i40e: use int for i40e_status") converted 'err' to +int, so two variables to deal with return codes are not needed within +i40e_configure_rx_ring(). Remove 'ret' and use 'err' to handle status +from xdp_rxq_info registration. + +Fixes: e213ced19bef ("i40e: add support for XDP multi-buffer Rx") +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-11-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_main.c | 40 ++++++++++++--------- + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 9 ----- + 2 files changed, 24 insertions(+), 25 deletions(-) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c +index 5b20eba93d04..aadca7b3443c 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ -3578,40 +3578,48 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) + struct i40e_hmc_obj_rxq rx_ctx; + int err = 0; + bool ok; +- int ret; + + bitmap_zero(ring->state, __I40E_RING_STATE_NBITS); + + /* clear the context structure first */ + memset(&rx_ctx, 0, sizeof(rx_ctx)); + +- if (ring->vsi->type == I40E_VSI_MAIN) +- xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); ++ ring->rx_buf_len = vsi->rx_buf_len; ++ ++ /* XDP RX-queue info only needed for RX rings exposed to XDP */ ++ if (ring->vsi->type != I40E_VSI_MAIN) ++ goto skip; ++ ++ if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { ++ err = 
__xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ++ ring->queue_index, ++ ring->q_vector->napi.napi_id, ++ ring->rx_buf_len); ++ if (err) ++ return err; ++ } + + ring->xsk_pool = i40e_xsk_pool(ring); + if (ring->xsk_pool) { +- ring->rx_buf_len = +- xsk_pool_get_rx_frame_size(ring->xsk_pool); +- ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, ++ ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool); ++ err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_XSK_BUFF_POOL, + NULL); +- if (ret) +- return ret; ++ if (err) ++ return err; + dev_info(&vsi->back->pdev->dev, + "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", + ring->queue_index); + + } else { +- ring->rx_buf_len = vsi->rx_buf_len; +- if (ring->vsi->type == I40E_VSI_MAIN) { +- ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, +- MEM_TYPE_PAGE_SHARED, +- NULL); +- if (ret) +- return ret; +- } ++ err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, ++ MEM_TYPE_PAGE_SHARED, ++ NULL); ++ if (err) ++ return err; + } + ++skip: + xdp_init_buff(&ring->xdp, i40e_rx_pg_size(ring) / 2, &ring->xdp_rxq); + + rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len, +diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c +index 2e5546e549d9..1df2f9338812 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c +@@ -1556,7 +1556,6 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring) + int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) + { + struct device *dev = rx_ring->dev; +- int err; + + u64_stats_init(&rx_ring->syncp); + +@@ -1577,14 +1576,6 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) + rx_ring->next_to_process = 0; + rx_ring->next_to_use = 0; + +- /* XDP RX-queue info only needed for RX rings exposed to XDP */ +- if (rx_ring->vsi->type == I40E_VSI_MAIN) { +- err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, +- rx_ring->queue_index, rx_ring->q_vector->napi.napi_id); +- if 
(err < 0) +- return err; +- } +- + rx_ring->xdp_prog = rx_ring->vsi->xdp_prog; + + rx_ring->rx_bi = +-- +2.43.0 + diff --git a/queue-6.6/i40e-update-xdp_rxq_info-frag_size-for-zc-enabled-rx.patch b/queue-6.6/i40e-update-xdp_rxq_info-frag_size-for-zc-enabled-rx.patch new file mode 100644 index 00000000000..1e42f1b6c7b --- /dev/null +++ b/queue-6.6/i40e-update-xdp_rxq_info-frag_size-for-zc-enabled-rx.patch @@ -0,0 +1,46 @@ +From c2cb36664bc2bd6cd8915c9bd1e82b10d85c8a0c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:16:02 +0100 +Subject: i40e: update xdp_rxq_info::frag_size for ZC enabled Rx queue + +From: Maciej Fijalkowski + +[ Upstream commit 0cbb08707c932b3f004bc1a8ec6200ef572c1f5f ] + +Now that i40e driver correctly sets up frag_size in xdp_rxq_info, let us +make it work for ZC multi-buffer as well. i40e_ring::rx_buf_len for ZC +is being set via xsk_pool_get_rx_frame_size() and this needs to be +propagated up to xdp_rxq_info. + +Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support") +Acked-by: Magnus Karlsson +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-12-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_main.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c +index aadca7b3443c..aad39ebff4ab 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ -3601,7 +3601,14 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) + + ring->xsk_pool = i40e_xsk_pool(ring); + if (ring->xsk_pool) { ++ xdp_rxq_info_unreg(&ring->xdp_rxq); + ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool); ++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ++ ring->queue_index, ++ ring->q_vector->napi.napi_id, ++ ring->rx_buf_len); ++ if (err) ++ return 
err; + err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_XSK_BUFF_POOL, + NULL); +-- +2.43.0 + diff --git a/queue-6.6/ice-remove-redundant-xdp_rxq_info-registration.patch b/queue-6.6/ice-remove-redundant-xdp_rxq_info-registration.patch new file mode 100644 index 00000000000..81d2c845f07 --- /dev/null +++ b/queue-6.6/ice-remove-redundant-xdp_rxq_info-registration.patch @@ -0,0 +1,58 @@ +From 0cdbd0273b7e38f65a667530e5aa5049a8a6634f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:15:57 +0100 +Subject: ice: remove redundant xdp_rxq_info registration + +From: Maciej Fijalkowski + +[ Upstream commit 2ee788c06493d02ee85855414cca39825e768aaf ] + +xdp_rxq_info struct can be registered by drivers via two functions - +xdp_rxq_info_reg() and __xdp_rxq_info_reg(). The latter one allows +drivers that support XDP multi-buffer to set up xdp_rxq_info::frag_size +which in turn will make it possible to grow the packet via +bpf_xdp_adjust_tail() BPF helper. + +Currently, ice registers xdp_rxq_info in two spots: +1) ice_setup_rx_ring() // via xdp_rxq_info_reg(), BUG +2) ice_vsi_cfg_rxq() // via __xdp_rxq_info_reg(), OK + +Cited commit under fixes tag took care of setting up frag_size and +updated registration scheme in 2) but it did not help as +1) is called before 2) and as shown above it uses old registration +function. This means that 2) sees that xdp_rxq_info is already +registered and never calls __xdp_rxq_info_reg() which leaves us with +xdp_rxq_info::frag_size being set to 0. + +To fix this misbehavior, simply remove xdp_rxq_info_reg() call from +ice_setup_rx_ring(). 
+ +Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side") +Acked-by: Magnus Karlsson +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-7-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/ice/ice_txrx.c | 5 ----- + 1 file changed, 5 deletions(-) + +diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c +index 5b0f9e53f6b4..24c914015973 100644 +--- a/drivers/net/ethernet/intel/ice/ice_txrx.c ++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c +@@ -513,11 +513,6 @@ int ice_setup_rx_ring(struct ice_rx_ring *rx_ring) + if (ice_is_xdp_ena_vsi(rx_ring->vsi)) + WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog); + +- if (rx_ring->vsi->type == ICE_VSI_PF && +- !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) +- if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, +- rx_ring->q_index, rx_ring->q_vector->napi.napi_id)) +- goto err; + return 0; + + err: +-- +2.43.0 + diff --git a/queue-6.6/ice-update-xdp_rxq_info-frag_size-for-zc-enabled-rx-.patch b/queue-6.6/ice-update-xdp_rxq_info-frag_size-for-zc-enabled-rx-.patch new file mode 100644 index 00000000000..775ff6b67ac --- /dev/null +++ b/queue-6.6/ice-update-xdp_rxq_info-frag_size-for-zc-enabled-rx-.patch @@ -0,0 +1,91 @@ +From f38782f1f2c6fc2c8e5b7c7f66b5ec2ca703bea6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:15:59 +0100 +Subject: ice: update xdp_rxq_info::frag_size for ZC enabled Rx queue + +From: Maciej Fijalkowski + +[ Upstream commit 3de38c87174225487fc93befeea7d380db80aef6 ] + +Now that ice driver correctly sets up frag_size in xdp_rxq_info, let us +make it work for ZC multi-buffer as well. ice_rx_ring::rx_buf_len for ZC +is being set via xsk_pool_get_rx_frame_size() and this needs to be +propagated up to xdp_rxq_info. 
+ +Use a bigger hammer and instead of unregistering only xdp_rxq_info's +memory model, unregister it altogether and register it again and have +xdp_rxq_info with correct frag_size value. + +Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support") +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-9-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/ice/ice_base.c | 37 ++++++++++++++--------- + 1 file changed, 23 insertions(+), 14 deletions(-) + +diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c +index 7fa43827a3f0..4f3e65b47cdc 100644 +--- a/drivers/net/ethernet/intel/ice/ice_base.c ++++ b/drivers/net/ethernet/intel/ice/ice_base.c +@@ -534,19 +534,27 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) + ring->rx_buf_len = ring->vsi->rx_buf_len; + + if (ring->vsi->type == ICE_VSI_PF) { +- if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) +- /* coverity[check_return] */ +- __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, +- ring->q_index, +- ring->q_vector->napi.napi_id, +- ring->vsi->rx_buf_len); ++ if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { ++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ++ ring->q_index, ++ ring->q_vector->napi.napi_id, ++ ring->rx_buf_len); ++ if (err) ++ return err; ++ } + + ring->xsk_pool = ice_xsk_pool(ring); + if (ring->xsk_pool) { +- xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); ++ xdp_rxq_info_unreg(&ring->xdp_rxq); + + ring->rx_buf_len = + xsk_pool_get_rx_frame_size(ring->xsk_pool); ++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ++ ring->q_index, ++ ring->q_vector->napi.napi_id, ++ ring->rx_buf_len); ++ if (err) ++ return err; + err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_XSK_BUFF_POOL, + NULL); +@@ -557,13 +565,14 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) + dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", + 
ring->q_index); + } else { +- if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) +- /* coverity[check_return] */ +- __xdp_rxq_info_reg(&ring->xdp_rxq, +- ring->netdev, +- ring->q_index, +- ring->q_vector->napi.napi_id, +- ring->vsi->rx_buf_len); ++ if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { ++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ++ ring->q_index, ++ ring->q_vector->napi.napi_id, ++ ring->rx_buf_len); ++ if (err) ++ return err; ++ } + + err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_PAGE_SHARED, +-- +2.43.0 + diff --git a/queue-6.6/ice-work-on-pre-xdp-prog-frag-count.patch b/queue-6.6/ice-work-on-pre-xdp-prog-frag-count.patch new file mode 100644 index 00000000000..835b3583624 --- /dev/null +++ b/queue-6.6/ice-work-on-pre-xdp-prog-frag-count.patch @@ -0,0 +1,170 @@ +From f73ea006fb6cee5b55577c94988f0e702012fe2c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:15:55 +0100 +Subject: ice: work on pre-XDP prog frag count + +From: Maciej Fijalkowski + +[ Upstream commit ad2047cf5d9313200e308612aed516548873d124 ] + +Fix an OOM panic in XDP_DRV mode when a XDP program shrinks a +multi-buffer packet by 4k bytes and then redirects it to an AF_XDP +socket. + +Since support for handling multi-buffer frames was added to XDP, usage +of bpf_xdp_adjust_tail() helper within XDP program can free the page +that given fragment occupies and in turn decrease the fragment count +within skb_shared_info that is embedded in xdp_buff struct. In current +ice driver codebase, it can become problematic when page recycling logic +decides not to reuse the page. In such case, __page_frag_cache_drain() +is used with ice_rx_buf::pagecnt_bias that was not adjusted after +refcount of page was changed by XDP prog which in turn does not drain +the refcount to 0 and page is never freed. + +To address this, let us store the count of frags before the XDP program +was executed on Rx ring struct. 
This will be used to compare with +current frag count from skb_shared_info embedded in xdp_buff. A smaller +value in the latter indicates that XDP prog freed frag(s). Then, for +given delta decrement pagecnt_bias for XDP_DROP verdict. + +While at it, let us also handle the EOP frag within +ice_set_rx_bufs_act() to make our life easier, so all of the adjustments +needed to be applied against freed frags are performed in the single +place. + +Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side") +Acked-by: Magnus Karlsson +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-5-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/ice/ice_txrx.c | 14 ++++++--- + drivers/net/ethernet/intel/ice/ice_txrx.h | 1 + + drivers/net/ethernet/intel/ice/ice_txrx_lib.h | 31 +++++++++++++------ + 3 files changed, 32 insertions(+), 14 deletions(-) + +diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c +index 52d0a126eb61..5b0f9e53f6b4 100644 +--- a/drivers/net/ethernet/intel/ice/ice_txrx.c ++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c +@@ -600,9 +600,7 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, + ret = ICE_XDP_CONSUMED; + } + exit: +- rx_buf->act = ret; +- if (unlikely(xdp_buff_has_frags(xdp))) +- ice_set_rx_bufs_act(xdp, rx_ring, ret); ++ ice_set_rx_bufs_act(xdp, rx_ring, ret); + } + + /** +@@ -890,14 +888,17 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, + } + + if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) { +- if (unlikely(xdp_buff_has_frags(xdp))) +- ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED); ++ ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED); + return -ENOMEM; + } + + __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page, + rx_buf->page_offset, size); + sinfo->xdp_frags_size += size; ++ /* remember frag count before XDP prog 
execution; bpf_xdp_adjust_tail() ++ * can pop off frags but driver has to handle it on its own ++ */ ++ rx_ring->nr_frags = sinfo->nr_frags; + + if (page_is_pfmemalloc(rx_buf->page)) + xdp_buff_set_frag_pfmemalloc(xdp); +@@ -1249,6 +1250,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) + + xdp->data = NULL; + rx_ring->first_desc = ntc; ++ rx_ring->nr_frags = 0; + continue; + construct_skb: + if (likely(ice_ring_uses_build_skb(rx_ring))) +@@ -1264,10 +1266,12 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) + ICE_XDP_CONSUMED); + xdp->data = NULL; + rx_ring->first_desc = ntc; ++ rx_ring->nr_frags = 0; + break; + } + xdp->data = NULL; + rx_ring->first_desc = ntc; ++ rx_ring->nr_frags = 0; + + stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S); + if (unlikely(ice_test_staterr(rx_desc->wb.status_error0, +diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h +index 166413fc33f4..407d4c320097 100644 +--- a/drivers/net/ethernet/intel/ice/ice_txrx.h ++++ b/drivers/net/ethernet/intel/ice/ice_txrx.h +@@ -333,6 +333,7 @@ struct ice_rx_ring { + struct ice_channel *ch; + struct ice_tx_ring *xdp_ring; + struct xsk_buff_pool *xsk_pool; ++ u32 nr_frags; + dma_addr_t dma; /* physical address of ring */ + u64 cached_phctime; + u16 rx_buf_len; +diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h +index 115969ecdf7b..b0e56675f98b 100644 +--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h ++++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h +@@ -12,26 +12,39 @@ + * act: action to store onto Rx buffers related to XDP buffer parts + * + * Set action that should be taken before putting Rx buffer from first frag +- * to one before last. Last one is handled by caller of this function as it +- * is the EOP frag that is currently being processed. This function is +- * supposed to be called only when XDP buffer contains frags. ++ * to the last. 
+ */ + static inline void + ice_set_rx_bufs_act(struct xdp_buff *xdp, const struct ice_rx_ring *rx_ring, + const unsigned int act) + { +- const struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); +- u32 first = rx_ring->first_desc; +- u32 nr_frags = sinfo->nr_frags; ++ u32 sinfo_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags; ++ u32 nr_frags = rx_ring->nr_frags + 1; ++ u32 idx = rx_ring->first_desc; + u32 cnt = rx_ring->count; + struct ice_rx_buf *buf; + + for (int i = 0; i < nr_frags; i++) { +- buf = &rx_ring->rx_buf[first]; ++ buf = &rx_ring->rx_buf[idx]; + buf->act = act; + +- if (++first == cnt) +- first = 0; ++ if (++idx == cnt) ++ idx = 0; ++ } ++ ++ /* adjust pagecnt_bias on frags freed by XDP prog */ ++ if (sinfo_frags < rx_ring->nr_frags && act == ICE_XDP_CONSUMED) { ++ u32 delta = rx_ring->nr_frags - sinfo_frags; ++ ++ while (delta) { ++ if (idx == 0) ++ idx = cnt - 1; ++ else ++ idx--; ++ buf = &rx_ring->rx_buf[idx]; ++ buf->pagecnt_bias--; ++ delta--; ++ } + } + } + +-- +2.43.0 + diff --git a/queue-6.6/intel-xsk-initialize-skb_frag_t-bv_offset-in-zc-driv.patch b/queue-6.6/intel-xsk-initialize-skb_frag_t-bv_offset-in-zc-driv.patch new file mode 100644 index 00000000000..a4686ed0152 --- /dev/null +++ b/queue-6.6/intel-xsk-initialize-skb_frag_t-bv_offset-in-zc-driv.patch @@ -0,0 +1,60 @@ +From 419c1ea790a3ac9d23f0cb06f5d86f0f6e4b2608 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:15:58 +0100 +Subject: intel: xsk: initialize skb_frag_t::bv_offset in ZC drivers + +From: Maciej Fijalkowski + +[ Upstream commit 290779905d09d5fdf6caa4f58ddefc3f4db0c0a9 ] + +Ice and i40e ZC drivers currently set offset of a frag within +skb_shared_info to 0, which is incorrect. xdp_buffs that come from +xsk_buff_pool always have 256 bytes of a headroom, so they need to be +taken into account to retrieve xdp_buff::data via skb_frag_address(). 
+Otherwise, bpf_xdp_frags_increase_tail() would be starting its job from +xdp_buff::data_hard_start which would result in overwriting existing +payload. + +Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support") +Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support") +Acked-by: Magnus Karlsson +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-8-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_xsk.c | 3 ++- + drivers/net/ethernet/intel/ice/ice_xsk.c | 3 ++- + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c +index b75e6b6d317c..1f8ae6f5d980 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c +@@ -418,7 +418,8 @@ i40e_add_xsk_frag(struct i40e_ring *rx_ring, struct xdp_buff *first, + } + + __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, +- virt_to_page(xdp->data_hard_start), 0, size); ++ virt_to_page(xdp->data_hard_start), ++ XDP_PACKET_HEADROOM, size); + sinfo->xdp_frags_size += size; + xsk_buff_add_frag(xdp); + +diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c +index 33f194c870bb..307c609137bd 100644 +--- a/drivers/net/ethernet/intel/ice/ice_xsk.c ++++ b/drivers/net/ethernet/intel/ice/ice_xsk.c +@@ -826,7 +826,8 @@ ice_add_xsk_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *first, + } + + __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, +- virt_to_page(xdp->data_hard_start), 0, size); ++ virt_to_page(xdp->data_hard_start), ++ XDP_PACKET_HEADROOM, size); + sinfo->xdp_frags_size += size; + xsk_buff_add_frag(xdp); + +-- +2.43.0 + diff --git a/queue-6.6/ipv6-init-the-accept_queue-s-spinlocks-in-inet6_crea.patch b/queue-6.6/ipv6-init-the-accept_queue-s-spinlocks-in-inet6_crea.patch new file mode 100644 index 
00000000000..5d0ec48061b --- /dev/null +++ b/queue-6.6/ipv6-init-the-accept_queue-s-spinlocks-in-inet6_crea.patch @@ -0,0 +1,70 @@ +From aaeffb5c99d462e2f6f3a27a6ce9171c60ed2741 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 18:20:01 +0800 +Subject: ipv6: init the accept_queue's spinlocks in inet6_create + +From: Zhengchao Shao + +[ Upstream commit 435e202d645c197dcfd39d7372eb2a56529b6640 ] + +In commit 198bc90e0e73("tcp: make sure init the accept_queue's spinlocks +once"), the spinlocks of accept_queue are initialized only when socket is +created in the inet4 scenario. The locks are not initialized when socket +is created in the inet6 scenario. The kernel reports the following error: +INFO: trying to register non-static key. +The code is fine but needs lockdep annotation, or maybe +you didn't initialize this object before use? +turning off the locking correctness validator. +Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 +Call Trace: + + dump_stack_lvl (lib/dump_stack.c:107) + register_lock_class (kernel/locking/lockdep.c:1289) + __lock_acquire (kernel/locking/lockdep.c:5015) + lock_acquire.part.0 (kernel/locking/lockdep.c:5756) + _raw_spin_lock_bh (kernel/locking/spinlock.c:178) + inet_csk_listen_stop (net/ipv4/inet_connection_sock.c:1386) + tcp_disconnect (net/ipv4/tcp.c:2981) + inet_shutdown (net/ipv4/af_inet.c:935) + __sys_shutdown (./include/linux/file.h:32 net/socket.c:2438) + __x64_sys_shutdown (net/socket.c:2445) + do_syscall_64 (arch/x86/entry/common.c:52) + entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) +RIP: 0033:0x7f52ecd05a3d +Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 +48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff +ff 73 01 c3 48 8b 0d ab a3 0e 00 f7 d8 64 89 01 48 +RSP: 002b:00007f52ecf5dde8 EFLAGS: 00000293 ORIG_RAX: 0000000000000030 +RAX: ffffffffffffffda RBX: 00007f52ecf5e640 RCX: 00007f52ecd05a3d +RDX: 00007f52ecc8b188 RSI: 0000000000000000 RDI: 
0000000000000004 +RBP: 00007f52ecf5de20 R08: 00007ffdae45c69f R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000293 R12: 00007f52ecf5e640 +R13: 0000000000000000 R14: 00007f52ecc8b060 R15: 00007ffdae45c6e0 + +Fixes: 198bc90e0e73 ("tcp: make sure init the accept_queue's spinlocks once") +Signed-off-by: Zhengchao Shao +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/r/20240122102001.2851701-1-shaozhengchao@huawei.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + net/ipv6/af_inet6.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c +index 368824fe9719..b6c5b5e25a2f 100644 +--- a/net/ipv6/af_inet6.c ++++ b/net/ipv6/af_inet6.c +@@ -199,6 +199,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol, + if (INET_PROTOSW_REUSE & answer_flags) + sk->sk_reuse = SK_CAN_REUSE; + ++ if (INET_PROTOSW_ICSK & answer_flags) ++ inet_init_csk_locks(sk); ++ + inet = inet_sk(sk); + inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); + +-- +2.43.0 + diff --git a/queue-6.6/llc-drop-support-for-eth_p_tr_802_2.patch b/queue-6.6/llc-drop-support-for-eth_p_tr_802_2.patch new file mode 100644 index 00000000000..ea3b04dc066 --- /dev/null +++ b/queue-6.6/llc-drop-support-for-eth_p_tr_802_2.patch @@ -0,0 +1,130 @@ +From c75db2d6fe464bfca6de917ba19e9eb3c5e80953 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 17:55:15 -0800 +Subject: llc: Drop support for ETH_P_TR_802_2. + +From: Kuniyuki Iwashima + +[ Upstream commit e3f9bed9bee261e3347131764e42aeedf1ffea61 ] + +syzbot reported an uninit-value bug below. [0] + +llc supports ETH_P_802_2 (0x0004) and used to support ETH_P_TR_802_2 +(0x0011), and syzbot abused the latter to trigger the bug. 
+ + write$tun(r0, &(0x7f0000000040)={@val={0x0, 0x11}, @val, @mpls={[], @llc={@snap={0xaa, 0x1, ')', "90e5dd"}}}}, 0x16) + +llc_conn_handler() initialises local variables {saddr,daddr}.mac +based on skb in llc_pdu_decode_sa()/llc_pdu_decode_da() and passes +them to __llc_lookup(). + +However, the initialisation is done only when skb->protocol is +htons(ETH_P_802_2), otherwise, __llc_lookup_established() and +__llc_lookup_listener() will read garbage. + +The missing initialisation existed prior to commit 211ed865108e +("net: delete all instances of special processing for token ring"). + +It removed the part to kick out the token ring stuff but forgot to +close the door allowing ETH_P_TR_802_2 packets to sneak into llc_rcv(). + +Let's remove llc_tr_packet_type and complete the deprecation. + +[0]: +BUG: KMSAN: uninit-value in __llc_lookup_established+0xe9d/0xf90 + __llc_lookup_established+0xe9d/0xf90 + __llc_lookup net/llc/llc_conn.c:611 [inline] + llc_conn_handler+0x4bd/0x1360 net/llc/llc_conn.c:791 + llc_rcv+0xfbb/0x14a0 net/llc/llc_input.c:206 + __netif_receive_skb_one_core net/core/dev.c:5527 [inline] + __netif_receive_skb+0x1a6/0x5a0 net/core/dev.c:5641 + netif_receive_skb_internal net/core/dev.c:5727 [inline] + netif_receive_skb+0x58/0x660 net/core/dev.c:5786 + tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1555 + tun_get_user+0x53af/0x66d0 drivers/net/tun.c:2002 + tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 + call_write_iter include/linux/fs.h:2020 [inline] + new_sync_write fs/read_write.c:491 [inline] + vfs_write+0x8ef/0x1490 fs/read_write.c:584 + ksys_write+0x20f/0x4c0 fs/read_write.c:637 + __do_sys_write fs/read_write.c:649 [inline] + __se_sys_write fs/read_write.c:646 [inline] + __x64_sys_write+0x93/0xd0 fs/read_write.c:646 + do_syscall_x64 arch/x86/entry/common.c:51 [inline] + do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 + entry_SYSCALL_64_after_hwframe+0x63/0x6b + +Local variable daddr created at: + llc_conn_handler+0x53/0x1360 
net/llc/llc_conn.c:783 + llc_rcv+0xfbb/0x14a0 net/llc/llc_input.c:206 + +CPU: 1 PID: 5004 Comm: syz-executor994 Not tainted 6.6.0-syzkaller-14500-g1c41041124bd #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 + +Fixes: 211ed865108e ("net: delete all instances of special processing for token ring") +Reported-by: syzbot+b5ad66046b913bc04c6f@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=b5ad66046b913bc04c6f +Signed-off-by: Kuniyuki Iwashima +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/r/20240119015515.61898-1-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + include/net/llc_pdu.h | 6 ++---- + net/llc/llc_core.c | 7 ------- + 2 files changed, 2 insertions(+), 11 deletions(-) + +diff --git a/include/net/llc_pdu.h b/include/net/llc_pdu.h +index 7e73f8e5e497..1d55ba7c45be 100644 +--- a/include/net/llc_pdu.h ++++ b/include/net/llc_pdu.h +@@ -262,8 +262,7 @@ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type, + */ + static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) + { +- if (skb->protocol == htons(ETH_P_802_2)) +- memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); ++ memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); + } + + /** +@@ -275,8 +274,7 @@ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) + */ + static inline void llc_pdu_decode_da(struct sk_buff *skb, u8 *da) + { +- if (skb->protocol == htons(ETH_P_802_2)) +- memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); ++ memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); + } + + /** +diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c +index 6e387aadffce..4f16d9c88350 100644 +--- a/net/llc/llc_core.c ++++ b/net/llc/llc_core.c +@@ -135,22 +135,15 @@ static struct packet_type llc_packet_type __read_mostly = { + .func = llc_rcv, + }; + +-static struct packet_type llc_tr_packet_type __read_mostly = { +- .type = cpu_to_be16(ETH_P_TR_802_2), +- .func = llc_rcv, +-}; +- + static int 
__init llc_init(void) + { + dev_add_pack(&llc_packet_type); +- dev_add_pack(&llc_tr_packet_type); + return 0; + } + + static void __exit llc_exit(void) + { + dev_remove_pack(&llc_packet_type); +- dev_remove_pack(&llc_tr_packet_type); + } + + module_init(llc_init); +-- +2.43.0 + diff --git a/queue-6.6/llc-make-llc_ui_sendmsg-more-robust-against-bonding-.patch b/queue-6.6/llc-make-llc_ui_sendmsg-more-robust-against-bonding-.patch new file mode 100644 index 00000000000..7510b544110 --- /dev/null +++ b/queue-6.6/llc-make-llc_ui_sendmsg-more-robust-against-bonding-.patch @@ -0,0 +1,154 @@ +From 944594f7e0e1ca09dc19ebf731dd411ee07a1690 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 18:36:25 +0000 +Subject: llc: make llc_ui_sendmsg() more robust against bonding changes + +From: Eric Dumazet + +[ Upstream commit dad555c816a50c6a6a8a86be1f9177673918c647 ] + +syzbot was able to trick llc_ui_sendmsg(), allocating an skb with no +headroom, but subsequently trying to push 14 bytes of Ethernet header [1] + +Like some others, llc_ui_sendmsg() releases the socket lock before +calling sock_alloc_send_skb(). +Then it acquires it again, but does not redo all the sanity checks +that were performed. + +This fix: + +- Uses LL_RESERVED_SPACE() to reserve space. +- Check all conditions again after socket lock is held again. +- Do not account Ethernet header for mtu limitation. + +[1] + +skbuff: skb_under_panic: text:ffff800088baa334 len:1514 put:14 head:ffff0000c9c37000 data:ffff0000c9c36ff2 tail:0x5dc end:0x6c0 dev:bond0 + + kernel BUG at net/core/skbuff.c:193 ! 
+Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP +Modules linked in: +CPU: 0 PID: 6875 Comm: syz-executor.0 Not tainted 6.7.0-rc8-syzkaller-00101-g0802e17d9aca-dirty #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 +pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) + pc : skb_panic net/core/skbuff.c:189 [inline] + pc : skb_under_panic+0x13c/0x140 net/core/skbuff.c:203 + lr : skb_panic net/core/skbuff.c:189 [inline] + lr : skb_under_panic+0x13c/0x140 net/core/skbuff.c:203 +sp : ffff800096f97000 +x29: ffff800096f97010 x28: ffff80008cc8d668 x27: dfff800000000000 +x26: ffff0000cb970c90 x25: 00000000000005dc x24: ffff0000c9c36ff2 +x23: ffff0000c9c37000 x22: 00000000000005ea x21: 00000000000006c0 +x20: 000000000000000e x19: ffff800088baa334 x18: 1fffe000368261ce +x17: ffff80008e4ed000 x16: ffff80008a8310f8 x15: 0000000000000001 +x14: 1ffff00012df2d58 x13: 0000000000000000 x12: 0000000000000000 +x11: 0000000000000001 x10: 0000000000ff0100 x9 : e28a51f1087e8400 +x8 : e28a51f1087e8400 x7 : ffff80008028f8d0 x6 : 0000000000000000 +x5 : 0000000000000001 x4 : 0000000000000001 x3 : ffff800082b78714 +x2 : 0000000000000001 x1 : 0000000100000000 x0 : 0000000000000089 +Call trace: + skb_panic net/core/skbuff.c:189 [inline] + skb_under_panic+0x13c/0x140 net/core/skbuff.c:203 + skb_push+0xf0/0x108 net/core/skbuff.c:2451 + eth_header+0x44/0x1f8 net/ethernet/eth.c:83 + dev_hard_header include/linux/netdevice.h:3188 [inline] + llc_mac_hdr_init+0x110/0x17c net/llc/llc_output.c:33 + llc_sap_action_send_xid_c+0x170/0x344 net/llc/llc_s_ac.c:85 + llc_exec_sap_trans_actions net/llc/llc_sap.c:153 [inline] + llc_sap_next_state net/llc/llc_sap.c:182 [inline] + llc_sap_state_process+0x1ec/0x774 net/llc/llc_sap.c:209 + llc_build_and_send_xid_pkt+0x12c/0x1c0 net/llc/llc_sap.c:270 + llc_ui_sendmsg+0x7bc/0xb1c net/llc/af_llc.c:997 + sock_sendmsg_nosec net/socket.c:730 [inline] + __sock_sendmsg net/socket.c:745 [inline] + 
sock_sendmsg+0x194/0x274 net/socket.c:767 + splice_to_socket+0x7cc/0xd58 fs/splice.c:881 + do_splice_from fs/splice.c:933 [inline] + direct_splice_actor+0xe4/0x1c0 fs/splice.c:1142 + splice_direct_to_actor+0x2a0/0x7e4 fs/splice.c:1088 + do_splice_direct+0x20c/0x348 fs/splice.c:1194 + do_sendfile+0x4bc/0xc70 fs/read_write.c:1254 + __do_sys_sendfile64 fs/read_write.c:1322 [inline] + __se_sys_sendfile64 fs/read_write.c:1308 [inline] + __arm64_sys_sendfile64+0x160/0x3b4 fs/read_write.c:1308 + __invoke_syscall arch/arm64/kernel/syscall.c:37 [inline] + invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:51 + el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:136 + do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:155 + el0_svc+0x54/0x158 arch/arm64/kernel/entry-common.c:678 + el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:696 + el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:595 +Code: aa1803e6 aa1903e7 a90023f5 94792f6a (d4210000) + +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Reported-and-tested-by: syzbot+2a7024e9502df538e8ef@syzkaller.appspotmail.com +Signed-off-by: Eric Dumazet +Reviewed-by: Kuniyuki Iwashima +Link: https://lore.kernel.org/r/20240118183625.4007013-1-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/llc/af_llc.c | 24 ++++++++++++++++-------- + 1 file changed, 16 insertions(+), 8 deletions(-) + +diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c +index 9b06c380866b..20551cfb7da6 100644 +--- a/net/llc/af_llc.c ++++ b/net/llc/af_llc.c +@@ -928,14 +928,15 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + */ + static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) + { ++ DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); +- DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); + int flags = msg->msg_flags; + int noblock = flags & MSG_DONTWAIT; ++ int rc 
= -EINVAL, copied = 0, hdrlen, hh_len; + struct sk_buff *skb = NULL; ++ struct net_device *dev; + size_t size = 0; +- int rc = -EINVAL, copied = 0, hdrlen; + + dprintk("%s: sending from %02X to %02X\n", __func__, + llc->laddr.lsap, llc->daddr.lsap); +@@ -955,22 +956,29 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) + if (rc) + goto out; + } +- hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr); ++ dev = llc->dev; ++ hh_len = LL_RESERVED_SPACE(dev); ++ hdrlen = llc_ui_header_len(sk, addr); + size = hdrlen + len; +- if (size > llc->dev->mtu) +- size = llc->dev->mtu; ++ size = min_t(size_t, size, READ_ONCE(dev->mtu)); + copied = size - hdrlen; + rc = -EINVAL; + if (copied < 0) + goto out; + release_sock(sk); +- skb = sock_alloc_send_skb(sk, size, noblock, &rc); ++ skb = sock_alloc_send_skb(sk, hh_len + size, noblock, &rc); + lock_sock(sk); + if (!skb) + goto out; +- skb->dev = llc->dev; ++ if (sock_flag(sk, SOCK_ZAPPED) || ++ llc->dev != dev || ++ hdrlen != llc_ui_header_len(sk, addr) || ++ hh_len != LL_RESERVED_SPACE(dev) || ++ size > READ_ONCE(dev->mtu)) ++ goto out; ++ skb->dev = dev; + skb->protocol = llc_proto_type(addr->sllc_arphrd); +- skb_reserve(skb, hdrlen); ++ skb_reserve(skb, hh_len + hdrlen); + rc = memcpy_from_msg(skb_put(skb, copied), msg, copied); + if (rc) + goto out; +-- +2.43.0 + diff --git a/queue-6.6/net-fec-fix-the-unhandled-context-fault-from-smmu.patch b/queue-6.6/net-fec-fix-the-unhandled-context-fault-from-smmu.patch new file mode 100644 index 00000000000..ff8706c35ad --- /dev/null +++ b/queue-6.6/net-fec-fix-the-unhandled-context-fault-from-smmu.patch @@ -0,0 +1,58 @@ +From 1c045bea0adc7b49aa3ff4b77a482810aca37ccb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Jan 2024 10:51:41 -0600 +Subject: net: fec: fix the unhandled context fault from smmu + +From: Shenwei Wang + +[ Upstream commit 5e344807735023cd3a67c37a1852b849caa42620 ] + +When repeatedly changing the interface link speed 
using the command below: + +ethtool -s eth0 speed 100 duplex full +ethtool -s eth0 speed 1000 duplex full + +The following errors may sometimes be reported by the ARM SMMU driver: + +[ 5395.035364] fec 5b040000.ethernet eth0: Link is Down +[ 5395.039255] arm-smmu 51400000.iommu: Unhandled context fault: +fsr=0x402, iova=0x00000000, fsynr=0x100001, cbfrsynra=0x852, cb=2 +[ 5398.108460] fec 5b040000.ethernet eth0: Link is Up - 100Mbps/Full - +flow control off + +It is identified that the FEC driver does not properly stop the TX queue +during the link speed transitions, and this results in the invalid virtual +I/O address translations from the SMMU and causes the context faults. + +Fixes: dbc64a8ea231 ("net: fec: move calls to quiesce/resume packet processing out of fec_restart()") +Signed-off-by: Shenwei Wang +Link: https://lore.kernel.org/r/20240123165141.2008104-1-shenwei.wang@nxp.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/freescale/fec_main.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c +index 35c95f07fd6d..54da59286df4 100644 +--- a/drivers/net/ethernet/freescale/fec_main.c ++++ b/drivers/net/ethernet/freescale/fec_main.c +@@ -2011,6 +2011,7 @@ static void fec_enet_adjust_link(struct net_device *ndev) + + /* if any of the above changed restart the FEC */ + if (status_change) { ++ netif_stop_queue(ndev); + napi_disable(&fep->napi); + netif_tx_lock_bh(ndev); + fec_restart(ndev); +@@ -2020,6 +2021,7 @@ static void fec_enet_adjust_link(struct net_device *ndev) + } + } else { + if (fep->link) { ++ netif_stop_queue(ndev); + napi_disable(&fep->napi); + netif_tx_lock_bh(ndev); + fec_stop(ndev); +-- +2.43.0 + diff --git a/queue-6.6/net-fix-removing-a-namespace-with-conflicting-altnam.patch b/queue-6.6/net-fix-removing-a-namespace-with-conflicting-altnam.patch new file mode 100644 index 00000000000..2fbce0faff8 --- /dev/null +++ 
b/queue-6.6/net-fix-removing-a-namespace-with-conflicting-altnam.patch @@ -0,0 +1,81 @@ +From 43571c969a161b4d3602a3f69d022be34180c33f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 16:58:59 -0800 +Subject: net: fix removing a namespace with conflicting altnames +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Jakub Kicinski + +[ Upstream commit d09486a04f5da0a812c26217213b89a3b1acf836 ] + +Mark reports a BUG() when a net namespace is removed. + + kernel BUG at net/core/dev.c:11520! + +Physical interfaces moved outside of init_net get "refunded" +to init_net when that namespace disappears. The main interface +name may get overwritten in the process if it would have +conflicted. We need to also discard all conflicting altnames. +Recent fixes addressed ensuring that altnames get moved +with the main interface, which surfaced this problem. + +Reported-by: Марк Коренберг +Link: https://lore.kernel.org/all/CAEmTpZFZ4Sv3KwqFOY2WKDHeZYdi0O7N5H1nTvcGp=SAEavtDg@mail.gmail.com/ +Fixes: 7663d522099e ("net: check for altname conflicts when changing netdev's netns") +Signed-off-by: Jakub Kicinski +Reviewed-by: Eric Dumazet +Reviewed-by: Jiri Pirko +Reviewed-by: Xin Long +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + net/core/dev.c | 9 +++++++++ + net/core/dev.h | 3 +++ + 2 files changed, 12 insertions(+) + +diff --git a/net/core/dev.c b/net/core/dev.c +index e480afb50d4c..d72a4ff689ca 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -11491,6 +11491,7 @@ static struct pernet_operations __net_initdata netdev_net_ops = { + + static void __net_exit default_device_exit_net(struct net *net) + { ++ struct netdev_name_node *name_node, *tmp; + struct net_device *dev, *aux; + /* + * Push all migratable network devices back to the +@@ -11513,6 +11514,14 @@ static void __net_exit default_device_exit_net(struct net *net) + snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); + if (netdev_name_in_use(&init_net, fb_name)) + snprintf(fb_name, IFNAMSIZ, "dev%%d"); ++ ++ netdev_for_each_altname_safe(dev, name_node, tmp) ++ if (netdev_name_in_use(&init_net, name_node->name)) { ++ netdev_name_node_del(name_node); ++ synchronize_rcu(); ++ __netdev_name_node_alt_destroy(name_node); ++ } ++ + err = dev_change_net_namespace(dev, &init_net, fb_name); + if (err) { + pr_emerg("%s: failed to move %s to init_net: %d\n", +diff --git a/net/core/dev.h b/net/core/dev.h +index fa2e9c5c4122..f2037d402144 100644 +--- a/net/core/dev.h ++++ b/net/core/dev.h +@@ -64,6 +64,9 @@ int dev_change_name(struct net_device *dev, const char *newname); + + #define netdev_for_each_altname(dev, namenode) \ + list_for_each_entry((namenode), &(dev)->name_node->list, list) ++#define netdev_for_each_altname_safe(dev, namenode, next) \ ++ list_for_each_entry_safe((namenode), (next), &(dev)->name_node->list, \ ++ list) + + int netdev_name_node_alt_create(struct net_device *dev, const char *name); + int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); +-- +2.43.0 + diff --git a/queue-6.6/net-micrel-fix-ptp-frame-parsing-for-lan8814.patch b/queue-6.6/net-micrel-fix-ptp-frame-parsing-for-lan8814.patch new file mode 100644 index 00000000000..f6c06fdd472 --- /dev/null 
+++ b/queue-6.6/net-micrel-fix-ptp-frame-parsing-for-lan8814.patch @@ -0,0 +1,61 @@ +From fa78e95cef05ac17e903022c324f296ede7b9f45 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 11:47:50 +0100 +Subject: net: micrel: Fix PTP frame parsing for lan8814 + +From: Horatiu Vultur + +[ Upstream commit aaf632f7ab6dec57bc9329a438f94504fe8034b9 ] + +The HW has the capability to check each frame if it is a PTP frame, +which domain it is, which ptp frame type it is, different ip address in +the frame. And if one of these checks fail then the frame is not +timestamp. Most of these checks were disabled except checking the field +minorVersionPTP inside the PTP header. Meaning that once a partner sends +a frame compliant to 8021AS which has minorVersionPTP set to 1, then the +frame was not timestamp because the HW expected by default a value of 0 +in minorVersionPTP. This is exactly the same issue as on lan8841. +Fix this issue by removing this check so the userspace can decide on this. + +Fixes: ece19502834d ("net: phy: micrel: 1588 support for LAN8814 phy") +Signed-off-by: Horatiu Vultur +Reviewed-by: Maxime Chevallier +Reviewed-by: Divya Koppera +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/phy/micrel.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c +index dfd5f8e78e29..27ca25bbd141 100644 +--- a/drivers/net/phy/micrel.c ++++ b/drivers/net/phy/micrel.c +@@ -120,6 +120,11 @@ + */ + #define LAN8814_1PPM_FORMAT 17179 + ++#define PTP_RX_VERSION 0x0248 ++#define PTP_TX_VERSION 0x0288 ++#define PTP_MAX_VERSION(x) (((x) & GENMASK(7, 0)) << 8) ++#define PTP_MIN_VERSION(x) ((x) & GENMASK(7, 0)) ++ + #define PTP_RX_MOD 0x024F + #define PTP_RX_MOD_BAD_UDPV4_CHKSUM_FORCE_FCS_DIS_ BIT(3) + #define PTP_RX_TIMESTAMP_EN 0x024D +@@ -3125,6 +3130,12 @@ static void lan8814_ptp_init(struct phy_device *phydev) + lanphy_write_page_reg(phydev, 5, PTP_TX_PARSE_IP_ADDR_EN, 0); + lanphy_write_page_reg(phydev, 5, PTP_RX_PARSE_IP_ADDR_EN, 0); + ++ /* Disable checking for minorVersionPTP field */ ++ lanphy_write_page_reg(phydev, 5, PTP_RX_VERSION, ++ PTP_MAX_VERSION(0xff) | PTP_MIN_VERSION(0x0)); ++ lanphy_write_page_reg(phydev, 5, PTP_TX_VERSION, ++ PTP_MAX_VERSION(0xff) | PTP_MIN_VERSION(0x0)); ++ + skb_queue_head_init(&ptp_priv->tx_queue); + skb_queue_head_init(&ptp_priv->rx_queue); + INIT_LIST_HEAD(&ptp_priv->rx_ts_list); +-- +2.43.0 + diff --git a/queue-6.6/net-mlx5-bridge-enable-mcast-in-smfs-steering-mode.patch b/queue-6.6/net-mlx5-bridge-enable-mcast-in-smfs-steering-mode.patch new file mode 100644 index 00000000000..fc3a774fc3b --- /dev/null +++ b/queue-6.6/net-mlx5-bridge-enable-mcast-in-smfs-steering-mode.patch @@ -0,0 +1,77 @@ +From 25ad4a55df0ff2b5837783caa8ab9befc0635bd1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 28 Aug 2023 14:20:00 +0300 +Subject: net/mlx5: Bridge, Enable mcast in smfs steering mode + +From: Erez Shitrit + +[ Upstream commit 653b7eb9d74426397c95061fd57da3063625af65 ] + +In order to have mcast offloads the driver needs the following: +It should know if that mcast comes from wire port, in addition the flow 
+should not be marked as any specific source, that way it will give the +flexibility for the driver not to be depended on the way iterator +implemented in the FW. + +Signed-off-by: Erez Shitrit +Reviewed-by: Moshe Shemesh +Reviewed-by: Vlad Buslov +Signed-off-by: Saeed Mahameed +Stable-dep-of: ec7cc38ef9f8 ("net/mlx5: Bridge, fix multicast packets sent to uplink") +Signed-off-by: Sasha Levin +--- + .../ethernet/mellanox/mlx5/core/esw/bridge_mcast.c | 11 ++--------- + include/linux/mlx5/fs.h | 1 + + 2 files changed, 3 insertions(+), 9 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c +index 7a01714b3780..a7ed87e9d842 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c +@@ -78,6 +78,8 @@ mlx5_esw_bridge_mdb_flow_create(u16 esw_owner_vhca_id, struct mlx5_esw_bridge_md + xa_for_each(&entry->ports, idx, port) { + dests[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE; + dests[i].ft = port->mcast.ft; ++ if (port->vport_num == MLX5_VPORT_UPLINK) ++ dests[i].ft->flags |= MLX5_FLOW_TABLE_UPLINK_VPORT; + i++; + } + +@@ -585,10 +587,6 @@ mlx5_esw_bridge_mcast_vlan_flow_create(u16 vlan_proto, struct mlx5_esw_bridge_po + if (!rule_spec) + return ERR_PTR(-ENOMEM); + +- if (MLX5_CAP_ESW_FLOWTABLE(bridge->br_offloads->esw->dev, flow_source) && +- port->vport_num == MLX5_VPORT_UPLINK) +- rule_spec->flow_context.flow_source = +- MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT; + rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; +@@ -660,11 +658,6 @@ mlx5_esw_bridge_mcast_fwd_flow_create(struct mlx5_esw_bridge_port *port) + if (!rule_spec) + return ERR_PTR(-ENOMEM); + +- if (MLX5_CAP_ESW_FLOWTABLE(bridge->br_offloads->esw->dev, flow_source) && +- port->vport_num == MLX5_VPORT_UPLINK) +- rule_spec->flow_context.flow_source = +- 
MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT; +- + if (MLX5_CAP_ESW(bridge->br_offloads->esw->dev, merged_eswitch)) { + dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID; + dest.vport.vhca_id = port->esw_owner_vhca_id; +diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h +index 1e00c2436377..6f7725238abc 100644 +--- a/include/linux/mlx5/fs.h ++++ b/include/linux/mlx5/fs.h +@@ -67,6 +67,7 @@ enum { + MLX5_FLOW_TABLE_TERMINATION = BIT(2), + MLX5_FLOW_TABLE_UNMANAGED = BIT(3), + MLX5_FLOW_TABLE_OTHER_VPORT = BIT(4), ++ MLX5_FLOW_TABLE_UPLINK_VPORT = BIT(5), + }; + + #define LEFTOVERS_RULE_NUM 2 +-- +2.43.0 + diff --git a/queue-6.6/net-mlx5-bridge-fix-multicast-packets-sent-to-uplink.patch b/queue-6.6/net-mlx5-bridge-fix-multicast-packets-sent-to-uplink.patch new file mode 100644 index 00000000000..e77312254ad --- /dev/null +++ b/queue-6.6/net-mlx5-bridge-fix-multicast-packets-sent-to-uplink.patch @@ -0,0 +1,94 @@ +From 3e9a05b0b360985ccfc992da823351a870910b36 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 30 Dec 2023 22:40:37 +0200 +Subject: net/mlx5: Bridge, fix multicast packets sent to uplink + +From: Moshe Shemesh + +[ Upstream commit ec7cc38ef9f83553102e84c82536971a81630739 ] + +To enable multicast packets which are offloaded in bridge multicast +offload mode to be sent also to uplink, FTE bit uplink_hairpin_en should +be set. Add this bit to FTE for the bridge multicast offload rules. 
+ +Fixes: 18c2916cee12 ("net/mlx5: Bridge, snoop igmp/mld packets") +Signed-off-by: Moshe Shemesh +Reviewed-by: Gal Pressman +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c | 3 +++ + drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 2 ++ + include/linux/mlx5/fs.h | 1 + + include/linux/mlx5/mlx5_ifc.h | 2 +- + 4 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c +index a7ed87e9d842..22dd30cf8033 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c +@@ -83,6 +83,7 @@ mlx5_esw_bridge_mdb_flow_create(u16 esw_owner_vhca_id, struct mlx5_esw_bridge_md + i++; + } + ++ rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN; + rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + dmac_v = MLX5_ADDR_OF(fte_match_param, rule_spec->match_value, outer_headers.dmac_47_16); + ether_addr_copy(dmac_v, entry->key.addr); +@@ -587,6 +588,7 @@ mlx5_esw_bridge_mcast_vlan_flow_create(u16 vlan_proto, struct mlx5_esw_bridge_po + if (!rule_spec) + return ERR_PTR(-ENOMEM); + ++ rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN; + rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; +@@ -662,6 +664,7 @@ mlx5_esw_bridge_mcast_fwd_flow_create(struct mlx5_esw_bridge_port *port) + dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID; + dest.vport.vhca_id = port->esw_owner_vhca_id; + } ++ rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN; + handle = mlx5_add_flow_rules(port->mcast.ft, rule_spec, &flow_act, &dest, 1); + + kvfree(rule_spec); +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +index a4b925331661..b29299c49ab3 100644 +--- 
a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +@@ -566,6 +566,8 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, + fte->flow_context.flow_tag); + MLX5_SET(flow_context, in_flow_context, flow_source, + fte->flow_context.flow_source); ++ MLX5_SET(flow_context, in_flow_context, uplink_hairpin_en, ++ !!(fte->flow_context.flags & FLOW_CONTEXT_UPLINK_HAIRPIN_EN)); + + MLX5_SET(flow_context, in_flow_context, extended_destination, + extended_dest); +diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h +index 6f7725238abc..3fb428ce7d1c 100644 +--- a/include/linux/mlx5/fs.h ++++ b/include/linux/mlx5/fs.h +@@ -132,6 +132,7 @@ struct mlx5_flow_handle; + + enum { + FLOW_CONTEXT_HAS_TAG = BIT(0), ++ FLOW_CONTEXT_UPLINK_HAIRPIN_EN = BIT(1), + }; + + struct mlx5_flow_context { +diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h +index 8ac6ae79e083..51eb83f77938 100644 +--- a/include/linux/mlx5/mlx5_ifc.h ++++ b/include/linux/mlx5/mlx5_ifc.h +@@ -3536,7 +3536,7 @@ struct mlx5_ifc_flow_context_bits { + u8 action[0x10]; + + u8 extended_destination[0x1]; +- u8 reserved_at_81[0x1]; ++ u8 uplink_hairpin_en[0x1]; + u8 flow_source[0x2]; + u8 encrypt_decrypt_type[0x4]; + u8 destination_list_size[0x18]; +-- +2.43.0 + diff --git a/queue-6.6/net-mlx5-dr-can-t-go-to-uplink-vport-on-rx-rule.patch b/queue-6.6/net-mlx5-dr-can-t-go-to-uplink-vport-on-rx-rule.patch new file mode 100644 index 00000000000..db273b880ea --- /dev/null +++ b/queue-6.6/net-mlx5-dr-can-t-go-to-uplink-vport-on-rx-rule.patch @@ -0,0 +1,51 @@ +From 3f533d9b6010bac88a223b8ebc3d9db43dec4cf5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 17 Dec 2023 13:20:36 +0200 +Subject: net/mlx5: DR, Can't go to uplink vport on RX rule + +From: Yevgeny Kliteynik + +[ Upstream commit 5b2a2523eeea5f03d39a9d1ff1bad2e9f8eb98d2 ] + +Go-To-Vport action on RX is not allowed when the vport is uplink. 
+In such case, the packet should be dropped. + +Fixes: 9db810ed2d37 ("net/mlx5: DR, Expose steering action functionality") +Signed-off-by: Yevgeny Kliteynik +Reviewed-by: Erez Shitrit +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + .../mellanox/mlx5/core/steering/dr_action.c | 16 +++++++++++----- + 1 file changed, 11 insertions(+), 5 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +index 1a5aee8a7f13..90c38cbbde18 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +@@ -867,11 +867,17 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher, + action->sampler->tx_icm_addr; + break; + case DR_ACTION_TYP_VPORT: +- attr.hit_gvmi = action->vport->caps->vhca_gvmi; +- dest_action = action; +- attr.final_icm_addr = rx_rule ? +- action->vport->caps->icm_address_rx : +- action->vport->caps->icm_address_tx; ++ if (unlikely(rx_rule && action->vport->caps->num == MLX5_VPORT_UPLINK)) { ++ /* can't go to uplink on RX rule - dropping instead */ ++ attr.final_icm_addr = nic_dmn->drop_icm_addr; ++ attr.hit_gvmi = nic_dmn->drop_icm_addr >> 48; ++ } else { ++ attr.hit_gvmi = action->vport->caps->vhca_gvmi; ++ dest_action = action; ++ attr.final_icm_addr = rx_rule ? 
++ action->vport->caps->icm_address_rx : ++ action->vport->caps->icm_address_tx; ++ } + break; + case DR_ACTION_TYP_POP_VLAN: + if (!rx_rule && !(dmn->ste_ctx->actions_caps & +-- +2.43.0 + diff --git a/queue-6.6/net-mlx5-dr-use-the-right-gvmi-number-for-drop-actio.patch b/queue-6.6/net-mlx5-dr-use-the-right-gvmi-number-for-drop-actio.patch new file mode 100644 index 00000000000..dab133ff729 --- /dev/null +++ b/queue-6.6/net-mlx5-dr-use-the-right-gvmi-number-for-drop-actio.patch @@ -0,0 +1,39 @@ +From 7970f7db5b50822e26e6e1c43136dace5779369f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 17 Dec 2023 11:24:08 +0200 +Subject: net/mlx5: DR, Use the right GVMI number for drop action + +From: Yevgeny Kliteynik + +[ Upstream commit 5665954293f13642f9c052ead83c1e9d8cff186f ] + +When FW provides ICM addresses for drop RX/TX, the provided capability +is 64 bits that contain its GVMI as well as the ICM address itself. +In case of TX DROP this GVMI is different from the GVMI that the +domain is operating on. + +This patch fixes the action to use these GVMI IDs, as provided by FW. 
+ +Fixes: 9db810ed2d37 ("net/mlx5: DR, Expose steering action functionality") +Signed-off-by: Yevgeny Kliteynik +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +index 5b83da08692d..1a5aee8a7f13 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +@@ -781,6 +781,7 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher, + switch (action_type) { + case DR_ACTION_TYP_DROP: + attr.final_icm_addr = nic_dmn->drop_icm_addr; ++ attr.hit_gvmi = nic_dmn->drop_icm_addr >> 48; + break; + case DR_ACTION_TYP_FT: + dest_action = action; +-- +2.43.0 + diff --git a/queue-6.6/net-mlx5-fix-a-warn-upon-a-callback-command-failure.patch b/queue-6.6/net-mlx5-fix-a-warn-upon-a-callback-command-failure.patch new file mode 100644 index 00000000000..e346ee3430d --- /dev/null +++ b/queue-6.6/net-mlx5-fix-a-warn-upon-a-callback-command-failure.patch @@ -0,0 +1,149 @@ +From 578d3902c8cede082b4623c2bb915387b622b49d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 31 Dec 2023 15:19:50 +0200 +Subject: net/mlx5: Fix a WARN upon a callback command failure + +From: Yishai Hadas + +[ Upstream commit cc8091587779cfaddb6b29c9e9edb9079a282cad ] + +The below WARN [1] is reported once a callback command failed. + +As a callback runs under an interrupt context, needs to use the IRQ +save/restore variant. 
+ +[1] +DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context()) +WARNING: CPU: 15 PID: 0 at kernel/locking/lockdep.c:4353 + lockdep_hardirqs_on_prepare+0x11b/0x180 +Modules linked in: vhost_net vhost tap mlx5_vfio_pci +vfio_pci vfio_pci_core vfio_iommu_type1 vfio mlx5_vdpa vringh +vhost_iotlb vdpa nfnetlink_cttimeout openvswitch nsh ip6table_mangle +ip6table_nat ip6table_filter ip6_tables iptable_mangle +xt_conntrackxt_MASQUERADE nf_conntrack_netlink nfnetlink +xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 +auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi +scsi_transport_iscsi rdma_cm iw_cm ib_umad ib_ipoib ib_cm +mlx5_ib ib_uverbs ib_core fuse mlx5_core +CPU: 15 PID: 0 Comm: swapper/15 Tainted: G W 6.7.0-rc4+ #1587 +Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS +rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 +RIP: 0010:lockdep_hardirqs_on_prepare+0x11b/0x180 +Code: 00 5b c3 c3 e8 e6 0d 58 00 85 c0 74 d6 8b 15 f0 c3 + 76 01 85 d2 75 cc 48 c7 c6 04 a5 3b 82 48 c7 c7 f1 + e9 39 82 e8 95 12 f9 ff <0f> 0b 5b c3 e8 bc 0d 58 00 + 85 c0 74 ac 8b 3d c6 c3 76 01 85 ff 75 +RSP: 0018:ffffc900003ecd18 EFLAGS: 00010086 +RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000027 +RDX: 0000000000000000 RSI: ffff88885fbdb880 RDI: ffff88885fbdb888 +RBP: 00000000ffffff87 R08: 0000000000000000 R09: 0000000000000001 +R10: 0000000000000000 R11: 284e4f5f4e524157 R12: 00000000002c9aa1 +R13: ffff88810aace980 R14: ffff88810aace9b8 R15: 0000000000000003 +FS: 0000000000000000(0000) GS:ffff88885fbc0000(0000) +knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00007f731436f4c8 CR3: 000000010aae6001 CR4: 0000000000372eb0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +Call Trace: + +? __warn+0x81/0x170 +? lockdep_hardirqs_on_prepare+0x11b/0x180 +? report_bug+0xf8/0x1c0 +? handle_bug+0x3f/0x70 +? exc_invalid_op+0x13/0x60 +? 
asm_exc_invalid_op+0x16/0x20 +? lockdep_hardirqs_on_prepare+0x11b/0x180 +? lockdep_hardirqs_on_prepare+0x11b/0x180 +trace_hardirqs_on+0x4a/0xa0 +raw_spin_unlock_irq+0x24/0x30 +cmd_status_err+0xc0/0x1a0 [mlx5_core] +cmd_status_err+0x1a0/0x1a0 [mlx5_core] +mlx5_cmd_exec_cb_handler+0x24/0x40 [mlx5_core] +mlx5_cmd_comp_handler+0x129/0x4b0 [mlx5_core] +cmd_comp_notifier+0x1a/0x20 [mlx5_core] +notifier_call_chain+0x3e/0xe0 +atomic_notifier_call_chain+0x5f/0x130 +mlx5_eq_async_int+0xe7/0x200 [mlx5_core] +notifier_call_chain+0x3e/0xe0 +atomic_notifier_call_chain+0x5f/0x130 +irq_int_handler+0x11/0x20 [mlx5_core] +__handle_irq_event_percpu+0x99/0x220 +? tick_irq_enter+0x5d/0x80 +handle_irq_event_percpu+0xf/0x40 +handle_irq_event+0x3a/0x60 +handle_edge_irq+0xa2/0x1c0 +__common_interrupt+0x55/0x140 +common_interrupt+0x7d/0xa0 + + +asm_common_interrupt+0x22/0x40 +RIP: 0010:default_idle+0x13/0x20 +Code: c0 08 00 00 00 4d 29 c8 4c 01 c7 4c 29 c2 e9 72 ff +ff ff cc cc cc cc 8b 05 ea 08 25 01 85 c0 7e 07 0f 00 2d 7f b0 26 00 fb +f4 c3 90 66 2e 0f 1f 84 00 00 00 00 00 65 48 8b 04 25 80 d0 02 00 +RSP: 0018:ffffc9000010fec8 EFLAGS: 00000242 +RAX: 0000000000000001 RBX: 000000000000000f RCX: 4000000000000000 +RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff811c410c +RBP: ffffffff829478c0 R08: 0000000000000001 R09: 0000000000000001 +R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 +R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 +? 
do_idle+0x1ec/0x210 +default_idle_call+0x6c/0x90 +do_idle+0x1ec/0x210 +cpu_startup_entry+0x26/0x30 +start_secondary+0x11b/0x150 +secondary_startup_64_no_verify+0x165/0x16b + +irq event stamp: 833284 +hardirqs last enabled at (833283): [] +do_idle+0x1ec/0x210 +hardirqs last disabled at (833284): [] +common_interrupt+0xf/0xa0 +softirqs last enabled at (833224): [] +__do_softirq+0x2bf/0x40e +softirqs last disabled at (833177): [] +irq_exit_rcu+0x7f/0xa0 + +Fixes: 34f46ae0d4b3 ("net/mlx5: Add command failures data to debugfs") +Signed-off-by: Yishai Hadas +Reviewed-by: Moshe Shemesh +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +index 7013e1c8741a..55efb932ab2c 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +@@ -1921,6 +1921,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status, + { + const char *namep = mlx5_command_str(opcode); + struct mlx5_cmd_stats *stats; ++ unsigned long flags; + + if (!err || !(strcmp(namep, "unknown command opcode"))) + return; +@@ -1928,7 +1929,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status, + stats = xa_load(&dev->cmd.stats, opcode); + if (!stats) + return; +- spin_lock_irq(&stats->lock); ++ spin_lock_irqsave(&stats->lock, flags); + stats->failed++; + if (err < 0) + stats->last_failed_errno = -err; +@@ -1937,7 +1938,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status, + stats->last_failed_mbox_status = status; + stats->last_failed_syndrome = syndrome; + } +- spin_unlock_irq(&stats->lock); ++ spin_unlock_irqrestore(&stats->lock, flags); + } + + /* preserve -EREMOTEIO for outbox.status != OK, otherwise return err as is */ +-- +2.43.0 + diff --git 
a/queue-6.6/net-mlx5-use-mlx5-device-constant-for-selecting-cq-p.patch b/queue-6.6/net-mlx5-use-mlx5-device-constant-for-selecting-cq-p.patch new file mode 100644 index 00000000000..8df4369bb66 --- /dev/null +++ b/queue-6.6/net-mlx5-use-mlx5-device-constant-for-selecting-cq-p.patch @@ -0,0 +1,39 @@ +From 5a6c2772e477fb69792c2f463d174d6e3960cb54 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 28 Nov 2023 14:01:54 -0800 +Subject: net/mlx5: Use mlx5 device constant for selecting CQ period mode for + ASO + +From: Rahul Rameshbabu + +[ Upstream commit 20cbf8cbb827094197f3b17db60d71449415db1e ] + +mlx5 devices have specific constants for choosing the CQ period mode. These +constants do not have to match the constants used by the kernel software +API for DIM period mode selection. + +Fixes: cdd04f4d4d71 ("net/mlx5: Add support to create SQ and CQ for ASO") +Signed-off-by: Rahul Rameshbabu +Reviewed-by: Jianbo Liu +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c +index 40c7be124041..58bd749b5e4d 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c +@@ -98,7 +98,7 @@ static int create_aso_cq(struct mlx5_aso_cq *cq, void *cqc_data) + mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, + (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas)); + +- MLX5_SET(cqc, cqc, cq_period_mode, DIM_CQ_PERIOD_MODE_START_FROM_EQE); ++ MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); + MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - +-- +2.43.0 + diff --git a/queue-6.6/net-mlx5e-allow-software-parsing-when-ipsec-crypto-i.patch 
b/queue-6.6/net-mlx5e-allow-software-parsing-when-ipsec-crypto-i.patch new file mode 100644 index 00000000000..86dbe341fb1 --- /dev/null +++ b/queue-6.6/net-mlx5e-allow-software-parsing-when-ipsec-crypto-i.patch @@ -0,0 +1,39 @@ +From 7c90bd2ab1fafff65de6ffe9d396b9d712ecda24 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Dec 2023 13:52:55 +0200 +Subject: net/mlx5e: Allow software parsing when IPsec crypto is enabled + +From: Leon Romanovsky + +[ Upstream commit 20f5468a7988dedd94a57ba8acd65ebda6a59723 ] + +All ConnectX devices have software parsing capability enabled, but it is +more correct to set allow_swp only if capability exists, which for IPsec +means that crypto offload is supported. + +Fixes: 2451da081a34 ("net/mlx5: Unify device IPsec capabilities check") +Signed-off-by: Leon Romanovsky +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/en/params.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +index e097f336e1c4..30507b7c2fb1 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +@@ -1062,8 +1062,8 @@ void mlx5e_build_sq_param(struct mlx5_core_dev *mdev, + void *wq = MLX5_ADDR_OF(sqc, sqc, wq); + bool allow_swp; + +- allow_swp = +- mlx5_geneve_tx_allowed(mdev) || !!mlx5_ipsec_device_caps(mdev); ++ allow_swp = mlx5_geneve_tx_allowed(mdev) || ++ (mlx5_ipsec_device_caps(mdev) & MLX5_IPSEC_CAP_CRYPTO); + mlx5e_build_sq_param_common(mdev, param); + MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size); + MLX5_SET(sqc, sqc, allow_swp, allow_swp); +-- +2.43.0 + diff --git a/queue-6.6/net-mlx5e-fix-a-double-free-in-arfs_create_groups.patch b/queue-6.6/net-mlx5e-fix-a-double-free-in-arfs_create_groups.patch new file mode 100644 index 00000000000..eabb1752b40 --- /dev/null +++ 
b/queue-6.6/net-mlx5e-fix-a-double-free-in-arfs_create_groups.patch @@ -0,0 +1,100 @@ +From a999382790f95d77671c82459ded8a59f3b43b45 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Jan 2024 15:17:36 +0800 +Subject: net/mlx5e: fix a double-free in arfs_create_groups + +From: Zhipeng Lu + +[ Upstream commit 3c6d5189246f590e4e1f167991558bdb72a4738b ] + +When `in` allocated by kvzalloc fails, arfs_create_groups will free +ft->g and return an error. However, arfs_create_table, the only caller of +arfs_create_groups, will hold this error and call to +mlx5e_destroy_flow_table, in which the ft->g will be freed again. + +Fixes: 1cabe6b0965e ("net/mlx5e: Create aRFS flow tables") +Signed-off-by: Zhipeng Lu +Reviewed-by: Simon Horman +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + .../net/ethernet/mellanox/mlx5/core/en_arfs.c | 26 +++++++++++-------- + 1 file changed, 15 insertions(+), 11 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +index bb7f86c993e5..e66f486faafe 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +@@ -254,11 +254,13 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft, + + ft->g = kcalloc(MLX5E_ARFS_NUM_GROUPS, + sizeof(*ft->g), GFP_KERNEL); +- in = kvzalloc(inlen, GFP_KERNEL); +- if (!in || !ft->g) { +- kfree(ft->g); +- kvfree(in); ++ if (!ft->g) + return -ENOMEM; ++ ++ in = kvzalloc(inlen, GFP_KERNEL); ++ if (!in) { ++ err = -ENOMEM; ++ goto err_free_g; + } + + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); +@@ -278,7 +280,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft, + break; + default: + err = -EINVAL; +- goto out; ++ goto err_free_in; + } + + switch (type) { +@@ -300,7 +302,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft, + break; + default: + err = -EINVAL; +- goto out; ++ goto err_free_in; + } + + MLX5_SET_CFG(in, 
match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); +@@ -309,7 +311,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft, + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) +- goto err; ++ goto err_clean_group; + ft->num_groups++; + + memset(in, 0, inlen); +@@ -318,18 +320,20 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft, + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) +- goto err; ++ goto err_clean_group; + ft->num_groups++; + + kvfree(in); + return 0; + +-err: ++err_clean_group: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; +-out: ++err_free_in: + kvfree(in); +- ++err_free_g: ++ kfree(ft->g); ++ ft->g = NULL; + return err; + } + +-- +2.43.0 + diff --git a/queue-6.6/net-mlx5e-fix-a-potential-double-free-in-fs_any_crea.patch b/queue-6.6/net-mlx5e-fix-a-potential-double-free-in-fs_any_crea.patch new file mode 100644 index 00000000000..dcab6777b53 --- /dev/null +++ b/queue-6.6/net-mlx5e-fix-a-potential-double-free-in-fs_any_crea.patch @@ -0,0 +1,40 @@ +From d7ccda1acf1057d69981953e66087f55dc0daf9e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 28 Nov 2023 17:29:01 +0800 +Subject: net/mlx5e: fix a potential double-free in fs_any_create_groups + +From: Dinghao Liu + +[ Upstream commit aef855df7e1bbd5aa4484851561211500b22707e ] + +When kcalloc() for ft->g succeeds but kvzalloc() for in fails, +fs_any_create_groups() will free ft->g. However, its caller +fs_any_create_table() will free ft->g again through calling +mlx5e_destroy_flow_table(), which will lead to a double-free. +Fix this by setting ft->g to NULL in fs_any_create_groups(). 
+ +Fixes: 0f575c20bf06 ("net/mlx5e: Introduce Flow Steering ANY API") +Signed-off-by: Dinghao Liu +Reviewed-by: Tariq Toukan +Reviewed-by: Simon Horman +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c b/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c +index e1283531e0b8..671adbad0a40 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c +@@ -436,6 +436,7 @@ static int fs_any_create_groups(struct mlx5e_flow_table *ft) + in = kvzalloc(inlen, GFP_KERNEL); + if (!in || !ft->g) { + kfree(ft->g); ++ ft->g = NULL; + kvfree(in); + return -ENOMEM; + } +-- +2.43.0 + diff --git a/queue-6.6/net-mlx5e-fix-operation-precedence-bug-in-port-times.patch b/queue-6.6/net-mlx5e-fix-operation-precedence-bug-in-port-times.patch new file mode 100644 index 00000000000..3aa24f8827f --- /dev/null +++ b/queue-6.6/net-mlx5e-fix-operation-precedence-bug-in-port-times.patch @@ -0,0 +1,41 @@ +From 7393f91797c248e3fd72d418ad4692e02bba7058 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 22 Nov 2023 18:32:11 -0800 +Subject: net/mlx5e: Fix operation precedence bug in port timestamping + napi_poll context + +From: Rahul Rameshbabu + +[ Upstream commit 3876638b2c7ebb2c9d181de1191db0de8cac143a ] + +Indirection (*) is of lower precedence than postfix increment (++). Logic +in napi_poll context would cause an out-of-bound read by first increment +the pointer address by byte address space and then dereference the value. +Rather, the intended logic was to dereference first and then increment the +underlying value. 
+ +Fixes: 92214be5979c ("net/mlx5e: Update doorbell for port timestamping CQ before the software counter") +Signed-off-by: Rahul Rameshbabu +Reviewed-by: Tariq Toukan +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +index af3928eddafd..803035d4e597 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +@@ -213,7 +213,7 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, + mlx5e_ptpsq_mark_ts_cqes_undelivered(ptpsq, hwtstamp); + out: + napi_consume_skb(skb, budget); +- md_buff[*md_buff_sz++] = metadata_id; ++ md_buff[(*md_buff_sz)++] = metadata_id; + if (unlikely(mlx5e_ptp_metadata_map_unhealthy(&ptpsq->metadata_map)) && + !test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) + queue_work(ptpsq->txqsq.priv->wq, &ptpsq->report_unhealthy_work); +-- +2.43.0 + diff --git a/queue-6.6/net-mlx5e-fix-peer-flow-lists-handling.patch b/queue-6.6/net-mlx5e-fix-peer-flow-lists-handling.patch new file mode 100644 index 00000000000..6090d43a50a --- /dev/null +++ b/queue-6.6/net-mlx5e-fix-peer-flow-lists-handling.patch @@ -0,0 +1,126 @@ +From fc863a43647a9252fc4207aae912d7822ced707d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 10 Nov 2023 11:10:22 +0100 +Subject: net/mlx5e: Fix peer flow lists handling + +From: Vlad Buslov + +[ Upstream commit d76fdd31f953ac5046555171620f2562715e9b71 ] + +The cited change refactored mlx5e_tc_del_fdb_peer_flow() to only clear DUP +flag when list of peer flows has become empty. 
However, if any concurrent +user holds a reference to a peer flow (for example, the neighbor update +workqueue task is updating peer flow's parent encap entry concurrently), +then the flow will not be removed from the peer list and, consequently, +DUP flag will remain set. Since mlx5e_tc_del_fdb_peers_flow() calls +mlx5e_tc_del_fdb_peer_flow() for every possible peer index the algorithm +will try to remove the flow from eswitch instances that it has never peered +with causing either NULL pointer dereference when trying to remove the flow +peer list head of peer_index that was never initialized or a warning if the +list debug config is enabled[0]. + +Fix the issue by always removing the peer flow from the list even when not +releasing the last reference to it. + +[0]: + +[ 3102.985806] ------------[ cut here ]------------ +[ 3102.986223] list_del corruption, ffff888139110698->next is NULL +[ 3102.986757] WARNING: CPU: 2 PID: 22109 at lib/list_debug.c:53 __list_del_entry_valid_or_report+0x4f/0xc0 +[ 3102.987561] Modules linked in: act_ct nf_flow_table bonding act_tunnel_key act_mirred act_skbedit vxlan cls_matchall nfnetlink_cttimeout act_gact cls_flower sch_ingress mlx5_vdpa vringh vhost_iotlb vdpa openvswitch nsh xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcg +ss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5_core [last unloaded: bonding] +[ 3102.991113] CPU: 2 PID: 22109 Comm: revalidator28 Not tainted 6.6.0-rc6+ #3 +[ 3102.991695] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 +[ 3102.992605] RIP: 0010:__list_del_entry_valid_or_report+0x4f/0xc0 +[ 3102.993122] Code: 39 c2 74 56 48 8b 32 48 39 fe 75 62 48 8b 51 08 48 39 f2 75 73 b8 01 00 00 00 c3 48 89 fe 48 c7 c7 48 fd 0a 82 e8 41 0b ad ff <0f> 0b 31 c0 c3 48 89 fe 48 c7 
c7 70 fd 0a 82 e8 2d 0b ad ff 0f 0b +[ 3102.994615] RSP: 0018:ffff8881383e7710 EFLAGS: 00010286 +[ 3102.995078] RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000000 +[ 3102.995670] RDX: 0000000000000001 RSI: ffff88885f89b640 RDI: ffff88885f89b640 +[ 3102.997188] DEL flow 00000000be367878 on port 0 +[ 3102.998594] RBP: dead000000000122 R08: 0000000000000000 R09: c0000000ffffdfff +[ 3102.999604] R10: 0000000000000008 R11: ffff8881383e7598 R12: dead000000000100 +[ 3103.000198] R13: 0000000000000002 R14: ffff888139110000 R15: ffff888101901240 +[ 3103.000790] FS: 00007f424cde4700(0000) GS:ffff88885f880000(0000) knlGS:0000000000000000 +[ 3103.001486] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 3103.001986] CR2: 00007fd42e8dcb70 CR3: 000000011e68a003 CR4: 0000000000370ea0 +[ 3103.002596] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[ 3103.003190] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +[ 3103.003787] Call Trace: +[ 3103.004055] +[ 3103.004297] ? __warn+0x7d/0x130 +[ 3103.004623] ? __list_del_entry_valid_or_report+0x4f/0xc0 +[ 3103.005094] ? report_bug+0xf1/0x1c0 +[ 3103.005439] ? console_unlock+0x4a/0xd0 +[ 3103.005806] ? handle_bug+0x3f/0x70 +[ 3103.006149] ? exc_invalid_op+0x13/0x60 +[ 3103.006531] ? asm_exc_invalid_op+0x16/0x20 +[ 3103.007430] ? __list_del_entry_valid_or_report+0x4f/0xc0 +[ 3103.007910] mlx5e_tc_del_fdb_peers_flow+0xcf/0x240 [mlx5_core] +[ 3103.008463] mlx5e_tc_del_flow+0x46/0x270 [mlx5_core] +[ 3103.008944] mlx5e_flow_put+0x26/0x50 [mlx5_core] +[ 3103.009401] mlx5e_delete_flower+0x25f/0x380 [mlx5_core] +[ 3103.009901] tc_setup_cb_destroy+0xab/0x180 +[ 3103.010292] fl_hw_destroy_filter+0x99/0xc0 [cls_flower] +[ 3103.010779] __fl_delete+0x2d4/0x2f0 [cls_flower] +[ 3103.011207] fl_delete+0x36/0x80 [cls_flower] +[ 3103.011614] tc_del_tfilter+0x56f/0x750 +[ 3103.011982] rtnetlink_rcv_msg+0xff/0x3a0 +[ 3103.012362] ? netlink_ack+0x1c7/0x4e0 +[ 3103.012719] ? 
rtnl_calcit.isra.44+0x130/0x130 +[ 3103.013134] netlink_rcv_skb+0x54/0x100 +[ 3103.013533] netlink_unicast+0x1ca/0x2b0 +[ 3103.013902] netlink_sendmsg+0x361/0x4d0 +[ 3103.014269] __sock_sendmsg+0x38/0x60 +[ 3103.014643] ____sys_sendmsg+0x1f2/0x200 +[ 3103.015018] ? copy_msghdr_from_user+0x72/0xa0 +[ 3103.015265] ___sys_sendmsg+0x87/0xd0 +[ 3103.016608] ? copy_msghdr_from_user+0x72/0xa0 +[ 3103.017014] ? ___sys_recvmsg+0x9b/0xd0 +[ 3103.017381] ? ttwu_do_activate.isra.137+0x58/0x180 +[ 3103.017821] ? wake_up_q+0x49/0x90 +[ 3103.018157] ? futex_wake+0x137/0x160 +[ 3103.018521] ? __sys_sendmsg+0x51/0x90 +[ 3103.018882] __sys_sendmsg+0x51/0x90 +[ 3103.019230] ? exit_to_user_mode_prepare+0x56/0x130 +[ 3103.019670] do_syscall_64+0x3c/0x80 +[ 3103.020017] entry_SYSCALL_64_after_hwframe+0x46/0xb0 +[ 3103.020469] RIP: 0033:0x7f4254811ef4 +[ 3103.020816] Code: 89 f3 48 83 ec 10 48 89 7c 24 08 48 89 14 24 e8 42 eb ff ff 48 8b 14 24 41 89 c0 48 89 de 48 8b 7c 24 08 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 30 44 89 c7 48 89 04 24 e8 78 eb ff ff 48 8b +[ 3103.022290] RSP: 002b:00007f424cdd9480 EFLAGS: 00000293 ORIG_RAX: 000000000000002e +[ 3103.022970] RAX: ffffffffffffffda RBX: 00007f424cdd9510 RCX: 00007f4254811ef4 +[ 3103.023564] RDX: 0000000000000000 RSI: 00007f424cdd9510 RDI: 0000000000000012 +[ 3103.024158] RBP: 00007f424cdda238 R08: 0000000000000000 R09: 00007f41d801a4b0 +[ 3103.024748] R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000001 +[ 3103.025341] R13: 00007f424cdd9510 R14: 00007f424cdda240 R15: 00007f424cdd99a0 +[ 3103.025931] +[ 3103.026182] ---[ end trace 0000000000000000 ]--- +[ 3103.027033] ------------[ cut here ]------------ + +Fixes: 9be6c21fdcf8 ("net/mlx5e: Handle offloads flows per peer") +Signed-off-by: Vlad Buslov +Reviewed-by: Mark Bloch +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +index 25e44ee5121a..dc9b157a4499 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +@@ -2012,9 +2012,10 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow, + list_for_each_entry_safe(peer_flow, tmp, &flow->peer_flows, peer_flows) { + if (peer_index != mlx5_get_dev_index(peer_flow->priv->mdev)) + continue; ++ ++ list_del(&peer_flow->peer_flows); + if (refcount_dec_and_test(&peer_flow->refcnt)) { + mlx5e_tc_del_fdb_flow(peer_flow->priv, peer_flow); +- list_del(&peer_flow->peer_flows); + kfree(peer_flow); + } + } +-- +2.43.0 + diff --git a/queue-6.6/net-mlx5e-ignore-ipsec-replay-window-values-on-sende.patch b/queue-6.6/net-mlx5e-ignore-ipsec-replay-window-values-on-sende.patch new file mode 100644 index 00000000000..f5cf8835490 --- /dev/null +++ b/queue-6.6/net-mlx5e-ignore-ipsec-replay-window-values-on-sende.patch @@ -0,0 +1,68 @@ +From fe6083067bcb4589a237bb79bfe1c9821384c328 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 26 Nov 2023 11:08:10 +0200 +Subject: net/mlx5e: Ignore IPsec replay window values on sender side + +From: Leon Romanovsky + +[ Upstream commit 315a597f9bcfe7fe9980985031413457bee95510 ] + +XFRM stack doesn't prevent from users to configure replay window +in TX side and strongswan sets replay_window to be 1. It causes +to failures in validation logic when trying to offload the SA. + +Replay window is not relevant in TX side and should be ignored. 
+ +Fixes: cded6d80129b ("net/mlx5e: Store replay window in XFRM attributes") +Signed-off-by: Aya Levin +Signed-off-by: Leon Romanovsky +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + .../net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +index 5834e47e72d8..e2ffc572de18 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +@@ -336,12 +336,17 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry, + /* iv len */ + aes_gcm->icv_len = x->aead->alg_icv_len; + ++ attrs->dir = x->xso.dir; ++ + /* esn */ + if (x->props.flags & XFRM_STATE_ESN) { + attrs->replay_esn.trigger = true; + attrs->replay_esn.esn = sa_entry->esn_state.esn; + attrs->replay_esn.esn_msb = sa_entry->esn_state.esn_msb; + attrs->replay_esn.overlap = sa_entry->esn_state.overlap; ++ if (attrs->dir == XFRM_DEV_OFFLOAD_OUT) ++ goto skip_replay_window; ++ + switch (x->replay_esn->replay_window) { + case 32: + attrs->replay_esn.replay_window = +@@ -365,7 +370,7 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry, + } + } + +- attrs->dir = x->xso.dir; ++skip_replay_window: + /* spi */ + attrs->spi = be32_to_cpu(x->id.spi); + +@@ -501,7 +506,8 @@ static int mlx5e_xfrm_validate_state(struct mlx5_core_dev *mdev, + return -EINVAL; + } + +- if (x->replay_esn && x->replay_esn->replay_window != 32 && ++ if (x->replay_esn && x->xso.dir == XFRM_DEV_OFFLOAD_IN && ++ x->replay_esn->replay_window != 32 && + x->replay_esn->replay_window != 64 && + x->replay_esn->replay_window != 128 && + x->replay_esn->replay_window != 256) { +-- +2.43.0 + diff --git a/queue-6.6/net-mvpp2-clear-bm-pool-before-initialization.patch b/queue-6.6/net-mvpp2-clear-bm-pool-before-initialization.patch 
new file mode 100644 index 00000000000..43fb96db9a6 --- /dev/null +++ b/queue-6.6/net-mvpp2-clear-bm-pool-before-initialization.patch @@ -0,0 +1,77 @@ +From fd80834d784fc993055f1ff5e1194a7e02c0b83a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 19:59:14 -0800 +Subject: net: mvpp2: clear BM pool before initialization + +From: Jenishkumar Maheshbhai Patel + +[ Upstream commit 9f538b415db862e74b8c5d3abbccfc1b2b6caa38 ] + +Register value persist after booting the kernel using +kexec which results in kernel panic. Thus clear the +BM pool registers before initialisation to fix the issue. + +Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") +Signed-off-by: Jenishkumar Maheshbhai Patel +Reviewed-by: Maxime Chevallier +Link: https://lore.kernel.org/r/20240119035914.2595665-1-jpatel2@marvell.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 27 ++++++++++++++++++- + 1 file changed, 26 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +index 21c3f9b015c8..aca17082b9ec 100644 +--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c ++++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +@@ -614,12 +614,38 @@ static void mvpp23_bm_set_8pool_mode(struct mvpp2 *priv) + mvpp2_write(priv, MVPP22_BM_POOL_BASE_ADDR_HIGH_REG, val); + } + ++/* Cleanup pool before actual initialization in the OS */ ++static void mvpp2_bm_pool_cleanup(struct mvpp2 *priv, int pool_id) ++{ ++ unsigned int thread = mvpp2_cpu_to_thread(priv, get_cpu()); ++ u32 val; ++ int i; ++ ++ /* Drain the BM from all possible residues left by firmware */ ++ for (i = 0; i < MVPP2_BM_POOL_SIZE_MAX; i++) ++ mvpp2_thread_read(priv, thread, MVPP2_BM_PHY_ALLOC_REG(pool_id)); ++ ++ put_cpu(); ++ ++ /* Stop the BM pool */ ++ val = mvpp2_read(priv, MVPP2_BM_POOL_CTRL_REG(pool_id)); ++ val |= MVPP2_BM_STOP_MASK; ++ 
mvpp2_write(priv, MVPP2_BM_POOL_CTRL_REG(pool_id), val); ++} ++ + static int mvpp2_bm_init(struct device *dev, struct mvpp2 *priv) + { + enum dma_data_direction dma_dir = DMA_FROM_DEVICE; + int i, err, poolnum = MVPP2_BM_POOLS_NUM; + struct mvpp2_port *port; + ++ if (priv->percpu_pools) ++ poolnum = mvpp2_get_nrxqs(priv) * 2; ++ ++ /* Clean up the pool state in case it contains stale state */ ++ for (i = 0; i < poolnum; i++) ++ mvpp2_bm_pool_cleanup(priv, i); ++ + if (priv->percpu_pools) { + for (i = 0; i < priv->port_count; i++) { + port = priv->port_list[i]; +@@ -629,7 +655,6 @@ static int mvpp2_bm_init(struct device *dev, struct mvpp2 *priv) + } + } + +- poolnum = mvpp2_get_nrxqs(priv) * 2; + for (i = 0; i < poolnum; i++) { + /* the pool in use */ + int pn = i / (poolnum / 2); +-- +2.43.0 + diff --git a/queue-6.6/net-rds-fix-ubsan-array-index-out-of-bounds-in-rds_c.patch b/queue-6.6/net-rds-fix-ubsan-array-index-out-of-bounds-in-rds_c.patch new file mode 100644 index 00000000000..2bb1bc3b099 --- /dev/null +++ b/queue-6.6/net-rds-fix-ubsan-array-index-out-of-bounds-in-rds_c.patch @@ -0,0 +1,71 @@ +From a767ed016555274d24d06e30826c54ba74e0c0a8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 17:48:39 -0800 +Subject: net/rds: Fix UBSAN: array-index-out-of-bounds in rds_cmsg_recv + +From: Sharath Srinivasan + +[ Upstream commit 13e788deb7348cc88df34bed736c3b3b9927ea52 ] + +Syzcaller UBSAN crash occurs in rds_cmsg_recv(), +which reads inc->i_rx_lat_trace[j + 1] with index 4 (3 + 1), +but with array size of 4 (RDS_RX_MAX_TRACES). +Here 'j' is assigned from rs->rs_rx_trace[i] and in-turn from +trace.rx_trace_pos[i] in rds_recv_track_latency(), +with both arrays sized 3 (RDS_MSG_RX_DGRAM_TRACE_MAX). So fix the +off-by-one bounds check in rds_recv_track_latency() to prevent +a potential crash in rds_cmsg_recv(). 
+ +Found by syzcaller: +================================================================= +UBSAN: array-index-out-of-bounds in net/rds/recv.c:585:39 +index 4 is out of range for type 'u64 [4]' +CPU: 1 PID: 8058 Comm: syz-executor228 Not tainted 6.6.0-gd2f51b3516da #1 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), +BIOS 1.15.0-1 04/01/2014 +Call Trace: + + __dump_stack lib/dump_stack.c:88 [inline] + dump_stack_lvl+0x136/0x150 lib/dump_stack.c:106 + ubsan_epilogue lib/ubsan.c:217 [inline] + __ubsan_handle_out_of_bounds+0xd5/0x130 lib/ubsan.c:348 + rds_cmsg_recv+0x60d/0x700 net/rds/recv.c:585 + rds_recvmsg+0x3fb/0x1610 net/rds/recv.c:716 + sock_recvmsg_nosec net/socket.c:1044 [inline] + sock_recvmsg+0xe2/0x160 net/socket.c:1066 + __sys_recvfrom+0x1b6/0x2f0 net/socket.c:2246 + __do_sys_recvfrom net/socket.c:2264 [inline] + __se_sys_recvfrom net/socket.c:2260 [inline] + __x64_sys_recvfrom+0xe0/0x1b0 net/socket.c:2260 + do_syscall_x64 arch/x86/entry/common.c:51 [inline] + do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82 + entry_SYSCALL_64_after_hwframe+0x63/0x6b +================================================================== + +Fixes: 3289025aedc0 ("RDS: add receive message trace used by application") +Reported-by: Chenyuan Yang +Closes: https://lore.kernel.org/linux-rdma/CALGdzuoVdq-wtQ4Az9iottBqC5cv9ZhcE5q8N7LfYFvkRsOVcw@mail.gmail.com/ +Signed-off-by: Sharath Srinivasan +Reviewed-by: Simon Horman +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + net/rds/af_rds.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c +index 01c4cdfef45d..8435a20968ef 100644 +--- a/net/rds/af_rds.c ++++ b/net/rds/af_rds.c +@@ -419,7 +419,7 @@ static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval, + + rs->rs_rx_traces = trace.rx_traces; + for (i = 0; i < rs->rs_rx_traces; i++) { +- if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) { ++ if (trace.rx_trace_pos[i] >= RDS_MSG_RX_DGRAM_TRACE_MAX) { + rs->rs_rx_traces = 0; + return -EFAULT; + } +-- +2.43.0 + diff --git a/queue-6.6/net-sched-flower-fix-chain-template-offload.patch b/queue-6.6/net-sched-flower-fix-chain-template-offload.patch new file mode 100644 index 00000000000..43250e920ff --- /dev/null +++ b/queue-6.6/net-sched-flower-fix-chain-template-offload.patch @@ -0,0 +1,190 @@ +From d67e18be087db2c13dd8a5330ae3b06720b9b591 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 15:28:43 +0200 +Subject: net/sched: flower: Fix chain template offload + +From: Ido Schimmel + +[ Upstream commit 32f2a0afa95fae0d1ceec2ff06e0e816939964b8 ] + +When a qdisc is deleted from a net device the stack instructs the +underlying driver to remove its flow offload callback from the +associated filter block using the 'FLOW_BLOCK_UNBIND' command. The stack +then continues to replay the removal of the filters in the block for +this driver by iterating over the chains in the block and invoking the +'reoffload' operation of the classifier being used. In turn, the +classifier in its 'reoffload' operation prepares and emits a +'FLOW_CLS_DESTROY' command for each filter. + +However, the stack does not do the same for chain templates and the +underlying driver never receives a 'FLOW_CLS_TMPLT_DESTROY' command when +a qdisc is deleted. This results in a memory leak [1] which can be +reproduced using [2]. 
+ +Fix by introducing a 'tmplt_reoffload' operation and have the stack +invoke it with the appropriate arguments as part of the replay. +Implement the operation in the sole classifier that supports chain +templates (flower) by emitting the 'FLOW_CLS_TMPLT_{CREATE,DESTROY}' +command based on whether a flow offload callback is being bound to a +filter block or being unbound from one. + +As far as I can tell, the issue happens since cited commit which +reordered tcf_block_offload_unbind() before tcf_block_flush_all_chains() +in __tcf_block_put(). The order cannot be reversed as the filter block +is expected to be freed after flushing all the chains. + +[1] +unreferenced object 0xffff888107e28800 (size 2048): + comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s) + hex dump (first 32 bytes): + b1 a6 7c 11 81 88 ff ff e0 5b b3 10 81 88 ff ff ..|......[...... + 01 00 00 00 00 00 00 00 e0 aa b0 84 ff ff ff ff ................ + backtrace: + [] __kmem_cache_alloc_node+0x1e8/0x320 + [] __kmalloc+0x4e/0x90 + [] mlxsw_sp_acl_ruleset_get+0x34d/0x7a0 + [] mlxsw_sp_flower_tmplt_create+0x145/0x180 + [] mlxsw_sp_flow_block_cb+0x1ea/0x280 + [] tc_setup_cb_call+0x183/0x340 + [] fl_tmplt_create+0x3da/0x4c0 + [] tc_ctl_chain+0xa15/0x1170 + [] rtnetlink_rcv_msg+0x3cc/0xed0 + [] netlink_rcv_skb+0x170/0x440 + [] netlink_unicast+0x540/0x820 + [] netlink_sendmsg+0x8d8/0xda0 + [] ____sys_sendmsg+0x30f/0xa80 + [] ___sys_sendmsg+0x13a/0x1e0 + [] __sys_sendmsg+0x11c/0x1f0 + [] do_syscall_64+0x40/0xe0 +unreferenced object 0xffff88816d2c0400 (size 1024): + comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s) + hex dump (first 32 bytes): + 40 00 00 00 00 00 00 00 57 f6 38 be 00 00 00 00 @.......W.8..... + 10 04 2c 6d 81 88 ff ff 10 04 2c 6d 81 88 ff ff ..,m......,m.... 
+ backtrace: + [] __kmem_cache_alloc_node+0x1e8/0x320 + [] __kmalloc_node+0x51/0x90 + [] kvmalloc_node+0xa6/0x1f0 + [] bucket_table_alloc.isra.0+0x83/0x460 + [] rhashtable_init+0x43b/0x7c0 + [] mlxsw_sp_acl_ruleset_get+0x428/0x7a0 + [] mlxsw_sp_flower_tmplt_create+0x145/0x180 + [] mlxsw_sp_flow_block_cb+0x1ea/0x280 + [] tc_setup_cb_call+0x183/0x340 + [] fl_tmplt_create+0x3da/0x4c0 + [] tc_ctl_chain+0xa15/0x1170 + [] rtnetlink_rcv_msg+0x3cc/0xed0 + [] netlink_rcv_skb+0x170/0x440 + [] netlink_unicast+0x540/0x820 + [] netlink_sendmsg+0x8d8/0xda0 + [] ____sys_sendmsg+0x30f/0xa80 + +[2] + # tc qdisc add dev swp1 clsact + # tc chain add dev swp1 ingress proto ip chain 1 flower dst_ip 0.0.0.0/32 + # tc qdisc del dev swp1 clsact + # devlink dev reload pci/0000:06:00.0 + +Fixes: bbf73830cd48 ("net: sched: traverse chains in block with tcf_get_next_chain()") +Signed-off-by: Ido Schimmel +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + include/net/sch_generic.h | 4 ++++ + net/sched/cls_api.c | 9 ++++++++- + net/sched/cls_flower.c | 23 +++++++++++++++++++++++ + 3 files changed, 35 insertions(+), 1 deletion(-) + +diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h +index f232512505f8..e940debac400 100644 +--- a/include/net/sch_generic.h ++++ b/include/net/sch_generic.h +@@ -376,6 +376,10 @@ struct tcf_proto_ops { + struct nlattr **tca, + struct netlink_ext_ack *extack); + void (*tmplt_destroy)(void *tmplt_priv); ++ void (*tmplt_reoffload)(struct tcf_chain *chain, ++ bool add, ++ flow_setup_cb_t *cb, ++ void *cb_priv); + struct tcf_exts * (*get_exts)(const struct tcf_proto *tp, + u32 handle); + +diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c +index a193cc7b3241..84e18b5f72a3 100644 +--- a/net/sched/cls_api.c ++++ b/net/sched/cls_api.c +@@ -1536,6 +1536,9 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb, + chain_prev = chain, + chain = __tcf_get_next_chain(block, chain), + tcf_chain_put(chain_prev)) { ++ if 
(chain->tmplt_ops && add) ++ chain->tmplt_ops->tmplt_reoffload(chain, true, cb, ++ cb_priv); + for (tp = __tcf_get_next_proto(chain, NULL); tp; + tp_prev = tp, + tp = __tcf_get_next_proto(chain, tp), +@@ -1551,6 +1554,9 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb, + goto err_playback_remove; + } + } ++ if (chain->tmplt_ops && !add) ++ chain->tmplt_ops->tmplt_reoffload(chain, false, cb, ++ cb_priv); + } + + return 0; +@@ -2950,7 +2956,8 @@ static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net, + ops = tcf_proto_lookup_ops(name, true, extack); + if (IS_ERR(ops)) + return PTR_ERR(ops); +- if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) { ++ if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump || ++ !ops->tmplt_reoffload) { + NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier"); + module_put(ops->owner); + return -EOPNOTSUPP; +diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c +index e5314a31f75a..efb9d2811b73 100644 +--- a/net/sched/cls_flower.c ++++ b/net/sched/cls_flower.c +@@ -2721,6 +2721,28 @@ static void fl_tmplt_destroy(void *tmplt_priv) + kfree(tmplt); + } + ++static void fl_tmplt_reoffload(struct tcf_chain *chain, bool add, ++ flow_setup_cb_t *cb, void *cb_priv) ++{ ++ struct fl_flow_tmplt *tmplt = chain->tmplt_priv; ++ struct flow_cls_offload cls_flower = {}; ++ ++ cls_flower.rule = flow_rule_alloc(0); ++ if (!cls_flower.rule) ++ return; ++ ++ cls_flower.common.chain_index = chain->index; ++ cls_flower.command = add ? 
FLOW_CLS_TMPLT_CREATE : ++ FLOW_CLS_TMPLT_DESTROY; ++ cls_flower.cookie = (unsigned long) tmplt; ++ cls_flower.rule->match.dissector = &tmplt->dissector; ++ cls_flower.rule->match.mask = &tmplt->mask; ++ cls_flower.rule->match.key = &tmplt->dummy_key; ++ ++ cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv); ++ kfree(cls_flower.rule); ++} ++ + static int fl_dump_key_val(struct sk_buff *skb, + void *val, int val_type, + void *mask, int mask_type, int len) +@@ -3628,6 +3650,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { + .bind_class = fl_bind_class, + .tmplt_create = fl_tmplt_create, + .tmplt_destroy = fl_tmplt_destroy, ++ .tmplt_reoffload = fl_tmplt_reoffload, + .tmplt_dump = fl_tmplt_dump, + .get_exts = fl_get_exts, + .owner = THIS_MODULE, +-- +2.43.0 + diff --git a/queue-6.6/net-smc-fix-illegal-rmb_desc-access-in-smc-d-connect.patch b/queue-6.6/net-smc-fix-illegal-rmb_desc-access-in-smc-d-connect.patch new file mode 100644 index 00000000000..677e01666ff --- /dev/null +++ b/queue-6.6/net-smc-fix-illegal-rmb_desc-access-in-smc-d-connect.patch @@ -0,0 +1,87 @@ +From fff92e91ba328439aad1ad95a571f9d01628d9cd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 12:32:10 +0800 +Subject: net/smc: fix illegal rmb_desc access in SMC-D connection dump + +From: Wen Gu + +[ Upstream commit dbc153fd3c142909e564bb256da087e13fbf239c ] + +A crash was found when dumping SMC-D connections. It can be reproduced +by following steps: + +- run nginx/wrk test: + smc_run nginx + smc_run wrk -t 16 -c 1000 -d -H 'Connection: Close' + +- continuously dump SMC-D connections in parallel: + watch -n 1 'smcss -D' + + BUG: kernel NULL pointer dereference, address: 0000000000000030 + CPU: 2 PID: 7204 Comm: smcss Kdump: loaded Tainted: G E 6.7.0+ #55 + RIP: 0010:__smc_diag_dump.constprop.0+0x5e5/0x620 [smc_diag] + Call Trace: + + ? __die+0x24/0x70 + ? page_fault_oops+0x66/0x150 + ? exc_page_fault+0x69/0x140 + ? asm_exc_page_fault+0x26/0x30 + ? 
__smc_diag_dump.constprop.0+0x5e5/0x620 [smc_diag] + ? __kmalloc_node_track_caller+0x35d/0x430 + ? __alloc_skb+0x77/0x170 + smc_diag_dump_proto+0xd0/0xf0 [smc_diag] + smc_diag_dump+0x26/0x60 [smc_diag] + netlink_dump+0x19f/0x320 + __netlink_dump_start+0x1dc/0x300 + smc_diag_handler_dump+0x6a/0x80 [smc_diag] + ? __pfx_smc_diag_dump+0x10/0x10 [smc_diag] + sock_diag_rcv_msg+0x121/0x140 + ? __pfx_sock_diag_rcv_msg+0x10/0x10 + netlink_rcv_skb+0x5a/0x110 + sock_diag_rcv+0x28/0x40 + netlink_unicast+0x22a/0x330 + netlink_sendmsg+0x1f8/0x420 + __sock_sendmsg+0xb0/0xc0 + ____sys_sendmsg+0x24e/0x300 + ? copy_msghdr_from_user+0x62/0x80 + ___sys_sendmsg+0x7c/0xd0 + ? __do_fault+0x34/0x160 + ? do_read_fault+0x5f/0x100 + ? do_fault+0xb0/0x110 + ? __handle_mm_fault+0x2b0/0x6c0 + __sys_sendmsg+0x4d/0x80 + do_syscall_64+0x69/0x180 + entry_SYSCALL_64_after_hwframe+0x6e/0x76 + +It is possible that the connection is in process of being established +when we dump it. Assumed that the connection has been registered in a +link group by smc_conn_create() but the rmb_desc has not yet been +initialized by smc_buf_create(), thus causing the illegal access to +conn->rmb_desc. So fix it by checking before dump. + +Fixes: 4b1b7d3b30a6 ("net/smc: add SMC-D diag support") +Signed-off-by: Wen Gu +Reviewed-by: Dust Li +Reviewed-by: Wenjia Zhang +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + net/smc/smc_diag.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c +index 2c464d76b06c..37833b96b508 100644 +--- a/net/smc/smc_diag.c ++++ b/net/smc/smc_diag.c +@@ -163,7 +163,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, + } + if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd && + (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && +- !list_empty(&smc->conn.lgr->list)) { ++ !list_empty(&smc->conn.lgr->list) && smc->conn.rmb_desc) { + struct smc_connection *conn = &smc->conn; + struct smcd_diag_dmbinfo dinfo; + struct smcd_dev *smcd = conn->lgr->smcd; +-- +2.43.0 + diff --git a/queue-6.6/net-stmmac-wait-a-bit-for-the-reset-to-take-effect.patch b/queue-6.6/net-stmmac-wait-a-bit-for-the-reset-to-take-effect.patch new file mode 100644 index 00000000000..3e7263d0be0 --- /dev/null +++ b/queue-6.6/net-stmmac-wait-a-bit-for-the-reset-to-take-effect.patch @@ -0,0 +1,63 @@ +From 264fa9d0041aa2c4c7ce85b9734ad9883997d136 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 19:19:09 +0100 +Subject: net: stmmac: Wait a bit for the reset to take effect + +From: Bernd Edlinger + +[ Upstream commit a5f5eee282a0aae80227697e1d9c811b1726d31d ] + +otherwise the synopsys_id value may be read out wrong, +because the GMAC_VERSION register might still be in reset +state, for at least 1 us after the reset is de-asserted. + +Add a wait for 10 us before continuing to be on the safe side. + +> From what have you got that delay value? + +Just trial and error, with very old linux versions and old gcc versions +the synopsys_id was read out correctly most of the time (but not always), +with recent linux versions and recent gcc versions it was read out +wrongly most of the time, but again not always. 
+I don't have access to the VHDL code in question, so I cannot +tell why it takes so long to get the correct values, I also do not +have more than a few hardware samples, so I cannot tell how long +this timeout must be in worst case. +Experimentally I can tell that the register is read several times +as zero immediately after the reset is de-asserted, also adding several +no-ops is not enough, adding a printk is enough, also udelay(1) seems to +be enough but I tried that not very often, and I have not access to many +hardware samples to be 100% sure about the necessary delay. +And since the udelay here is only executed once per device instance, +it seems acceptable to delay the boot for 10 us. + +BTW: my hardware's synopsys id is 0x37. + +Fixes: c5e4ddbdfa11 ("net: stmmac: Add support for optional reset control") +Signed-off-by: Bernd Edlinger +Reviewed-by: Jiri Pirko +Reviewed-by: Serge Semin +Link: https://lore.kernel.org/r/AS8P193MB1285A810BD78C111E7F6AA34E4752@AS8P193MB1285.EURP193.PROD.OUTLOOK.COM +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +index 684ec7058c82..292857c0e601 100644 +--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c ++++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +@@ -7441,6 +7441,9 @@ int stmmac_dvr_probe(struct device *device, + dev_err(priv->device, "unable to bring out of ahb reset: %pe\n", + ERR_PTR(ret)); + ++ /* Wait a bit for the reset to take effect */ ++ udelay(10); ++ + /* Init MAC and get the capabilities */ + ret = stmmac_hw_init(priv); + if (ret) +-- +2.43.0 + diff --git a/queue-6.6/netfilter-nf_tables-restrict-anonymous-set-and-map-n.patch b/queue-6.6/netfilter-nf_tables-restrict-anonymous-set-and-map-n.patch new file mode 100644 index 00000000000..f0752c3e61b --- /dev/null +++ 
b/queue-6.6/netfilter-nf_tables-restrict-anonymous-set-and-map-n.patch @@ -0,0 +1,60 @@ +From 7c981514c167ba49ad7ee5dd97a3e0e67b119874 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 13:34:32 +0100 +Subject: netfilter: nf_tables: restrict anonymous set and map names to 16 + bytes + +From: Florian Westphal + +[ Upstream commit b462579b2b86a8f5230543cadd3a4836be27baf7 ] + +nftables has two types of sets/maps, one where userspace defines the +name, and anonymous sets/maps, where userspace defines a template name. + +For the latter, kernel requires presence of exactly one "%d". +nftables uses "__set%d" and "__map%d" for this. The kernel will +expand the format specifier and replaces it with the smallest unused +number. + +As-is, userspace could define a template name that allows to move +the set name past the 256 bytes upperlimit (post-expansion). + +I don't see how this could be a problem, but I would prefer if userspace +cannot do this, so add a limit of 16 bytes for the '%d' template name. + +16 bytes is the old total upper limit for set names that existed when +nf_tables was merged initially. 
+ +Fixes: 387454901bd6 ("netfilter: nf_tables: Allow set names of up to 255 chars") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_tables_api.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index b28fbcb86e94..bad58df478a7 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -24,6 +24,7 @@ + #include + + #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) ++#define NFT_SET_MAX_ANONLEN 16 + + unsigned int nf_tables_net_id __read_mostly; + +@@ -4351,6 +4352,9 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, + if (p[1] != 'd' || strchr(p + 2, '%')) + return -EINVAL; + ++ if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN) ++ return -EINVAL; ++ + inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL); + if (inuse == NULL) + return -ENOMEM; +-- +2.43.0 + diff --git a/queue-6.6/netfilter-nf_tables-validate-nfproto_-family.patch b/queue-6.6/netfilter-nf_tables-validate-nfproto_-family.patch new file mode 100644 index 00000000000..8006b23b7ef --- /dev/null +++ b/queue-6.6/netfilter-nf_tables-validate-nfproto_-family.patch @@ -0,0 +1,196 @@ +From b425af5f70851260303b3d9c5d0d89f4ed55c6b0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Jan 2024 16:38:25 +0100 +Subject: netfilter: nf_tables: validate NFPROTO_* family + +From: Pablo Neira Ayuso + +[ Upstream commit d0009effa8862c20a13af4cb7475d9771b905693 ] + +Several expressions explicitly refer to NF_INET_* hook definitions +from expr->ops->validate, however, family is not validated. + +Bail out with EOPNOTSUPP in case they are used from unsupported +families. 
+ +Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables") +Fixes: a3c90f7a2323 ("netfilter: nf_tables: flow offload expression") +Fixes: 2fa841938c64 ("netfilter: nf_tables: introduce routing expression") +Fixes: 554ced0a6e29 ("netfilter: nf_tables: add support for native socket matching") +Fixes: ad49d86e07a4 ("netfilter: nf_tables: Add synproxy support") +Fixes: 4ed8eb6570a4 ("netfilter: nf_tables: Add native tproxy support") +Fixes: 6c47260250fc ("netfilter: nf_tables: add xfrm expression") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_compat.c | 12 ++++++++++++ + net/netfilter/nft_flow_offload.c | 5 +++++ + net/netfilter/nft_nat.c | 5 +++++ + net/netfilter/nft_rt.c | 5 +++++ + net/netfilter/nft_socket.c | 5 +++++ + net/netfilter/nft_synproxy.c | 7 +++++-- + net/netfilter/nft_tproxy.c | 5 +++++ + net/netfilter/nft_xfrm.c | 5 +++++ + 8 files changed, 47 insertions(+), 2 deletions(-) + +diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c +index 5284cd2ad532..f0eeda97bfcd 100644 +--- a/net/netfilter/nft_compat.c ++++ b/net/netfilter/nft_compat.c +@@ -350,6 +350,12 @@ static int nft_target_validate(const struct nft_ctx *ctx, + unsigned int hook_mask = 0; + int ret; + ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_BRIDGE && ++ ctx->family != NFPROTO_ARP) ++ return -EOPNOTSUPP; ++ + if (nft_is_base_chain(ctx->chain)) { + const struct nft_base_chain *basechain = + nft_base_chain(ctx->chain); +@@ -595,6 +601,12 @@ static int nft_match_validate(const struct nft_ctx *ctx, + unsigned int hook_mask = 0; + int ret; + ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_BRIDGE && ++ ctx->family != NFPROTO_ARP) ++ return -EOPNOTSUPP; ++ + if (nft_is_base_chain(ctx->chain)) { + const struct nft_base_chain *basechain = + nft_base_chain(ctx->chain); +diff --git a/net/netfilter/nft_flow_offload.c 
b/net/netfilter/nft_flow_offload.c +index ab3362c483b4..397351fa4d5f 100644 +--- a/net/netfilter/nft_flow_offload.c ++++ b/net/netfilter/nft_flow_offload.c +@@ -384,6 +384,11 @@ static int nft_flow_offload_validate(const struct nft_ctx *ctx, + { + unsigned int hook_mask = (1 << NF_INET_FORWARD); + ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_INET) ++ return -EOPNOTSUPP; ++ + return nft_chain_validate_hooks(ctx->chain, hook_mask); + } + +diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c +index 583885ce7232..808f5802c270 100644 +--- a/net/netfilter/nft_nat.c ++++ b/net/netfilter/nft_nat.c +@@ -143,6 +143,11 @@ static int nft_nat_validate(const struct nft_ctx *ctx, + struct nft_nat *priv = nft_expr_priv(expr); + int err; + ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_INET) ++ return -EOPNOTSUPP; ++ + err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); + if (err < 0) + return err; +diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c +index 35a2c28caa60..24d977138572 100644 +--- a/net/netfilter/nft_rt.c ++++ b/net/netfilter/nft_rt.c +@@ -166,6 +166,11 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp + const struct nft_rt *priv = nft_expr_priv(expr); + unsigned int hooks; + ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_INET) ++ return -EOPNOTSUPP; ++ + switch (priv->key) { + case NFT_RT_NEXTHOP4: + case NFT_RT_NEXTHOP6: +diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c +index 9ed85be79452..f30163e2ca62 100644 +--- a/net/netfilter/nft_socket.c ++++ b/net/netfilter/nft_socket.c +@@ -242,6 +242,11 @@ static int nft_socket_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) + { ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_INET) ++ 
return -EOPNOTSUPP; ++ + return nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | +diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c +index 13da882669a4..1d737f89dfc1 100644 +--- a/net/netfilter/nft_synproxy.c ++++ b/net/netfilter/nft_synproxy.c +@@ -186,7 +186,6 @@ static int nft_synproxy_do_init(const struct nft_ctx *ctx, + break; + #endif + case NFPROTO_INET: +- case NFPROTO_BRIDGE: + err = nf_synproxy_ipv4_init(snet, ctx->net); + if (err) + goto nf_ct_failure; +@@ -219,7 +218,6 @@ static void nft_synproxy_do_destroy(const struct nft_ctx *ctx) + break; + #endif + case NFPROTO_INET: +- case NFPROTO_BRIDGE: + nf_synproxy_ipv4_fini(snet, ctx->net); + nf_synproxy_ipv6_fini(snet, ctx->net); + break; +@@ -253,6 +251,11 @@ static int nft_synproxy_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) + { ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_INET) ++ return -EOPNOTSUPP; ++ + return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD)); + } +diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c +index ae15cd693f0e..71412adb73d4 100644 +--- a/net/netfilter/nft_tproxy.c ++++ b/net/netfilter/nft_tproxy.c +@@ -316,6 +316,11 @@ static int nft_tproxy_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) + { ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_INET) ++ return -EOPNOTSUPP; ++ + return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING); + } + +diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c +index 452f8587adda..1c866757db55 100644 +--- a/net/netfilter/nft_xfrm.c ++++ b/net/netfilter/nft_xfrm.c +@@ -235,6 +235,11 @@ static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *e + const struct nft_xfrm *priv = 
nft_expr_priv(expr); + unsigned int hooks; + ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_INET) ++ return -EOPNOTSUPP; ++ + switch (priv->dir) { + case XFRM_POLICY_IN: + hooks = (1 << NF_INET_FORWARD) | +-- +2.43.0 + diff --git a/queue-6.6/netfilter-nft_limit-reject-configurations-that-cause.patch b/queue-6.6/netfilter-nft_limit-reject-configurations-that-cause.patch new file mode 100644 index 00000000000..b34eb8cf876 --- /dev/null +++ b/queue-6.6/netfilter-nft_limit-reject-configurations-that-cause.patch @@ -0,0 +1,83 @@ +From 998d4aae3b7603145fb0d6e462c4b7677e2c634b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 13:11:32 +0100 +Subject: netfilter: nft_limit: reject configurations that cause integer + overflow + +From: Florian Westphal + +[ Upstream commit c9d9eb9c53d37cdebbad56b91e40baf42d5a97aa ] + +Reject bogus configs where internal token counter wraps around. +This only occurs with very very large requests, such as 17gbyte/s. + +Its better to reject this rather than having incorrect ratelimit. 
+ +Fixes: d2168e849ebf ("netfilter: nft_limit: add per-byte limiting") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_limit.c | 23 ++++++++++++++++------- + 1 file changed, 16 insertions(+), 7 deletions(-) + +diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c +index 79039afde34e..cefa25e0dbb0 100644 +--- a/net/netfilter/nft_limit.c ++++ b/net/netfilter/nft_limit.c +@@ -58,17 +58,19 @@ static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost) + static int nft_limit_init(struct nft_limit_priv *priv, + const struct nlattr * const tb[], bool pkts) + { ++ u64 unit, tokens, rate_with_burst; + bool invert = false; +- u64 unit, tokens; + + if (tb[NFTA_LIMIT_RATE] == NULL || + tb[NFTA_LIMIT_UNIT] == NULL) + return -EINVAL; + + priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); ++ if (priv->rate == 0) ++ return -EINVAL; ++ + unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); +- priv->nsecs = unit * NSEC_PER_SEC; +- if (priv->rate == 0 || priv->nsecs < unit) ++ if (check_mul_overflow(unit, NSEC_PER_SEC, &priv->nsecs)) + return -EOVERFLOW; + + if (tb[NFTA_LIMIT_BURST]) +@@ -77,18 +79,25 @@ static int nft_limit_init(struct nft_limit_priv *priv, + if (pkts && priv->burst == 0) + priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT; + +- if (priv->rate + priv->burst < priv->rate) ++ if (check_add_overflow(priv->rate, priv->burst, &rate_with_burst)) + return -EOVERFLOW; + + if (pkts) { +- tokens = div64_u64(priv->nsecs, priv->rate) * priv->burst; ++ u64 tmp = div64_u64(priv->nsecs, priv->rate); ++ ++ if (check_mul_overflow(tmp, priv->burst, &tokens)) ++ return -EOVERFLOW; + } else { ++ u64 tmp; ++ + /* The token bucket size limits the number of tokens can be + * accumulated. tokens_max specifies the bucket size. + * tokens_max = unit * (rate + burst) / rate. 
+ */ +- tokens = div64_u64(priv->nsecs * (priv->rate + priv->burst), +- priv->rate); ++ if (check_mul_overflow(priv->nsecs, rate_with_burst, &tmp)) ++ return -EOVERFLOW; ++ ++ tokens = div64_u64(tmp, priv->rate); + } + + if (tb[NFTA_LIMIT_FLAGS]) { +-- +2.43.0 + diff --git a/queue-6.6/netfs-fscache-prevent-oops-in-fscache_put_cache.patch b/queue-6.6/netfs-fscache-prevent-oops-in-fscache_put_cache.patch new file mode 100644 index 00000000000..19546655008 --- /dev/null +++ b/queue-6.6/netfs-fscache-prevent-oops-in-fscache_put_cache.patch @@ -0,0 +1,44 @@ +From c64fd15dd673caa6944140c91c582526022281a7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Jan 2024 09:59:41 +0300 +Subject: netfs, fscache: Prevent Oops in fscache_put_cache() + +From: Dan Carpenter + +[ Upstream commit 3be0b3ed1d76c6703b9ee482b55f7e01c369cc68 ] + +This function dereferences "cache" and then checks if it's +IS_ERR_OR_NULL(). Check first, then dereference. + +Fixes: 9549332df4ed ("fscache: Implement cache registration") +Signed-off-by: Dan Carpenter +Signed-off-by: David Howells +Link: https://lore.kernel.org/r/e84bc740-3502-4f16-982a-a40d5676615c@moroto.mountain/ # v2 +Signed-off-by: Sasha Levin +--- + fs/fscache/cache.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c +index d645f8b302a2..9397ed39b0b4 100644 +--- a/fs/fscache/cache.c ++++ b/fs/fscache/cache.c +@@ -179,13 +179,14 @@ EXPORT_SYMBOL(fscache_acquire_cache); + void fscache_put_cache(struct fscache_cache *cache, + enum fscache_cache_trace where) + { +- unsigned int debug_id = cache->debug_id; ++ unsigned int debug_id; + bool zero; + int ref; + + if (IS_ERR_OR_NULL(cache)) + return; + ++ debug_id = cache->debug_id; + zero = __refcount_dec_and_test(&cache->ref, &ref); + trace_fscache_cache(debug_id, ref - 1, where); + +-- +2.43.0 + diff --git a/queue-6.6/netlink-fix-potential-sleeping-issue-in-mqueue_flush.patch 
b/queue-6.6/netlink-fix-potential-sleeping-issue-in-mqueue_flush.patch new file mode 100644 index 00000000000..d09128870ce --- /dev/null +++ b/queue-6.6/netlink-fix-potential-sleeping-issue-in-mqueue_flush.patch @@ -0,0 +1,76 @@ +From 1212fe5426423b34e0f08d1c34182d9395c23de8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 09:18:07 +0800 +Subject: netlink: fix potential sleeping issue in mqueue_flush_file + +From: Zhengchao Shao + +[ Upstream commit 234ec0b6034b16869d45128b8cd2dc6ffe596f04 ] + +I analyze the potential sleeping issue of the following processes: +Thread A Thread B +... netlink_create //ref = 1 +do_mq_notify ... + sock = netlink_getsockbyfilp ... //ref = 2 + info->notify_sock = sock; ... +... netlink_sendmsg +... skb = netlink_alloc_large_skb //skb->head is vmalloced +... netlink_unicast +... sk = netlink_getsockbyportid //ref = 3 +... netlink_sendskb +... __netlink_sendskb +... skb_queue_tail //put skb to sk_receive_queue +... sock_put //ref = 2 +... ... +... netlink_release +... deferred_put_nlk_sk //ref = 1 +mqueue_flush_file + spin_lock + remove_notification + netlink_sendskb + sock_put //ref = 0 + sk_free + ... + __sk_destruct + netlink_sock_destruct + skb_queue_purge //get skb from sk_receive_queue + ... + __skb_queue_purge_reason + kfree_skb_reason + __kfree_skb + ... + skb_release_all + skb_release_head_state + netlink_skb_destructor + vfree(skb->head) //sleeping while holding spinlock + +In netlink_sendmsg, if the memory pointed to by skb->head is allocated by +vmalloc, and is put to sk_receive_queue queue, also the skb is not freed. +When the mqueue executes flush, the sleeping bug will occur. Use +vfree_atomic instead of vfree in netlink_skb_destructor to solve the issue. 
+ +Fixes: c05cdb1b864f ("netlink: allow large data transfers from user-space") +Signed-off-by: Zhengchao Shao +Link: https://lore.kernel.org/r/20240122011807.2110357-1-shaozhengchao@huawei.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + net/netlink/af_netlink.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c +index eb086b06d60d..d9107b545d36 100644 +--- a/net/netlink/af_netlink.c ++++ b/net/netlink/af_netlink.c +@@ -374,7 +374,7 @@ static void netlink_skb_destructor(struct sk_buff *skb) + if (is_vmalloc_addr(skb->head)) { + if (!skb->cloned || + !atomic_dec_return(&(skb_shinfo(skb)->dataref))) +- vfree(skb->head); ++ vfree_atomic(skb->head); + + skb->head = NULL; + } +-- +2.43.0 + diff --git a/queue-6.6/rcu-defer-rcu-kthreads-wakeup-when-cpu-is-dying.patch b/queue-6.6/rcu-defer-rcu-kthreads-wakeup-when-cpu-is-dying.patch new file mode 100644 index 00000000000..d01daef608d --- /dev/null +++ b/queue-6.6/rcu-defer-rcu-kthreads-wakeup-when-cpu-is-dying.patch @@ -0,0 +1,141 @@ +From 25442abcd36f14a6891e57e93be65b28cb2cb890 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 19 Dec 2023 00:19:15 +0100 +Subject: rcu: Defer RCU kthreads wakeup when CPU is dying + +From: Frederic Weisbecker + +[ Upstream commit e787644caf7628ad3269c1fbd321c3255cf51710 ] + +When the CPU goes idle for the last time during the CPU down hotplug +process, RCU reports a final quiescent state for the current CPU. If +this quiescent state propagates up to the top, some tasks may then be +woken up to complete the grace period: the main grace period kthread +and/or the expedited main workqueue (or kworker). + +If those kthreads have a SCHED_FIFO policy, the wake up can indirectly +arm the RT bandwith timer to the local offline CPU. Since this happens +after hrtimers have been migrated at CPUHP_AP_HRTIMERS_DYING stage, the +timer gets ignored. 
Therefore if the RCU kthreads are waiting for RT +bandwidth to be available, they may never be actually scheduled. + +This triggers TREE03 rcutorture hangs: + + rcu: INFO: rcu_preempt self-detected stall on CPU + rcu: 4-...!: (1 GPs behind) idle=9874/1/0x4000000000000000 softirq=0/0 fqs=20 rcuc=21071 jiffies(starved) + rcu: (t=21035 jiffies g=938281 q=40787 ncpus=6) + rcu: rcu_preempt kthread starved for 20964 jiffies! g938281 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x0 ->cpu=0 + rcu: Unless rcu_preempt kthread gets sufficient CPU time, OOM is now expected behavior. + rcu: RCU grace-period kthread stack dump: + task:rcu_preempt state:R running task stack:14896 pid:14 tgid:14 ppid:2 flags:0x00004000 + Call Trace: + + __schedule+0x2eb/0xa80 + schedule+0x1f/0x90 + schedule_timeout+0x163/0x270 + ? __pfx_process_timeout+0x10/0x10 + rcu_gp_fqs_loop+0x37c/0x5b0 + ? __pfx_rcu_gp_kthread+0x10/0x10 + rcu_gp_kthread+0x17c/0x200 + kthread+0xde/0x110 + ? __pfx_kthread+0x10/0x10 + ret_from_fork+0x2b/0x40 + ? __pfx_kthread+0x10/0x10 + ret_from_fork_asm+0x1b/0x30 + + +The situation can't be solved with just unpinning the timer. The hrtimer +infrastructure and the nohz heuristics involved in finding the best +remote target for an unpinned timer would then also need to handle +enqueues from an offline CPU in the most horrendous way. + +So fix this on the RCU side instead and defer the wake up to an online +CPU if it's too late for the local one. + +Reported-by: Paul E. McKenney +Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") +Signed-off-by: Frederic Weisbecker +Signed-off-by: Paul E. 
McKenney +Signed-off-by: Neeraj Upadhyay (AMD) +Signed-off-by: Sasha Levin +--- + kernel/rcu/tree.c | 34 +++++++++++++++++++++++++++++++++- + kernel/rcu/tree_exp.h | 3 +-- + 2 files changed, 34 insertions(+), 3 deletions(-) + +diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c +index 9af42eae1ba3..4fe47ed95eeb 100644 +--- a/kernel/rcu/tree.c ++++ b/kernel/rcu/tree.c +@@ -1013,6 +1013,38 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) + return needmore; + } + ++static void swake_up_one_online_ipi(void *arg) ++{ ++ struct swait_queue_head *wqh = arg; ++ ++ swake_up_one(wqh); ++} ++ ++static void swake_up_one_online(struct swait_queue_head *wqh) ++{ ++ int cpu = get_cpu(); ++ ++ /* ++ * If called from rcutree_report_cpu_starting(), wake up ++ * is dangerous that late in the CPU-down hotplug process. The ++ * scheduler might queue an ignored hrtimer. Defer the wake up ++ * to an online CPU instead. ++ */ ++ if (unlikely(cpu_is_offline(cpu))) { ++ int target; ++ ++ target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU), ++ cpu_online_mask); ++ ++ smp_call_function_single(target, swake_up_one_online_ipi, ++ wqh, 0); ++ put_cpu(); ++ } else { ++ put_cpu(); ++ swake_up_one(wqh); ++ } ++} ++ + /* + * Awaken the grace-period kthread. 
Don't do a self-awaken (unless in an + * interrupt or softirq handler, in which case we just might immediately +@@ -1037,7 +1069,7 @@ static void rcu_gp_kthread_wake(void) + return; + WRITE_ONCE(rcu_state.gp_wake_time, jiffies); + WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); +- swake_up_one(&rcu_state.gp_wq); ++ swake_up_one_online(&rcu_state.gp_wq); + } + + /* +diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h +index 8239b39d945b..6e87dc764f47 100644 +--- a/kernel/rcu/tree_exp.h ++++ b/kernel/rcu/tree_exp.h +@@ -173,7 +173,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) + return ret; + } + +- + /* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU +@@ -201,7 +200,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ +- swake_up_one(&rcu_state.expedited_wq); ++ swake_up_one_online(&rcu_state.expedited_wq); + } + break; + } +-- +2.43.0 + diff --git a/queue-6.6/selftest-don-t-reuse-port-for-so_incoming_cpu-test.patch b/queue-6.6/selftest-don-t-reuse-port-for-so_incoming_cpu-test.patch new file mode 100644 index 00000000000..6cb6a67b9ae --- /dev/null +++ b/queue-6.6/selftest-don-t-reuse-port-for-so_incoming_cpu-test.patch @@ -0,0 +1,231 @@ +From e172b3627d5871d6afe0da3566ba055ff9f14ca7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 19:16:42 -0800 +Subject: selftest: Don't reuse port for SO_INCOMING_CPU test. + +From: Kuniyuki Iwashima + +[ Upstream commit 97de5a15edf2d22184f5ff588656030bbb7fa358 ] + +Jakub reported that ASSERT_EQ(cpu, i) in so_incoming_cpu.c seems to +fire somewhat randomly. + + # # RUN so_incoming_cpu.before_reuseport.test3 ... 
+ # # so_incoming_cpu.c:191:test3:Expected cpu (32) == i (0) + # # test3: Test terminated by assertion + # # FAIL so_incoming_cpu.before_reuseport.test3 + # not ok 3 so_incoming_cpu.before_reuseport.test3 + +When the test failed, not-yet-accepted CLOSE_WAIT sockets received +SYN with a "challenging" SEQ number, which was sent from an unexpected +CPU that did not create the receiver. + +The test basically does: + + 1. for each cpu: + 1-1. create a server + 1-2. set SO_INCOMING_CPU + + 2. for each cpu: + 2-1. set cpu affinity + 2-2. create some clients + 2-3. let clients connect() to the server on the same cpu + 2-4. close() clients + + 3. for each server: + 3-1. accept() all child sockets + 3-2. check if all children have the same SO_INCOMING_CPU with the server + +The root cause was the close() in 2-4. and net.ipv4.tcp_tw_reuse. + +In a loop of 2., close() changed the client state to FIN_WAIT_2, and +the peer transitioned to CLOSE_WAIT. + +In another loop of 2., connect() happened to select the same port of +the FIN_WAIT_2 socket, and it was reused as the default value of +net.ipv4.tcp_tw_reuse is 2. + +As a result, the new client sent SYN to the CLOSE_WAIT socket from +a different CPU, and the receiver's sk_incoming_cpu was overwritten +with unexpected CPU ID. + +Also, the SYN had a different SEQ number, so the CLOSE_WAIT socket +responded with Challenge ACK. The new client properly returned RST +and effectively killed the CLOSE_WAIT socket. + +This way, all clients were created successfully, but the error was +detected later by 3-2., ASSERT_EQ(cpu, i). + +To avoid the failure, let's make sure that (i) the number of clients +is less than the number of available ports and (ii) such reuse never +happens. 
+ +Fixes: 6df96146b202 ("selftest: Add test for SO_INCOMING_CPU.") +Reported-by: Jakub Kicinski +Signed-off-by: Kuniyuki Iwashima +Tested-by: Jakub Kicinski +Link: https://lore.kernel.org/r/20240120031642.67014-1-kuniyu@amazon.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/net/so_incoming_cpu.c | 68 ++++++++++++++----- + 1 file changed, 50 insertions(+), 18 deletions(-) + +diff --git a/tools/testing/selftests/net/so_incoming_cpu.c b/tools/testing/selftests/net/so_incoming_cpu.c +index a14818164102..e9fa14e10732 100644 +--- a/tools/testing/selftests/net/so_incoming_cpu.c ++++ b/tools/testing/selftests/net/so_incoming_cpu.c +@@ -3,19 +3,16 @@ + #define _GNU_SOURCE + #include + ++#include ++ + #include + #include + #include + + #include "../kselftest_harness.h" + +-#define CLIENT_PER_SERVER 32 /* More sockets, more reliable */ +-#define NR_SERVER self->nproc +-#define NR_CLIENT (CLIENT_PER_SERVER * NR_SERVER) +- + FIXTURE(so_incoming_cpu) + { +- int nproc; + int *servers; + union { + struct sockaddr addr; +@@ -56,12 +53,47 @@ FIXTURE_VARIANT_ADD(so_incoming_cpu, after_all_listen) + .when_to_set = AFTER_ALL_LISTEN, + }; + ++static void write_sysctl(struct __test_metadata *_metadata, ++ char *filename, char *string) ++{ ++ int fd, len, ret; ++ ++ fd = open(filename, O_WRONLY); ++ ASSERT_NE(fd, -1); ++ ++ len = strlen(string); ++ ret = write(fd, string, len); ++ ASSERT_EQ(ret, len); ++} ++ ++static void setup_netns(struct __test_metadata *_metadata) ++{ ++ ASSERT_EQ(unshare(CLONE_NEWNET), 0); ++ ASSERT_EQ(system("ip link set lo up"), 0); ++ ++ write_sysctl(_metadata, "/proc/sys/net/ipv4/ip_local_port_range", "10000 60001"); ++ write_sysctl(_metadata, "/proc/sys/net/ipv4/tcp_tw_reuse", "0"); ++} ++ ++#define NR_PORT (60001 - 10000 - 1) ++#define NR_CLIENT_PER_SERVER_DEFAULT 32 ++static int nr_client_per_server, nr_server, nr_client; ++ + FIXTURE_SETUP(so_incoming_cpu) + { +- self->nproc = get_nprocs(); +- ASSERT_LE(2, 
self->nproc); ++ setup_netns(_metadata); ++ ++ nr_server = get_nprocs(); ++ ASSERT_LE(2, nr_server); ++ ++ if (NR_CLIENT_PER_SERVER_DEFAULT * nr_server < NR_PORT) ++ nr_client_per_server = NR_CLIENT_PER_SERVER_DEFAULT; ++ else ++ nr_client_per_server = NR_PORT / nr_server; ++ ++ nr_client = nr_client_per_server * nr_server; + +- self->servers = malloc(sizeof(int) * NR_SERVER); ++ self->servers = malloc(sizeof(int) * nr_server); + ASSERT_NE(self->servers, NULL); + + self->in_addr.sin_family = AF_INET; +@@ -74,7 +106,7 @@ FIXTURE_TEARDOWN(so_incoming_cpu) + { + int i; + +- for (i = 0; i < NR_SERVER; i++) ++ for (i = 0; i < nr_server; i++) + close(self->servers[i]); + + free(self->servers); +@@ -110,10 +142,10 @@ int create_server(struct __test_metadata *_metadata, + if (variant->when_to_set == BEFORE_LISTEN) + set_so_incoming_cpu(_metadata, fd, cpu); + +- /* We don't use CLIENT_PER_SERVER here not to block ++ /* We don't use nr_client_per_server here not to block + * this test at connect() if SO_INCOMING_CPU is broken. 
+ */ +- ret = listen(fd, NR_CLIENT); ++ ret = listen(fd, nr_client); + ASSERT_EQ(ret, 0); + + if (variant->when_to_set == AFTER_LISTEN) +@@ -128,7 +160,7 @@ void create_servers(struct __test_metadata *_metadata, + { + int i, ret; + +- for (i = 0; i < NR_SERVER; i++) { ++ for (i = 0; i < nr_server; i++) { + self->servers[i] = create_server(_metadata, self, variant, i); + + if (i == 0) { +@@ -138,7 +170,7 @@ void create_servers(struct __test_metadata *_metadata, + } + + if (variant->when_to_set == AFTER_ALL_LISTEN) { +- for (i = 0; i < NR_SERVER; i++) ++ for (i = 0; i < nr_server; i++) + set_so_incoming_cpu(_metadata, self->servers[i], i); + } + } +@@ -149,7 +181,7 @@ void create_clients(struct __test_metadata *_metadata, + cpu_set_t cpu_set; + int i, j, fd, ret; + +- for (i = 0; i < NR_SERVER; i++) { ++ for (i = 0; i < nr_server; i++) { + CPU_ZERO(&cpu_set); + + CPU_SET(i, &cpu_set); +@@ -162,7 +194,7 @@ void create_clients(struct __test_metadata *_metadata, + ret = sched_setaffinity(0, sizeof(cpu_set), &cpu_set); + ASSERT_EQ(ret, 0); + +- for (j = 0; j < CLIENT_PER_SERVER; j++) { ++ for (j = 0; j < nr_client_per_server; j++) { + fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_NE(fd, -1); + +@@ -180,8 +212,8 @@ void verify_incoming_cpu(struct __test_metadata *_metadata, + int i, j, fd, cpu, ret, total = 0; + socklen_t len = sizeof(int); + +- for (i = 0; i < NR_SERVER; i++) { +- for (j = 0; j < CLIENT_PER_SERVER; j++) { ++ for (i = 0; i < nr_server; i++) { ++ for (j = 0; j < nr_client_per_server; j++) { + /* If we see -EAGAIN here, SO_INCOMING_CPU is broken */ + fd = accept(self->servers[i], &self->addr, &self->addrlen); + ASSERT_NE(fd, -1); +@@ -195,7 +227,7 @@ void verify_incoming_cpu(struct __test_metadata *_metadata, + } + } + +- ASSERT_EQ(total, NR_CLIENT); ++ ASSERT_EQ(total, nr_client); + TH_LOG("SO_INCOMING_CPU is very likely to be " + "working correctly with %d sockets.", total); + } +-- +2.43.0 + diff --git 
a/queue-6.6/selftests-bonding-do-not-test-arp-ns-target-with-mod.patch b/queue-6.6/selftests-bonding-do-not-test-arp-ns-target-with-mod.patch new file mode 100644 index 00000000000..4f873ef3379 --- /dev/null +++ b/queue-6.6/selftests-bonding-do-not-test-arp-ns-target-with-mod.patch @@ -0,0 +1,63 @@ +From 5e571a63fbab26f1f130953fc67d7180143df056 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Jan 2024 15:59:17 +0800 +Subject: selftests: bonding: do not test arp/ns target with mode + balance-alb/tlb + +From: Hangbin Liu + +[ Upstream commit a2933a8759a62269754e54733d993b19de870e84 ] + +The prio_arp/ns tests hard code the mode to active-backup. At the same +time, The balance-alb/tlb modes do not support arp/ns target. So remove +the prio_arp/ns tests from the loop and only test active-backup mode. + +Fixes: 481b56e0391e ("selftests: bonding: re-format bond option tests") +Reported-by: Jay Vosburgh +Closes: https://lore.kernel.org/netdev/17415.1705965957@famine/ +Signed-off-by: Hangbin Liu +Acked-by: Jay Vosburgh +Link: https://lore.kernel.org/r/20240123075917.1576360-1-liuhangbin@gmail.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + .../testing/selftests/drivers/net/bonding/bond_options.sh | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh +index c54d1697f439..d508486cc0bd 100755 +--- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh ++++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh +@@ -162,7 +162,7 @@ prio_arp() + local mode=$1 + + for primary_reselect in 0 1 2; do +- prio_test "mode active-backup arp_interval 100 arp_ip_target ${g_ip4} primary eth1 primary_reselect $primary_reselect" ++ prio_test "mode $mode arp_interval 100 arp_ip_target ${g_ip4} primary eth1 primary_reselect $primary_reselect" + log_test "prio" "$mode arp_ip_target primary_reselect 
$primary_reselect" + done + } +@@ -178,7 +178,7 @@ prio_ns() + fi + + for primary_reselect in 0 1 2; do +- prio_test "mode active-backup arp_interval 100 ns_ip6_target ${g_ip6} primary eth1 primary_reselect $primary_reselect" ++ prio_test "mode $mode arp_interval 100 ns_ip6_target ${g_ip6} primary eth1 primary_reselect $primary_reselect" + log_test "prio" "$mode ns_ip6_target primary_reselect $primary_reselect" + done + } +@@ -194,9 +194,9 @@ prio() + + for mode in $modes; do + prio_miimon $mode +- prio_arp $mode +- prio_ns $mode + done ++ prio_arp "active-backup" ++ prio_ns "active-backup" + } + + arp_validate_test() +-- +2.43.0 + diff --git a/queue-6.6/selftests-bonding-increase-timeout-to-1200s.patch b/queue-6.6/selftests-bonding-increase-timeout-to-1200s.patch new file mode 100644 index 00000000000..0b6844de64f --- /dev/null +++ b/queue-6.6/selftests-bonding-increase-timeout-to-1200s.patch @@ -0,0 +1,56 @@ +From 36694814a356ab1babaef9c760f7939542ef77e3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Jan 2024 19:12:32 -0500 +Subject: selftests: bonding: Increase timeout to 1200s + +From: Benjamin Poirier + +[ Upstream commit b01f15a7571b7aa222458bc9bf26ab59bd84e384 ] + +When tests are run by runner.sh, bond_options.sh gets killed before +it can complete: + +make -C tools/testing/selftests run_tests TARGETS="drivers/net/bonding" + [...] 
+ # timeout set to 120 + # selftests: drivers/net/bonding: bond_options.sh + # TEST: prio (active-backup miimon primary_reselect 0) [ OK ] + # TEST: prio (active-backup miimon primary_reselect 1) [ OK ] + # TEST: prio (active-backup miimon primary_reselect 2) [ OK ] + # TEST: prio (active-backup arp_ip_target primary_reselect 0) [ OK ] + # TEST: prio (active-backup arp_ip_target primary_reselect 1) [ OK ] + # TEST: prio (active-backup arp_ip_target primary_reselect 2) [ OK ] + # + not ok 7 selftests: drivers/net/bonding: bond_options.sh # TIMEOUT 120 seconds + +This test includes many sleep statements, at least some of which are +related to timers in the operation of the bonding driver itself. Increase +the test timeout to allow the test to complete. + +I ran the test in slightly different VMs (including one without HW +virtualization support) and got runtimes of 13m39.760s, 13m31.238s, and +13m2.956s. Use a ~1.5x "safety factor" and set the timeout to 1200s. + +Fixes: 42a8d4aaea84 ("selftests: bonding: add bonding prio option test") +Reported-by: Jakub Kicinski +Closes: https://lore.kernel.org/netdev/20240116104402.1203850a@kernel.org/#t +Suggested-by: Jakub Kicinski +Signed-off-by: Benjamin Poirier +Reviewed-by: Hangbin Liu +Link: https://lore.kernel.org/r/20240118001233.304759-1-bpoirier@nvidia.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/drivers/net/bonding/settings | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/drivers/net/bonding/settings b/tools/testing/selftests/drivers/net/bonding/settings +index 6091b45d226b..79b65bdf05db 100644 +--- a/tools/testing/selftests/drivers/net/bonding/settings ++++ b/tools/testing/selftests/drivers/net/bonding/settings +@@ -1 +1 @@ +-timeout=120 ++timeout=1200 +-- +2.43.0 + diff --git a/queue-6.6/selftests-fill-in-some-missing-configs-for-net.patch b/queue-6.6/selftests-fill-in-some-missing-configs-for-net.patch new file mode 100644 
index 00000000000..6d390f902a1 --- /dev/null +++ b/queue-6.6/selftests-fill-in-some-missing-configs-for-net.patch @@ -0,0 +1,117 @@ +From 747c876e953ce89b719a939e459240ba7cb67b6b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 12:35:28 -0800 +Subject: selftests: fill in some missing configs for net + +From: Jakub Kicinski + +[ Upstream commit 04fe7c5029cbdbcdb28917f09a958d939a8f19f7 ] + +We are missing a lot of config options from net selftests, +it seems: + +tun/tap: CONFIG_TUN, CONFIG_MACVLAN, CONFIG_MACVTAP +fib_tests: CONFIG_NET_SCH_FQ_CODEL +l2tp: CONFIG_L2TP, CONFIG_L2TP_V3, CONFIG_L2TP_IP, CONFIG_L2TP_ETH +sctp-vrf: CONFIG_INET_DIAG +txtimestamp: CONFIG_NET_CLS_U32 +vxlan_mdb: CONFIG_BRIDGE_VLAN_FILTERING +gre_gso: CONFIG_NET_IPGRE_DEMUX, CONFIG_IP_GRE, CONFIG_IPV6_GRE +srv6_end_dt*_l3vpn: CONFIG_IPV6_SEG6_LWTUNNEL +ip_local_port_range: CONFIG_MPTCP +fib_test: CONFIG_NET_CLS_BASIC +rtnetlink: CONFIG_MACSEC, CONFIG_NET_SCH_HTB, CONFIG_XFRM_INTERFACE + CONFIG_NET_IPGRE, CONFIG_BONDING +fib_nexthops: CONFIG_MPLS, CONFIG_MPLS_ROUTING +vxlan_mdb: CONFIG_NET_ACT_GACT +tls: CONFIG_TLS, CONFIG_CRYPTO_CHACHA20POLY1305 +psample: CONFIG_PSAMPLE +fcnal: CONFIG_TCP_MD5SIG + +Try to add them in a semi-alphabetical order. 
+ +Fixes: 62199e3f1658 ("selftests: net: Add VXLAN MDB test") +Fixes: c12e0d5f267d ("self-tests: introduce self-tests for RPS default mask") +Fixes: 122db5e3634b ("selftests/net: add MPTCP coverage for IP_LOCAL_PORT_RANGE") +Link: https://lore.kernel.org/r/20240122203528.672004-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/net/config | 28 ++++++++++++++++++++++++++++ + 1 file changed, 28 insertions(+) + +diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config +index 8da562a9ae87..19ff75051660 100644 +--- a/tools/testing/selftests/net/config ++++ b/tools/testing/selftests/net/config +@@ -1,5 +1,6 @@ + CONFIG_USER_NS=y + CONFIG_NET_NS=y ++CONFIG_BONDING=m + CONFIG_BPF_SYSCALL=y + CONFIG_TEST_BPF=m + CONFIG_NUMA=y +@@ -14,9 +15,13 @@ CONFIG_VETH=y + CONFIG_NET_IPVTI=y + CONFIG_IPV6_VTI=y + CONFIG_DUMMY=y ++CONFIG_BRIDGE_VLAN_FILTERING=y + CONFIG_BRIDGE=y ++CONFIG_CRYPTO_CHACHA20POLY1305=m + CONFIG_VLAN_8021Q=y + CONFIG_IFB=y ++CONFIG_INET_DIAG=y ++CONFIG_IP_GRE=m + CONFIG_NETFILTER=y + CONFIG_NETFILTER_ADVANCED=y + CONFIG_NF_CONNTRACK=m +@@ -25,15 +30,36 @@ CONFIG_IP6_NF_IPTABLES=m + CONFIG_IP_NF_IPTABLES=m + CONFIG_IP6_NF_NAT=m + CONFIG_IP_NF_NAT=m ++CONFIG_IPV6_GRE=m ++CONFIG_IPV6_SEG6_LWTUNNEL=y ++CONFIG_L2TP_ETH=m ++CONFIG_L2TP_IP=m ++CONFIG_L2TP=m ++CONFIG_L2TP_V3=y ++CONFIG_MACSEC=m ++CONFIG_MACVLAN=y ++CONFIG_MACVTAP=y ++CONFIG_MPLS=y ++CONFIG_MPTCP=y + CONFIG_NF_TABLES=m + CONFIG_NF_TABLES_IPV6=y + CONFIG_NF_TABLES_IPV4=y + CONFIG_NFT_NAT=m ++CONFIG_NET_ACT_GACT=m ++CONFIG_NET_CLS_BASIC=m ++CONFIG_NET_CLS_U32=m ++CONFIG_NET_IPGRE_DEMUX=m ++CONFIG_NET_IPGRE=m ++CONFIG_NET_SCH_FQ_CODEL=m ++CONFIG_NET_SCH_HTB=m + CONFIG_NET_SCH_FQ=m + CONFIG_NET_SCH_ETF=m + CONFIG_NET_SCH_NETEM=y ++CONFIG_PSAMPLE=m ++CONFIG_TCP_MD5SIG=y + CONFIG_TEST_BLACKHOLE_DEV=m + CONFIG_KALLSYMS=y ++CONFIG_TLS=m + CONFIG_TRACEPOINTS=y + CONFIG_NET_DROP_MONITOR=m + CONFIG_NETDEVSIM=m +@@ -48,7 +74,9 @@ 
CONFIG_BAREUDP=m + CONFIG_IPV6_IOAM6_LWTUNNEL=y + CONFIG_CRYPTO_SM4_GENERIC=y + CONFIG_AMT=m ++CONFIG_TUN=y + CONFIG_VXLAN=m + CONFIG_IP_SCTP=m + CONFIG_NETFILTER_XT_MATCH_POLICY=m + CONFIG_CRYPTO_ARIA=y ++CONFIG_XFRM_INTERFACE=m +-- +2.43.0 + diff --git a/queue-6.6/selftests-net-fix-rps_default_mask-with-32-cpus.patch b/queue-6.6/selftests-net-fix-rps_default_mask-with-32-cpus.patch new file mode 100644 index 00000000000..e2cdabfa420 --- /dev/null +++ b/queue-6.6/selftests-net-fix-rps_default_mask-with-32-cpus.patch @@ -0,0 +1,51 @@ +From 06ca639fba800dcbdc8d7762501231c341aaf82f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 11:58:15 -0800 +Subject: selftests: net: fix rps_default_mask with >32 CPUs + +From: Jakub Kicinski + +[ Upstream commit 0719b5338a0cbe80d1637a5fb03d8141b5bfc7a1 ] + +If there is more than 32 cpus the bitmask will start to contain +commas, leading to: + +./rps_default_mask.sh: line 36: [: 00000000,00000000: integer expression expected + +Remove the commas, bash doesn't interpret leading zeroes as oct +so that should be good enough. Switch to bash, Simon reports that +not all shells support this type of substitution. 
+ +Fixes: c12e0d5f267d ("self-tests: introduce self-tests for RPS default mask") +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20240122195815.638997-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/net/rps_default_mask.sh | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/net/rps_default_mask.sh b/tools/testing/selftests/net/rps_default_mask.sh +index a26c5624429f..4287a8529890 100755 +--- a/tools/testing/selftests/net/rps_default_mask.sh ++++ b/tools/testing/selftests/net/rps_default_mask.sh +@@ -1,4 +1,4 @@ +-#!/bin/sh ++#!/bin/bash + # SPDX-License-Identifier: GPL-2.0 + + readonly ksft_skip=4 +@@ -33,6 +33,10 @@ chk_rps() { + + rps_mask=$($cmd /sys/class/net/$dev_name/queues/rx-0/rps_cpus) + printf "%-60s" "$msg" ++ ++ # In case there is more than 32 CPUs we need to remove commas from masks ++ rps_mask=${rps_mask//,} ++ expected_rps_mask=${expected_rps_mask//,} + if [ $rps_mask -eq $expected_rps_mask ]; then + echo "[ ok ]" + else +-- +2.43.0 + diff --git a/queue-6.6/selftests-netdevsim-fix-the-udp_tunnel_nic-test.patch b/queue-6.6/selftests-netdevsim-fix-the-udp_tunnel_nic-test.patch new file mode 100644 index 00000000000..0efde9159ed --- /dev/null +++ b/queue-6.6/selftests-netdevsim-fix-the-udp_tunnel_nic-test.patch @@ -0,0 +1,102 @@ +From 59cd40d48593ee5845289d25d185c890c62b8c2d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 22:05:29 -0800 +Subject: selftests: netdevsim: fix the udp_tunnel_nic test + +From: Jakub Kicinski + +[ Upstream commit 0879020a7817e7ce636372c016b4528f541c9f4d ] + +This test is missing a whole bunch of checks for interface +renaming and one ifup. Presumably it was only used on a system +with renaming disabled and NetworkManager running. 
+ +Fixes: 91f430b2c49d ("selftests: net: add a test for UDP tunnel info infra") +Acked-by: Paolo Abeni +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20240123060529.1033912-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + .../selftests/drivers/net/netdevsim/udp_tunnel_nic.sh | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh +index 1b08e042cf94..185b02d2d4cd 100755 +--- a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh ++++ b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh +@@ -269,6 +269,7 @@ for port in 0 1; do + echo 1 > $NSIM_DEV_SYS/new_port + fi + NSIM_NETDEV=`get_netdev_name old_netdevs` ++ ifconfig $NSIM_NETDEV up + + msg="new NIC device created" + exp0=( 0 0 0 0 ) +@@ -430,6 +431,7 @@ for port in 0 1; do + fi + + echo $port > $NSIM_DEV_SYS/new_port ++ NSIM_NETDEV=`get_netdev_name old_netdevs` + ifconfig $NSIM_NETDEV up + + overflow_table0 "overflow NIC table" +@@ -487,6 +489,7 @@ for port in 0 1; do + fi + + echo $port > $NSIM_DEV_SYS/new_port ++ NSIM_NETDEV=`get_netdev_name old_netdevs` + ifconfig $NSIM_NETDEV up + + overflow_table0 "overflow NIC table" +@@ -543,6 +546,7 @@ for port in 0 1; do + fi + + echo $port > $NSIM_DEV_SYS/new_port ++ NSIM_NETDEV=`get_netdev_name old_netdevs` + ifconfig $NSIM_NETDEV up + + overflow_table0 "destroy NIC" +@@ -572,6 +576,7 @@ for port in 0 1; do + fi + + echo $port > $NSIM_DEV_SYS/new_port ++ NSIM_NETDEV=`get_netdev_name old_netdevs` + ifconfig $NSIM_NETDEV up + + msg="create VxLANs v6" +@@ -632,6 +637,7 @@ for port in 0 1; do + fi + + echo $port > $NSIM_DEV_SYS/new_port ++ NSIM_NETDEV=`get_netdev_name old_netdevs` + ifconfig $NSIM_NETDEV up + + echo 110 > $NSIM_DEV_DFS/ports/$port/udp_ports_inject_error +@@ -687,6 +693,7 @@ for port in 0 1; do + fi + + echo $port > $NSIM_DEV_SYS/new_port ++ 
NSIM_NETDEV=`get_netdev_name old_netdevs` + ifconfig $NSIM_NETDEV up + + msg="create VxLANs v6" +@@ -746,6 +753,7 @@ for port in 0 1; do + fi + + echo $port > $NSIM_DEV_SYS/new_port ++ NSIM_NETDEV=`get_netdev_name old_netdevs` + ifconfig $NSIM_NETDEV up + + msg="create VxLANs v6" +@@ -876,6 +884,7 @@ msg="re-add a port" + + echo 2 > $NSIM_DEV_SYS/del_port + echo 2 > $NSIM_DEV_SYS/new_port ++NSIM_NETDEV=`get_netdev_name old_netdevs` + check_tables + + msg="replace VxLAN in overflow table" +-- +2.43.0 + diff --git a/queue-6.6/series b/queue-6.6/series index f30f2c6324a..44c98c2e35c 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -164,3 +164,67 @@ revert-drm-amd-enable-pcie-pme-from-d3.patch cifs-fix-lock-ordering-while-disabling-multichannel.patch cifs-fix-a-pending-undercount-of-srv_count.patch cifs-after-disabling-multichannel-mark-tcon-for-reconnect.patch +sunrpc-use-request-size-to-initialize-bio_vec-in-svc.patch +wifi-mac80211-fix-potential-sta-link-leak.patch +btrfs-scrub-avoid-use-after-free-when-chunk-length-i.patch +net-smc-fix-illegal-rmb_desc-access-in-smc-d-connect.patch +selftests-bonding-increase-timeout-to-1200s.patch +tcp-make-sure-init-the-accept_queue-s-spinlocks-once.patch +bnxt_en-wait-for-flr-to-complete-during-probe.patch +bnxt_en-prevent-kernel-warning-when-running-offline-.patch +vlan-skip-nested-type-that-is-not-ifla_vlan_qos_mapp.patch +llc-make-llc_ui_sendmsg-more-robust-against-bonding-.patch +llc-drop-support-for-eth_p_tr_802_2.patch +udp-fix-busy-polling.patch +net-fix-removing-a-namespace-with-conflicting-altnam.patch +tun-fix-missing-dropped-counter-in-tun_xdp_act.patch +tun-add-missing-rx-stats-accounting-in-tun_xdp_act.patch +net-micrel-fix-ptp-frame-parsing-for-lan8814.patch +net-rds-fix-ubsan-array-index-out-of-bounds-in-rds_c.patch +netfs-fscache-prevent-oops-in-fscache_put_cache.patch +tracing-ensure-visibility-when-inserting-an-element-.patch +afs-hide-silly-rename-files-from-userspace.patch 
+tcp-add-memory-barrier-to-tcp_push.patch +selftest-don-t-reuse-port-for-so_incoming_cpu-test.patch +netlink-fix-potential-sleeping-issue-in-mqueue_flush.patch +ipv6-init-the-accept_queue-s-spinlocks-in-inet6_crea.patch +selftests-fill-in-some-missing-configs-for-net.patch +net-sched-flower-fix-chain-template-offload.patch +net-mlx5e-fix-operation-precedence-bug-in-port-times.patch +net-mlx5e-fix-peer-flow-lists-handling.patch +net-mlx5-fix-a-warn-upon-a-callback-command-failure.patch +net-mlx5-bridge-enable-mcast-in-smfs-steering-mode.patch +net-mlx5-bridge-fix-multicast-packets-sent-to-uplink.patch +net-mlx5-dr-use-the-right-gvmi-number-for-drop-actio.patch +net-mlx5-dr-can-t-go-to-uplink-vport-on-rx-rule.patch +net-mlx5-use-mlx5-device-constant-for-selecting-cq-p.patch +net-mlx5e-allow-software-parsing-when-ipsec-crypto-i.patch +net-mlx5e-ignore-ipsec-replay-window-values-on-sende.patch +net-mlx5e-fix-a-double-free-in-arfs_create_groups.patch +net-mlx5e-fix-a-potential-double-free-in-fs_any_crea.patch +rcu-defer-rcu-kthreads-wakeup-when-cpu-is-dying.patch +netfilter-nft_limit-reject-configurations-that-cause.patch +netfilter-nf_tables-restrict-anonymous-set-and-map-n.patch +netfilter-nf_tables-validate-nfproto_-family.patch +net-stmmac-wait-a-bit-for-the-reset-to-take-effect.patch +net-mvpp2-clear-bm-pool-before-initialization.patch +selftests-net-fix-rps_default_mask-with-32-cpus.patch +selftests-netdevsim-fix-the-udp_tunnel_nic-test.patch +xsk-recycle-buffer-in-case-rx-queue-was-full.patch +xsk-make-xsk_buff_pool-responsible-for-clearing-xdp_.patch +bpf-propagate-modified-uaddrlen-from-cgroup-sockaddr.patch +bpf-add-bpf_sock_addr_set_sun_path-to-allow-writing-.patch +xsk-fix-usage-of-multi-buffer-bpf-helpers-for-zc-xdp.patch +ice-work-on-pre-xdp-prog-frag-count.patch +i40e-handle-multi-buffer-packets-that-are-shrunk-by-.patch +ice-remove-redundant-xdp_rxq_info-registration.patch +intel-xsk-initialize-skb_frag_t-bv_offset-in-zc-driv.patch 
+ice-update-xdp_rxq_info-frag_size-for-zc-enabled-rx-.patch +xdp-reflect-tail-increase-for-mem_type_xsk_buff_pool.patch +i40e-set-xdp_rxq_info-frag_size.patch +i40e-update-xdp_rxq_info-frag_size-for-zc-enabled-rx.patch +fjes-fix-memleaks-in-fjes_hw_setup.patch +selftests-bonding-do-not-test-arp-ns-target-with-mod.patch +net-fec-fix-the-unhandled-context-fault-from-smmu.patch +tsnep-remove-fcs-for-xdp-data-path.patch +tsnep-fix-xdp_ring_need_wakeup-for-empty-fill-ring.patch diff --git a/queue-6.6/sunrpc-use-request-size-to-initialize-bio_vec-in-svc.patch b/queue-6.6/sunrpc-use-request-size-to-initialize-bio_vec-in-svc.patch new file mode 100644 index 00000000000..6cd6c40e046 --- /dev/null +++ b/queue-6.6/sunrpc-use-request-size-to-initialize-bio_vec-in-svc.patch @@ -0,0 +1,42 @@ +From 5136798a60d688d5a0b2d542aeb595c24725482d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Jan 2024 22:06:28 +0100 +Subject: SUNRPC: use request size to initialize bio_vec in svc_udp_sendto() + +From: Lucas Stach + +[ Upstream commit 1d9cabe2817edd215779dc9c2fe5e7ab9aac0704 ] + +Use the proper size when setting up the bio_vec, as otherwise only +zero-length UDP packets will be sent. + +Fixes: baabf59c2414 ("SUNRPC: Convert svc_udp_sendto() to use the per-socket bio_vec array") +Signed-off-by: Lucas Stach +Signed-off-by: Chuck Lever +Signed-off-by: Sasha Levin +--- + net/sunrpc/svcsock.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c +index 998687421fa6..e0ce4276274b 100644 +--- a/net/sunrpc/svcsock.c ++++ b/net/sunrpc/svcsock.c +@@ -717,12 +717,12 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) + ARRAY_SIZE(rqstp->rq_bvec), xdr); + + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, +- count, 0); ++ count, rqstp->rq_res.len); + err = sock_sendmsg(svsk->sk_sock, &msg); + if (err == -ECONNREFUSED) { + /* ICMP error on earlier request. 
*/ + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, +- count, 0); ++ count, rqstp->rq_res.len); + err = sock_sendmsg(svsk->sk_sock, &msg); + } + +-- +2.43.0 + diff --git a/queue-6.6/tcp-add-memory-barrier-to-tcp_push.patch b/queue-6.6/tcp-add-memory-barrier-to-tcp_push.patch new file mode 100644 index 00000000000..9daacedddec --- /dev/null +++ b/queue-6.6/tcp-add-memory-barrier-to-tcp_push.patch @@ -0,0 +1,101 @@ +From 7c6a8c1eb5c73240018b9b8701947e3e098d71d1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 11:01:33 -0800 +Subject: tcp: Add memory barrier to tcp_push() + +From: Salvatore Dipietro + +[ Upstream commit 7267e8dcad6b2f9fce05a6a06335d7040acbc2b6 ] + +On CPUs with weak memory models, reads and updates performed by tcp_push +to the sk variables can get reordered leaving the socket throttled when +it should not. The tasklet running tcp_wfree() may also not observe the +memory updates in time and will skip flushing any packets throttled by +tcp_push(), delaying the sending. This can pathologically cause 40ms +extra latency due to bad interactions with delayed acks. + +Adding a memory barrier in tcp_push removes the bug, similarly to the +previous commit bf06200e732d ("tcp: tsq: fix nonagle handling"). +smp_mb__after_atomic() is used to not incur in unnecessary overhead +on x86 since not affected. 
+ +Patch has been tested using an AWS c7g.2xlarge instance with Ubuntu +22.04 and Apache Tomcat 9.0.83 running the basic servlet below: + +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +public class HelloWorldServlet extends HttpServlet { + @Override + protected void doGet(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + response.setContentType("text/html;charset=utf-8"); + OutputStreamWriter osw = new OutputStreamWriter(response.getOutputStream(),"UTF-8"); + String s = "a".repeat(3096); + osw.write(s,0,s.length()); + osw.flush(); + } +} + +Load was applied using wrk2 (https://github.com/kinvolk/wrk2) from an AWS +c6i.8xlarge instance. Before the patch an additional 40ms latency from P99.99+ +values is observed while, with the patch, the extra latency disappears. + +No patch and tcp_autocorking=1 +./wrk -t32 -c128 -d40s --latency -R10000 http://172.31.60.173:8080/hello/hello + ... + 50.000% 0.91ms + 75.000% 1.13ms + 90.000% 1.46ms + 99.000% 1.74ms + 99.900% 1.89ms + 99.990% 41.95ms <<< 40+ ms extra latency + 99.999% 48.32ms +100.000% 48.96ms + +With patch and tcp_autocorking=1 +./wrk -t32 -c128 -d40s --latency -R10000 http://172.31.60.173:8080/hello/hello + ... + 50.000% 0.90ms + 75.000% 1.13ms + 90.000% 1.45ms + 99.000% 1.72ms + 99.900% 1.83ms + 99.990% 2.11ms <<< no 40+ ms extra latency + 99.999% 2.53ms +100.000% 2.62ms + +Patch has been also tested on x86 (m7i.2xlarge instance) which it is not +affected by this issue and the patch doesn't introduce any additional +delay. 
+ +Fixes: 7aa5470c2c09 ("tcp: tsq: move tsq_flags close to sk_wmem_alloc") +Signed-off-by: Salvatore Dipietro +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/r/20240119190133.43698-1-dipiets@amazon.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + net/ipv4/tcp.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index f124f6c63915..fb417aee86e6 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -722,6 +722,7 @@ void tcp_push(struct sock *sk, int flags, int mss_now, + if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); + set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); ++ smp_mb__after_atomic(); + } + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED. +-- +2.43.0 + diff --git a/queue-6.6/tcp-make-sure-init-the-accept_queue-s-spinlocks-once.patch b/queue-6.6/tcp-make-sure-init-the-accept_queue-s-spinlocks-once.patch new file mode 100644 index 00000000000..f8ad494f6e3 --- /dev/null +++ b/queue-6.6/tcp-make-sure-init-the-accept_queue-s-spinlocks-once.patch @@ -0,0 +1,170 @@ +From b9cc1f57bc1b392c35cd428d5ca29d92a9e28fd8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 09:20:19 +0800 +Subject: tcp: make sure init the accept_queue's spinlocks once + +From: Zhengchao Shao + +[ Upstream commit 198bc90e0e734e5f98c3d2833e8390cac3df61b2 ] + +When I run syz's reproduction C program locally, it causes the following +issue: +pvqspinlock: lock 0xffff9d181cd5c660 has corrupted value 0x0! 
+WARNING: CPU: 19 PID: 21160 at __pv_queued_spin_unlock_slowpath (kernel/locking/qspinlock_paravirt.h:508) +Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 +RIP: 0010:__pv_queued_spin_unlock_slowpath (kernel/locking/qspinlock_paravirt.h:508) +Code: 73 56 3a ff 90 c3 cc cc cc cc 8b 05 bb 1f 48 01 85 c0 74 05 c3 cc cc cc cc 8b 17 48 89 fe 48 c7 c7 +30 20 ce 8f e8 ad 56 42 ff <0f> 0b c3 cc cc cc cc 0f 0b 0f 1f 40 00 90 90 90 90 90 90 90 90 90 +RSP: 0018:ffffa8d200604cb8 EFLAGS: 00010282 +RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff9d1ef60e0908 +RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff9d1ef60e0900 +RBP: ffff9d181cd5c280 R08: 0000000000000000 R09: 00000000ffff7fff +R10: ffffa8d200604b68 R11: ffffffff907dcdc8 R12: 0000000000000000 +R13: ffff9d181cd5c660 R14: ffff9d1813a3f330 R15: 0000000000001000 +FS: 00007fa110184640(0000) GS:ffff9d1ef60c0000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 0000000020000000 CR3: 000000011f65e000 CR4: 00000000000006f0 +Call Trace: + + _raw_spin_unlock (kernel/locking/spinlock.c:186) + inet_csk_reqsk_queue_add (net/ipv4/inet_connection_sock.c:1321) + inet_csk_complete_hashdance (net/ipv4/inet_connection_sock.c:1358) + tcp_check_req (net/ipv4/tcp_minisocks.c:868) + tcp_v4_rcv (net/ipv4/tcp_ipv4.c:2260) + ip_protocol_deliver_rcu (net/ipv4/ip_input.c:205) + ip_local_deliver_finish (net/ipv4/ip_input.c:234) + __netif_receive_skb_one_core (net/core/dev.c:5529) + process_backlog (./include/linux/rcupdate.h:779) + __napi_poll (net/core/dev.c:6533) + net_rx_action (net/core/dev.c:6604) + __do_softirq (./arch/x86/include/asm/jump_label.h:27) + do_softirq (kernel/softirq.c:454 kernel/softirq.c:441) + + + __local_bh_enable_ip (kernel/softirq.c:381) + __dev_queue_xmit (net/core/dev.c:4374) + ip_finish_output2 (./include/net/neighbour.h:540 net/ipv4/ip_output.c:235) + __ip_queue_xmit (net/ipv4/ip_output.c:535) + __tcp_transmit_skb (net/ipv4/tcp_output.c:1462) + tcp_rcv_synsent_state_process 
(net/ipv4/tcp_input.c:6469) + tcp_rcv_state_process (net/ipv4/tcp_input.c:6657) + tcp_v4_do_rcv (net/ipv4/tcp_ipv4.c:1929) + __release_sock (./include/net/sock.h:1121 net/core/sock.c:2968) + release_sock (net/core/sock.c:3536) + inet_wait_for_connect (net/ipv4/af_inet.c:609) + __inet_stream_connect (net/ipv4/af_inet.c:702) + inet_stream_connect (net/ipv4/af_inet.c:748) + __sys_connect (./include/linux/file.h:45 net/socket.c:2064) + __x64_sys_connect (net/socket.c:2073 net/socket.c:2070 net/socket.c:2070) + do_syscall_64 (arch/x86/entry/common.c:51 arch/x86/entry/common.c:82) + entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) + RIP: 0033:0x7fa10ff05a3d + Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 + c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ab a3 0e 00 f7 d8 64 89 01 48 + RSP: 002b:00007fa110183de8 EFLAGS: 00000202 ORIG_RAX: 000000000000002a + RAX: ffffffffffffffda RBX: 0000000020000054 RCX: 00007fa10ff05a3d + RDX: 000000000000001c RSI: 0000000020000040 RDI: 0000000000000003 + RBP: 00007fa110183e20 R08: 0000000000000000 R09: 0000000000000000 + R10: 0000000000000000 R11: 0000000000000202 R12: 00007fa110184640 + R13: 0000000000000000 R14: 00007fa10fe8b060 R15: 00007fff73e23b20 + + +The issue triggering process is analyzed as follows: +Thread A Thread B +tcp_v4_rcv //receive ack TCP packet inet_shutdown + tcp_check_req tcp_disconnect //disconnect sock + ... tcp_set_state(sk, TCP_CLOSE) + inet_csk_complete_hashdance ... + inet_csk_reqsk_queue_add inet_listen //start listen + spin_lock(&queue->rskq_lock) inet_csk_listen_start + ... reqsk_queue_alloc + ... spin_lock_init + spin_unlock(&queue->rskq_lock) //warning + +When the socket receives the ACK packet during the three-way handshake, +it will hold spinlock. And then the user actively shutdowns the socket +and listens to the socket immediately, the spinlock will be initialized. 
+When the socket is going to release the spinlock, a warning is generated. +Also the same issue to fastopenq.lock. + +Move init spinlock to inet_create and inet_accept to make sure init the +accept_queue's spinlocks once. + +Fixes: fff1f3001cc5 ("tcp: add a spinlock to protect struct request_sock_queue") +Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path") +Reported-by: Ming Shu +Signed-off-by: Zhengchao Shao +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/r/20240118012019.1751966-1-shaozhengchao@huawei.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + include/net/inet_connection_sock.h | 8 ++++++++ + net/core/request_sock.c | 3 --- + net/ipv4/af_inet.c | 3 +++ + net/ipv4/inet_connection_sock.c | 4 ++++ + 4 files changed, 15 insertions(+), 3 deletions(-) + +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index 5d2fcc137b88..01a73bf74fa1 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -347,4 +347,12 @@ static inline bool inet_csk_has_ulp(const struct sock *sk) + return inet_test_bit(IS_ICSK, sk) && !!inet_csk(sk)->icsk_ulp_ops; + } + ++static inline void inet_init_csk_locks(struct sock *sk) ++{ ++ struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ spin_lock_init(&icsk->icsk_accept_queue.rskq_lock); ++ spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock); ++} ++ + #endif /* _INET_CONNECTION_SOCK_H */ +diff --git a/net/core/request_sock.c b/net/core/request_sock.c +index f35c2e998406..63de5c635842 100644 +--- a/net/core/request_sock.c ++++ b/net/core/request_sock.c +@@ -33,9 +33,6 @@ + + void reqsk_queue_alloc(struct request_sock_queue *queue) + { +- spin_lock_init(&queue->rskq_lock); +- +- spin_lock_init(&queue->fastopenq.lock); + queue->fastopenq.rskq_rst_head = NULL; + queue->fastopenq.rskq_rst_tail = NULL; + queue->fastopenq.qlen = 0; +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c +index b0a5de1303b5..b739ddbef0f0 100644 
+--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -330,6 +330,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, + if (INET_PROTOSW_REUSE & answer_flags) + sk->sk_reuse = SK_CAN_REUSE; + ++ if (INET_PROTOSW_ICSK & answer_flags) ++ inet_init_csk_locks(sk); ++ + inet = inet_sk(sk); + inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); + +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index 394a498c2823..762817d6c8d7 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -730,6 +730,10 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) + } + if (req) + reqsk_put(req); ++ ++ if (newsk) ++ inet_init_csk_locks(newsk); ++ + return newsk; + out_err: + newsk = NULL; +-- +2.43.0 + diff --git a/queue-6.6/tracing-ensure-visibility-when-inserting-an-element-.patch b/queue-6.6/tracing-ensure-visibility-when-inserting-an-element-.patch new file mode 100644 index 00000000000..d833d02dd78 --- /dev/null +++ b/queue-6.6/tracing-ensure-visibility-when-inserting-an-element-.patch @@ -0,0 +1,129 @@ +From 72a7126c4a10804959674d8718962280a7d696ad Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 16:09:28 +0100 +Subject: tracing: Ensure visibility when inserting an element into tracing_map + +From: Petr Pavlu + +[ Upstream commit 2b44760609e9eaafc9d234a6883d042fc21132a7 ] + +Running the following two commands in parallel on a multi-processor +AArch64 machine can sporadically produce an unexpected warning about +duplicate histogram entries: + + $ while true; do + echo hist:key=id.syscall:val=hitcount > \ + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger + cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist + sleep 0.001 + done + $ stress-ng --sysbadaddr $(nproc) + +The warning looks as follows: + +[ 2911.172474] ------------[ cut here ]------------ +[ 2911.173111] Duplicates detected: 1 +[ 2911.173574] 
WARNING: CPU: 2 PID: 12247 at kernel/trace/tracing_map.c:983 tracing_map_sort_entries+0x3e0/0x408 +[ 2911.174702] Modules linked in: iscsi_ibft(E) iscsi_boot_sysfs(E) rfkill(E) af_packet(E) nls_iso8859_1(E) nls_cp437(E) vfat(E) fat(E) ena(E) tiny_power_button(E) qemu_fw_cfg(E) button(E) fuse(E) efi_pstore(E) ip_tables(E) x_tables(E) xfs(E) libcrc32c(E) aes_ce_blk(E) aes_ce_cipher(E) crct10dif_ce(E) polyval_ce(E) polyval_generic(E) ghash_ce(E) gf128mul(E) sm4_ce_gcm(E) sm4_ce_ccm(E) sm4_ce(E) sm4_ce_cipher(E) sm4(E) sm3_ce(E) sm3(E) sha3_ce(E) sha512_ce(E) sha512_arm64(E) sha2_ce(E) sha256_arm64(E) nvme(E) sha1_ce(E) nvme_core(E) nvme_auth(E) t10_pi(E) sg(E) scsi_mod(E) scsi_common(E) efivarfs(E) +[ 2911.174738] Unloaded tainted modules: cppc_cpufreq(E):1 +[ 2911.180985] CPU: 2 PID: 12247 Comm: cat Kdump: loaded Tainted: G E 6.7.0-default #2 1b58bbb22c97e4399dc09f92d309344f69c44a01 +[ 2911.182398] Hardware name: Amazon EC2 c7g.8xlarge/, BIOS 1.0 11/1/2018 +[ 2911.183208] pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) +[ 2911.184038] pc : tracing_map_sort_entries+0x3e0/0x408 +[ 2911.184667] lr : tracing_map_sort_entries+0x3e0/0x408 +[ 2911.185310] sp : ffff8000a1513900 +[ 2911.185750] x29: ffff8000a1513900 x28: ffff0003f272fe80 x27: 0000000000000001 +[ 2911.186600] x26: ffff0003f272fe80 x25: 0000000000000030 x24: 0000000000000008 +[ 2911.187458] x23: ffff0003c5788000 x22: ffff0003c16710c8 x21: ffff80008017f180 +[ 2911.188310] x20: ffff80008017f000 x19: ffff80008017f180 x18: ffffffffffffffff +[ 2911.189160] x17: 0000000000000000 x16: 0000000000000000 x15: ffff8000a15134b8 +[ 2911.190015] x14: 0000000000000000 x13: 205d373432323154 x12: 5b5d313131333731 +[ 2911.190844] x11: 00000000fffeffff x10: 00000000fffeffff x9 : ffffd1b78274a13c +[ 2911.191716] x8 : 000000000017ffe8 x7 : c0000000fffeffff x6 : 000000000057ffa8 +[ 2911.192554] x5 : ffff0012f6c24ec0 x4 : 0000000000000000 x3 : ffff2e5b72b5d000 +[ 2911.193404] x2 : 0000000000000000 x1 : 
0000000000000000 x0 : ffff0003ff254480 +[ 2911.194259] Call trace: +[ 2911.194626] tracing_map_sort_entries+0x3e0/0x408 +[ 2911.195220] hist_show+0x124/0x800 +[ 2911.195692] seq_read_iter+0x1d4/0x4e8 +[ 2911.196193] seq_read+0xe8/0x138 +[ 2911.196638] vfs_read+0xc8/0x300 +[ 2911.197078] ksys_read+0x70/0x108 +[ 2911.197534] __arm64_sys_read+0x24/0x38 +[ 2911.198046] invoke_syscall+0x78/0x108 +[ 2911.198553] el0_svc_common.constprop.0+0xd0/0xf8 +[ 2911.199157] do_el0_svc+0x28/0x40 +[ 2911.199613] el0_svc+0x40/0x178 +[ 2911.200048] el0t_64_sync_handler+0x13c/0x158 +[ 2911.200621] el0t_64_sync+0x1a8/0x1b0 +[ 2911.201115] ---[ end trace 0000000000000000 ]--- + +The problem appears to be caused by CPU reordering of writes issued from +__tracing_map_insert(). + +The check for the presence of an element with a given key in this +function is: + + val = READ_ONCE(entry->val); + if (val && keys_match(key, val->key, map->key_size)) ... + +The write of a new entry is: + + elt = get_free_elt(map); + memcpy(elt->key, key, map->key_size); + entry->val = elt; + +The "memcpy(elt->key, key, map->key_size);" and "entry->val = elt;" +stores may become visible in the reversed order on another CPU. This +second CPU might then incorrectly determine that a new key doesn't match +an already present val->key and subsequently insert a new element, +resulting in a duplicate. + +Fix the problem by adding a write barrier between +"memcpy(elt->key, key, map->key_size);" and "entry->val = elt;", and for +good measure, also use WRITE_ONCE(entry->val, elt) for publishing the +element. The sequence pairs with the mentioned "READ_ONCE(entry->val);" +and the "val->key" check which has an address dependency. + +The barrier is placed on a path executed when adding an element for +a new key. Subsequent updates targeting the same key remain unaffected. 
+ +From the user's perspective, the issue was introduced by commit +c193707dde77 ("tracing: Remove code which merges duplicates"), which +followed commit cbf4100efb8f ("tracing: Add support to detect and avoid +duplicates"). The previous code operated differently; it inherently +expected potential races which result in duplicates but merged them +later when they occurred. + +Link: https://lore.kernel.org/linux-trace-kernel/20240122150928.27725-1-petr.pavlu@suse.com + +Fixes: c193707dde77 ("tracing: Remove code which merges duplicates") +Signed-off-by: Petr Pavlu +Acked-by: Tom Zanussi +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Sasha Levin +--- + kernel/trace/tracing_map.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c +index c774e560f2f9..a4dcf0f24352 100644 +--- a/kernel/trace/tracing_map.c ++++ b/kernel/trace/tracing_map.c +@@ -574,7 +574,12 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) + } + + memcpy(elt->key, key, map->key_size); +- entry->val = elt; ++ /* ++ * Ensure the initialization is visible and ++ * publish the elt. ++ */ ++ smp_wmb(); ++ WRITE_ONCE(entry->val, elt); + atomic64_inc(&map->hits); + + return entry->val; +-- +2.43.0 + diff --git a/queue-6.6/tsnep-fix-xdp_ring_need_wakeup-for-empty-fill-ring.patch b/queue-6.6/tsnep-fix-xdp_ring_need_wakeup-for-empty-fill-ring.patch new file mode 100644 index 00000000000..ada3524f96c --- /dev/null +++ b/queue-6.6/tsnep-fix-xdp_ring_need_wakeup-for-empty-fill-ring.patch @@ -0,0 +1,52 @@ +From 43d7a55fe424e183b51503d870bf52d919e36ee3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Jan 2024 21:09:18 +0100 +Subject: tsnep: Fix XDP_RING_NEED_WAKEUP for empty fill ring + +From: Gerhard Engleder + +[ Upstream commit 9a91c05f4bd6f6bdd6b8f90445e0da92e3ac956c ] + +The fill ring of the XDP socket may contain not enough buffers to +completely fill the RX queue during socket creation.
In this case the +flag XDP_RING_NEED_WAKEUP is not set as this flag is only set if the RX +queue is not completely filled during polling. + +Set XDP_RING_NEED_WAKEUP flag also if RX queue is not completely filled +during XDP socket creation. + +Fixes: 3fc2333933fd ("tsnep: Add XDP socket zero-copy RX support") +Signed-off-by: Gerhard Engleder +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/engleder/tsnep_main.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c +index 9fea97671f4b..08e113e785a7 100644 +--- a/drivers/net/ethernet/engleder/tsnep_main.c ++++ b/drivers/net/ethernet/engleder/tsnep_main.c +@@ -1711,6 +1711,19 @@ static void tsnep_rx_reopen_xsk(struct tsnep_rx *rx) + allocated--; + } + } ++ ++ /* set need wakeup flag immediately if ring is not filled completely, ++ * first polling would be too late as need wakeup signalisation would ++ * be delayed for an indefinite time ++ */ ++ if (xsk_uses_need_wakeup(rx->xsk_pool)) { ++ int desc_available = tsnep_rx_desc_available(rx); ++ ++ if (desc_available) ++ xsk_set_rx_need_wakeup(rx->xsk_pool); ++ else ++ xsk_clear_rx_need_wakeup(rx->xsk_pool); ++ } + } + + static bool tsnep_pending(struct tsnep_queue *queue) +-- +2.43.0 + diff --git a/queue-6.6/tsnep-remove-fcs-for-xdp-data-path.patch b/queue-6.6/tsnep-remove-fcs-for-xdp-data-path.patch new file mode 100644 index 00000000000..13b52f57581 --- /dev/null +++ b/queue-6.6/tsnep-remove-fcs-for-xdp-data-path.patch @@ -0,0 +1,49 @@ +From bb256d5a97bae3dcb46d7b662391f874fe788a98 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Jan 2024 21:09:17 +0100 +Subject: tsnep: Remove FCS for XDP data path + +From: Gerhard Engleder + +[ Upstream commit 50bad6f797d4d501c5ef416a6f92e1912ab5aa8b ] + +The RX data buffer includes the FCS. The FCS is already stripped for the +normal data path. 
But for the XDP data path the FCS is included and +acts like additional/useless data. + +Remove the FCS from the RX data buffer also for XDP. + +Fixes: 65b28c810035 ("tsnep: Add XDP RX support") +Fixes: 3fc2333933fd ("tsnep: Add XDP socket zero-copy RX support") +Signed-off-by: Gerhard Engleder +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/engleder/tsnep_main.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c +index 38da2d6c250e..9fea97671f4b 100644 +--- a/drivers/net/ethernet/engleder/tsnep_main.c ++++ b/drivers/net/ethernet/engleder/tsnep_main.c +@@ -1434,7 +1434,7 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi, + + xdp_prepare_buff(&xdp, page_address(entry->page), + XDP_PACKET_HEADROOM + TSNEP_RX_INLINE_METADATA_SIZE, +- length, false); ++ length - ETH_FCS_LEN, false); + + consume = tsnep_xdp_run_prog(rx, prog, &xdp, + &xdp_status, tx_nq, tx); +@@ -1517,7 +1517,7 @@ static int tsnep_rx_poll_zc(struct tsnep_rx *rx, struct napi_struct *napi, + prefetch(entry->xdp->data); + length = __le32_to_cpu(entry->desc_wb->properties) & + TSNEP_DESC_LENGTH_MASK; +- xsk_buff_set_size(entry->xdp, length); ++ xsk_buff_set_size(entry->xdp, length - ETH_FCS_LEN); + xsk_buff_dma_sync_for_cpu(entry->xdp, rx->xsk_pool); + + /* RX metadata with timestamps is in front of actual data, +-- +2.43.0 + diff --git a/queue-6.6/tun-add-missing-rx-stats-accounting-in-tun_xdp_act.patch b/queue-6.6/tun-add-missing-rx-stats-accounting-in-tun_xdp_act.patch new file mode 100644 index 00000000000..a57aa535ce7 --- /dev/null +++ b/queue-6.6/tun-add-missing-rx-stats-accounting-in-tun_xdp_act.patch @@ -0,0 +1,49 @@ +From f5d4f981502d41e1176c57c6fd78b6125f849ff7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 18:22:56 +0800 +Subject: tun: add missing rx stats accounting in tun_xdp_act + +From: Yunjian Wang + 
+[ Upstream commit f1084c427f55d573fcd5688d9ba7b31b78019716 ] + +The TUN can be used as vhost-net backend, and it is necessary to +count the packets transmitted from TUN to vhost-net/virtio-net. +However, there are some places in the receive path that were not +taken into account when using XDP. It would be beneficial to also +include new accounting for successfully received bytes using +dev_sw_netstats_rx_add. + +Fixes: 761876c857cb ("tap: XDP support") +Signed-off-by: Yunjian Wang +Reviewed-by: Willem de Bruijn +Acked-by: Jason Wang +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/tun.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/drivers/net/tun.c b/drivers/net/tun.c +index 237fef557ba5..4a4f8c8e79fa 100644 +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -1634,6 +1634,7 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, + dev_core_stats_rx_dropped_inc(tun->dev); + return err; + } ++ dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); + break; + case XDP_TX: + err = tun_xdp_tx(tun->dev, xdp); +@@ -1641,6 +1642,7 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, + dev_core_stats_rx_dropped_inc(tun->dev); + return err; + } ++ dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); + break; + case XDP_PASS: + break; +-- +2.43.0 + diff --git a/queue-6.6/tun-fix-missing-dropped-counter-in-tun_xdp_act.patch b/queue-6.6/tun-fix-missing-dropped-counter-in-tun_xdp_act.patch new file mode 100644 index 00000000000..72010265237 --- /dev/null +++ b/queue-6.6/tun-fix-missing-dropped-counter-in-tun_xdp_act.patch @@ -0,0 +1,52 @@ +From ba746249f1664840569f703c743616bbc33f83ed Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 18:22:35 +0800 +Subject: tun: fix missing dropped counter in tun_xdp_act + +From: Yunjian Wang + +[ Upstream commit 5744ba05e7c4bff8fec133dd0f9e51ddffba92f5 ] + +The commit 8ae1aff0b331 ("tuntap: split out XDP logic") includes 
+dropped counter for XDP_DROP, XDP_ABORTED, and invalid XDP actions. +Unfortunately, that commit missed the dropped counter when error +occurs during XDP_TX and XDP_REDIRECT actions. This patch fixes +this issue. + +Fixes: 8ae1aff0b331 ("tuntap: split out XDP logic") +Signed-off-by: Yunjian Wang +Reviewed-by: Willem de Bruijn +Acked-by: Jason Wang +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/tun.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/tun.c b/drivers/net/tun.c +index afa5497f7c35..237fef557ba5 100644 +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -1630,13 +1630,17 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, + switch (act) { + case XDP_REDIRECT: + err = xdp_do_redirect(tun->dev, xdp, xdp_prog); +- if (err) ++ if (err) { ++ dev_core_stats_rx_dropped_inc(tun->dev); + return err; ++ } + break; + case XDP_TX: + err = tun_xdp_tx(tun->dev, xdp); +- if (err < 0) ++ if (err < 0) { ++ dev_core_stats_rx_dropped_inc(tun->dev); + return err; ++ } + break; + case XDP_PASS: + break; +-- +2.43.0 + diff --git a/queue-6.6/udp-fix-busy-polling.patch b/queue-6.6/udp-fix-busy-polling.patch new file mode 100644 index 00000000000..51ad36b0726 --- /dev/null +++ b/queue-6.6/udp-fix-busy-polling.patch @@ -0,0 +1,134 @@ +From ace3a097e86597bd5f6569778b648ae2dc67ca42 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 20:17:49 +0000 +Subject: udp: fix busy polling + +From: Eric Dumazet + +[ Upstream commit a54d51fb2dfb846aedf3751af501e9688db447f5 ] + +Generic sk_busy_loop_end() only looks at sk->sk_receive_queue +for presence of packets. + +Problem is that for UDP sockets after blamed commit, some packets +could be present in another queue: udp_sk(sk)->reader_queue + +In some cases, a busy poller could spin until timeout expiration, +even if some packets are available in udp_sk(sk)->reader_queue. 
+ +v3: - make sk_busy_loop_end() nicer (Willem) + +v2: - add a READ_ONCE(sk->sk_family) in sk_is_inet() to avoid KCSAN splats. + - add a sk_is_inet() check in sk_is_udp() (Willem feedback) + - add a sk_is_inet() check in sk_is_tcp(). + +Fixes: 2276f58ac589 ("udp: use a separate rx queue for packet reception") +Signed-off-by: Eric Dumazet +Reviewed-by: Paolo Abeni +Reviewed-by: Willem de Bruijn +Reviewed-by: Kuniyuki Iwashima +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + include/linux/skmsg.h | 6 ------ + include/net/inet_sock.h | 5 ----- + include/net/sock.h | 18 +++++++++++++++++- + net/core/sock.c | 11 +++++++++-- + 4 files changed, 26 insertions(+), 14 deletions(-) + +diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h +index c953b8c0d2f4..bd4418377bac 100644 +--- a/include/linux/skmsg.h ++++ b/include/linux/skmsg.h +@@ -500,12 +500,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) + return !!psock->saved_data_ready; + } + +-static inline bool sk_is_udp(const struct sock *sk) +-{ +- return sk->sk_type == SOCK_DGRAM && +- sk->sk_protocol == IPPROTO_UDP; +-} +- + #if IS_ENABLED(CONFIG_NET_SOCK_MSG) + + #define BPF_F_STRPARSER (1UL << 1) +diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h +index 2de0e4d4a027..2790ba58ffe5 100644 +--- a/include/net/inet_sock.h ++++ b/include/net/inet_sock.h +@@ -301,11 +301,6 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) + #define inet_assign_bit(nr, sk, val) \ + assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) + +-static inline bool sk_is_inet(struct sock *sk) +-{ +- return sk->sk_family == AF_INET || sk->sk_family == AF_INET6; +-} +- + /** + * sk_to_full_sk - Access to a full socket + * @sk: pointer to a socket +diff --git a/include/net/sock.h b/include/net/sock.h +index 70a771d96467..e70c903b04f3 100644 +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -2793,9 +2793,25 @@ static inline void skb_setup_tx_timestamp(struct 
sk_buff *skb, __u16 tsflags) + &skb_shinfo(skb)->tskey); + } + ++static inline bool sk_is_inet(const struct sock *sk) ++{ ++ int family = READ_ONCE(sk->sk_family); ++ ++ return family == AF_INET || family == AF_INET6; ++} ++ + static inline bool sk_is_tcp(const struct sock *sk) + { +- return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP; ++ return sk_is_inet(sk) && ++ sk->sk_type == SOCK_STREAM && ++ sk->sk_protocol == IPPROTO_TCP; ++} ++ ++static inline bool sk_is_udp(const struct sock *sk) ++{ ++ return sk_is_inet(sk) && ++ sk->sk_type == SOCK_DGRAM && ++ sk->sk_protocol == IPPROTO_UDP; + } + + static inline bool sk_is_stream_unix(const struct sock *sk) +diff --git a/net/core/sock.c b/net/core/sock.c +index 5cd21e699f2d..383e30fe79f4 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -107,6 +107,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -4136,8 +4137,14 @@ bool sk_busy_loop_end(void *p, unsigned long start_time) + { + struct sock *sk = p; + +- return !skb_queue_empty_lockless(&sk->sk_receive_queue) || +- sk_busy_loop_timeout(sk, start_time); ++ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) ++ return true; ++ ++ if (sk_is_udp(sk) && ++ !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) ++ return true; ++ ++ return sk_busy_loop_timeout(sk, start_time); + } + EXPORT_SYMBOL(sk_busy_loop_end); + #endif /* CONFIG_NET_RX_BUSY_POLL */ +-- +2.43.0 + diff --git a/queue-6.6/vlan-skip-nested-type-that-is-not-ifla_vlan_qos_mapp.patch b/queue-6.6/vlan-skip-nested-type-that-is-not-ifla_vlan_qos_mapp.patch new file mode 100644 index 00000000000..ee7b2bfce69 --- /dev/null +++ b/queue-6.6/vlan-skip-nested-type-that-is-not-ifla_vlan_qos_mapp.patch @@ -0,0 +1,58 @@ +From e7753228b98d84fa2b26786a7330c8ebca54b5e5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 21:03:06 +0800 +Subject: vlan: skip nested type that is not IFLA_VLAN_QOS_MAPPING + +From: Lin Ma + +[ Upstream commit 
6c21660fe221a15c789dee2bc2fd95516bc5aeaf ] + +In the vlan_changelink function, a loop is used to parse the nested +attributes IFLA_VLAN_EGRESS_QOS and IFLA_VLAN_INGRESS_QOS in order to +obtain the struct ifla_vlan_qos_mapping. These two nested attributes are +checked in the vlan_validate_qos_map function, which calls +nla_validate_nested_deprecated with the vlan_map_policy. + +However, this deprecated validator applies a LIBERAL strictness, allowing +the presence of an attribute with the type IFLA_VLAN_QOS_UNSPEC. +Consequently, the loop in vlan_changelink may parse an attribute of type +IFLA_VLAN_QOS_UNSPEC and believe it carries a payload of +struct ifla_vlan_qos_mapping, which is not necessarily true. + +To address this issue and ensure compatibility, this patch introduces two +type checks that skip attributes whose type is not IFLA_VLAN_QOS_MAPPING. + +Fixes: 07b5b17e157b ("[VLAN]: Use rtnl_link API") +Signed-off-by: Lin Ma +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20240118130306.1644001-1-linma@zju.edu.cn +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/8021q/vlan_netlink.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c +index 214532173536..a3b68243fd4b 100644 +--- a/net/8021q/vlan_netlink.c ++++ b/net/8021q/vlan_netlink.c +@@ -118,12 +118,16 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], + } + if (data[IFLA_VLAN_INGRESS_QOS]) { + nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) { ++ if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) ++ continue; + m = nla_data(attr); + vlan_dev_set_ingress_priority(dev, m->to, m->from); + } + } + if (data[IFLA_VLAN_EGRESS_QOS]) { + nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) { ++ if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) ++ continue; + m = nla_data(attr); + err = vlan_dev_set_egress_priority(dev, m->from, m->to); + if (err) +-- +2.43.0 + diff --git 
a/queue-6.6/wifi-mac80211-fix-potential-sta-link-leak.patch b/queue-6.6/wifi-mac80211-fix-potential-sta-link-leak.patch new file mode 100644 index 00000000000..6ed5cfc7ec1 --- /dev/null +++ b/queue-6.6/wifi-mac80211-fix-potential-sta-link-leak.patch @@ -0,0 +1,44 @@ +From 65daa8e237218112e916d1176649db6695ae27b0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 11 Jan 2024 18:17:44 +0200 +Subject: wifi: mac80211: fix potential sta-link leak + +From: Johannes Berg + +[ Upstream commit b01a74b3ca6fd51b62c67733ba7c3280fa6c5d26 ] + +When a station is allocated, links are added but not +set to valid yet (e.g. during connection to an AP MLD), +we might remove the station without ever marking links +valid, and leak them. Fix that. + +Fixes: cb71f1d136a6 ("wifi: mac80211: add sta link addition/removal") +Signed-off-by: Johannes Berg +Reviewed-by: Ilan Peer +Signed-off-by: Miri Korenblit +Link: https://msgid.link/20240111181514.6573998beaf8.I09ac2e1d41c80f82a5a616b8bd1d9d8dd709a6a6@changeid +Signed-off-by: Johannes Berg +Signed-off-by: Sasha Levin +--- + net/mac80211/sta_info.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c +index 0c5cc75857e4..e112300caaf7 100644 +--- a/net/mac80211/sta_info.c ++++ b/net/mac80211/sta_info.c +@@ -398,7 +398,10 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) + int i; + + for (i = 0; i < ARRAY_SIZE(sta->link); i++) { +- if (!(sta->sta.valid_links & BIT(i))) ++ struct link_sta_info *link_sta; ++ ++ link_sta = rcu_access_pointer(sta->link[i]); ++ if (!link_sta) + continue; + + sta_remove_link(sta, i, false); +-- +2.43.0 + diff --git a/queue-6.6/xdp-reflect-tail-increase-for-mem_type_xsk_buff_pool.patch b/queue-6.6/xdp-reflect-tail-increase-for-mem_type_xsk_buff_pool.patch new file mode 100644 index 00000000000..5f2ed2c6840 --- /dev/null +++ b/queue-6.6/xdp-reflect-tail-increase-for-mem_type_xsk_buff_pool.patch @@ -0,0 +1,42 @@ +From 
87e3f813265f9528c9b11271cb2531f4952a8a26 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:16:00 +0100 +Subject: xdp: reflect tail increase for MEM_TYPE_XSK_BUFF_POOL + +From: Maciej Fijalkowski + +[ Upstream commit fbadd83a612c3b7aad2987893faca6bd24aaebb3 ] + +XSK ZC Rx path calculates the size of data that will be posted to XSK Rx +queue via subtracting xdp_buff::data_end from xdp_buff::data. + +In bpf_xdp_frags_increase_tail(), when underlying memory type of +xdp_rxq_info is MEM_TYPE_XSK_BUFF_POOL, add offset to data_end in tail +fragment, so that later on user space will be able to take into account +the amount of bytes added by XDP program. + +Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-10-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + net/core/filter.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/net/core/filter.c b/net/core/filter.c +index 46ee0f5433e3..01f2417deef2 100644 +--- a/net/core/filter.c ++++ b/net/core/filter.c +@@ -4081,6 +4081,8 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) + memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); + skb_frag_size_add(frag, offset); + sinfo->xdp_frags_size += offset; ++ if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) ++ xsk_buff_get_tail(xdp)->data_end += offset; + + return 0; + } +-- +2.43.0 + diff --git a/queue-6.6/xsk-fix-usage-of-multi-buffer-bpf-helpers-for-zc-xdp.patch b/queue-6.6/xsk-fix-usage-of-multi-buffer-bpf-helpers-for-zc-xdp.patch new file mode 100644 index 00000000000..dd775d677fb --- /dev/null +++ b/queue-6.6/xsk-fix-usage-of-multi-buffer-bpf-helpers-for-zc-xdp.patch @@ -0,0 +1,195 @@ +From 991e25fbea1c00cd8a3ebf0504a17c7f19093ee0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:15:54 +0100 +Subject: xsk: fix usage of multi-buffer BPF helpers for ZC XDP + +From: 
Maciej Fijalkowski + +[ Upstream commit c5114710c8ce86b8317e9b448f4fd15c711c2a82 ] + +Currently when packet is shrunk via bpf_xdp_adjust_tail() and memory +type is set to MEM_TYPE_XSK_BUFF_POOL, null ptr dereference happens: + +[1136314.192256] BUG: kernel NULL pointer dereference, address: +0000000000000034 +[1136314.203943] #PF: supervisor read access in kernel mode +[1136314.213768] #PF: error_code(0x0000) - not-present page +[1136314.223550] PGD 0 P4D 0 +[1136314.230684] Oops: 0000 [#1] PREEMPT SMP NOPTI +[1136314.239621] CPU: 8 PID: 54203 Comm: xdpsock Not tainted 6.6.0+ #257 +[1136314.250469] Hardware name: Intel Corporation S2600WFT/S2600WFT, +BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019 +[1136314.265615] RIP: 0010:__xdp_return+0x6c/0x210 +[1136314.274653] Code: ad 00 48 8b 47 08 49 89 f8 a8 01 0f 85 9b 01 00 00 0f 1f 44 00 00 f0 41 ff 48 34 75 32 4c 89 c7 e9 79 cd 80 ff 83 fe 03 75 17 41 34 01 0f 85 02 01 00 00 48 89 cf e9 22 cc 1e 00 e9 3d d2 86 +[1136314.302907] RSP: 0018:ffffc900089f8db0 EFLAGS: 00010246 +[1136314.312967] RAX: ffffc9003168aed0 RBX: ffff8881c3300000 RCX: +0000000000000000 +[1136314.324953] RDX: 0000000000000000 RSI: 0000000000000003 RDI: +ffffc9003168c000 +[1136314.336929] RBP: 0000000000000ae0 R08: 0000000000000002 R09: +0000000000010000 +[1136314.348844] R10: ffffc9000e495000 R11: 0000000000000040 R12: +0000000000000001 +[1136314.360706] R13: 0000000000000524 R14: ffffc9003168aec0 R15: +0000000000000001 +[1136314.373298] FS: 00007f8df8bbcb80(0000) GS:ffff8897e0e00000(0000) +knlGS:0000000000000000 +[1136314.386105] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[1136314.396532] CR2: 0000000000000034 CR3: 00000001aa912002 CR4: +00000000007706f0 +[1136314.408377] DR0: 0000000000000000 DR1: 0000000000000000 DR2: +0000000000000000 +[1136314.420173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: +0000000000000400 +[1136314.431890] PKRU: 55555554 +[1136314.439143] Call Trace: +[1136314.446058] +[1136314.452465] ? 
__die+0x20/0x70 +[1136314.459881] ? page_fault_oops+0x15b/0x440 +[1136314.468305] ? exc_page_fault+0x6a/0x150 +[1136314.476491] ? asm_exc_page_fault+0x22/0x30 +[1136314.484927] ? __xdp_return+0x6c/0x210 +[1136314.492863] bpf_xdp_adjust_tail+0x155/0x1d0 +[1136314.501269] bpf_prog_ccc47ae29d3b6570_xdp_sock_prog+0x15/0x60 +[1136314.511263] ice_clean_rx_irq_zc+0x206/0xc60 [ice] +[1136314.520222] ? ice_xmit_zc+0x6e/0x150 [ice] +[1136314.528506] ice_napi_poll+0x467/0x670 [ice] +[1136314.536858] ? ttwu_do_activate.constprop.0+0x8f/0x1a0 +[1136314.546010] __napi_poll+0x29/0x1b0 +[1136314.553462] net_rx_action+0x133/0x270 +[1136314.561619] __do_softirq+0xbe/0x28e +[1136314.569303] do_softirq+0x3f/0x60 + +This comes from __xdp_return() call with xdp_buff argument passed as +NULL which is supposed to be consumed by xsk_buff_free() call. + +To address this properly, in ZC case, a node that represents the frag +being removed has to be pulled out of xskb_list. Introduce +appropriate xsk helpers to do such node operation and use them +accordingly within bpf_xdp_adjust_tail(). 
+ +Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") +Acked-by: Magnus Karlsson # For the xsk header part +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-4-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + include/net/xdp_sock_drv.h | 26 +++++++++++++++++++++++ + net/core/filter.c | 42 ++++++++++++++++++++++++++++++++------ + 2 files changed, 62 insertions(+), 6 deletions(-) + +diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h +index 7290eb721c07..5425f7ad5ebd 100644 +--- a/include/net/xdp_sock_drv.h ++++ b/include/net/xdp_sock_drv.h +@@ -147,6 +147,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) + return ret; + } + ++static inline void xsk_buff_del_tail(struct xdp_buff *tail) ++{ ++ struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); ++ ++ list_del(&xskb->xskb_list_node); ++} ++ ++static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) ++{ ++ struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); ++ struct xdp_buff_xsk *frag; ++ ++ frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, ++ xskb_list_node); ++ return &frag->xdp; ++} ++ + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) + { + xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; +@@ -310,6 +327,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) + return NULL; + } + ++static inline void xsk_buff_del_tail(struct xdp_buff *tail) ++{ ++} ++ ++static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) ++{ ++ return NULL; ++} ++ + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) + { + } +diff --git a/net/core/filter.c b/net/core/filter.c +index cbc395d96479..46ee0f5433e3 100644 +--- a/net/core/filter.c ++++ b/net/core/filter.c +@@ -82,6 +82,7 @@ + #include + #include + #include ++#include + + static const struct 
bpf_func_proto * + bpf_sk_base_func_proto(enum bpf_func_id func_id); +@@ -4084,6 +4085,40 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) + return 0; + } + ++static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, ++ struct xdp_mem_info *mem_info, bool release) ++{ ++ struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); ++ ++ if (release) { ++ xsk_buff_del_tail(zc_frag); ++ __xdp_return(NULL, mem_info, false, zc_frag); ++ } else { ++ zc_frag->data_end -= shrink; ++ } ++} ++ ++static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, ++ int shrink) ++{ ++ struct xdp_mem_info *mem_info = &xdp->rxq->mem; ++ bool release = skb_frag_size(frag) == shrink; ++ ++ if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { ++ bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release); ++ goto out; ++ } ++ ++ if (release) { ++ struct page *page = skb_frag_page(frag); ++ ++ __xdp_return(page_address(page), mem_info, false, NULL); ++ } ++ ++out: ++ return release; ++} ++ + static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) + { + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); +@@ -4098,12 +4133,7 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) + + len_free += shrink; + offset -= shrink; +- +- if (skb_frag_size(frag) == shrink) { +- struct page *page = skb_frag_page(frag); +- +- __xdp_return(page_address(page), &xdp->rxq->mem, +- false, NULL); ++ if (bpf_xdp_shrink_data(xdp, frag, shrink)) { + n_frags_free++; + } else { + skb_frag_size_sub(frag, shrink); +-- +2.43.0 + diff --git a/queue-6.6/xsk-make-xsk_buff_pool-responsible-for-clearing-xdp_.patch b/queue-6.6/xsk-make-xsk_buff_pool-responsible-for-clearing-xdp_.patch new file mode 100644 index 00000000000..ad30a04daea --- /dev/null +++ b/queue-6.6/xsk-make-xsk_buff_pool-responsible-for-clearing-xdp_.patch @@ -0,0 +1,107 @@ +From b86bbc55700fc6b7540c6611c8605721a4fac36a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 
24 Jan 2024 20:15:53 +0100 +Subject: xsk: make xsk_buff_pool responsible for clearing xdp_buff::flags + +From: Maciej Fijalkowski + +[ Upstream commit f7f6aa8e24383fbb11ac55942e66da9660110f80 ] + +XDP multi-buffer support introduced XDP_FLAGS_HAS_FRAGS flag that is +used by drivers to notify data path whether xdp_buff contains fragments +or not. Data path looks up mentioned flag on first buffer that occupies +the linear part of xdp_buff, so drivers only modify it there. This is +sufficient for SKB and XDP_DRV modes as usually xdp_buff is allocated on +stack or it resides within struct representing driver's queue and +fragments are carried via skb_frag_t structs. IOW, we are dealing with +only one xdp_buff. + +ZC mode though relies on list of xdp_buff structs that is carried via +xsk_buff_pool::xskb_list, so ZC data path has to make sure that +fragments do *not* have XDP_FLAGS_HAS_FRAGS set. Otherwise, +xsk_buff_free() could misbehave if it would be executed against xdp_buff +that carries a frag with XDP_FLAGS_HAS_FRAGS flag set. Such scenario can +take place when within supplied XDP program bpf_xdp_adjust_tail() is +used with negative offset that would in turn release the tail fragment +from multi-buffer frame. + +Calling xsk_buff_free() on tail fragment with XDP_FLAGS_HAS_FRAGS would +result in releasing all the nodes from xskb_list that were produced by +driver before XDP program execution, which is not what is intended - +only tail fragment should be deleted from xskb_list and then it should +be put onto xsk_buff_pool::free_list. Such multi-buffer frame will never +make it up to user space, so from AF_XDP application POV there would be +no traffic running, however due to free_list getting constantly new +nodes, driver will be able to feed HW Rx queue with recycled buffers. +Bottom line is that instead of traffic being redirected to user space, +it would be continuously dropped. 
+ +To fix this, let us clear the mentioned flag on xsk_buff_pool side +during xdp_buff initialization, which is what should have been done +right from the start of XSK multi-buffer support. + +Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support") +Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support") +Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-3-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_xsk.c | 1 - + drivers/net/ethernet/intel/ice/ice_xsk.c | 1 - + include/net/xdp_sock_drv.h | 1 + + net/xdp/xsk_buff_pool.c | 1 + + 4 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c +index 7d991e4d9b89..b75e6b6d317c 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c +@@ -503,7 +503,6 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) + xdp_res = i40e_run_xdp_zc(rx_ring, first, xdp_prog); + i40e_handle_xdp_result_zc(rx_ring, first, rx_desc, &rx_packets, + &rx_bytes, xdp_res, &failure); +- first->flags = 0; + next_to_clean = next_to_process; + if (failure) + break; +diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c +index 2a3f0834e139..33f194c870bb 100644 +--- a/drivers/net/ethernet/intel/ice/ice_xsk.c ++++ b/drivers/net/ethernet/intel/ice/ice_xsk.c +@@ -897,7 +897,6 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) + + if (!first) { + first = xdp; +- xdp_buff_clear_frags_flag(first); + } else if (ice_add_xsk_frag(rx_ring, first, xdp, size)) { + break; + } +diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h +index 1f6fc8c7a84c..7290eb721c07 100644 +--- a/include/net/xdp_sock_drv.h ++++ b/include/net/xdp_sock_drv.h +@@ -152,6 
+152,7 @@ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) + xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; + xdp->data_meta = xdp->data; + xdp->data_end = xdp->data + size; ++ xdp->flags = 0; + } + + static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, +diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c +index 49cb9f9a09be..b0a611677865 100644 +--- a/net/xdp/xsk_buff_pool.c ++++ b/net/xdp/xsk_buff_pool.c +@@ -541,6 +541,7 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) + + xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; + xskb->xdp.data_meta = xskb->xdp.data; ++ xskb->xdp.flags = 0; + + if (pool->dma_need_sync) { + dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, +-- +2.43.0 + diff --git a/queue-6.6/xsk-recycle-buffer-in-case-rx-queue-was-full.patch b/queue-6.6/xsk-recycle-buffer-in-case-rx-queue-was-full.patch new file mode 100644 index 00000000000..ccedf35c9ec --- /dev/null +++ b/queue-6.6/xsk-recycle-buffer-in-case-rx-queue-was-full.patch @@ -0,0 +1,58 @@ +From 3a23678456ba4de3d40da0dbc139590cb5a1c647 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:15:52 +0100 +Subject: xsk: recycle buffer in case Rx queue was full + +From: Maciej Fijalkowski + +[ Upstream commit 269009893146c495f41e9572dd9319e787c2eba9 ] + +Add missing xsk_buff_free() call when __xsk_rcv_zc() failed to produce +descriptor to XSK Rx queue. 
+ +Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") +Acked-by: Magnus Karlsson +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-2-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + net/xdp/xsk.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c +index 774a6d1916e4..d849dc04a334 100644 +--- a/net/xdp/xsk.c ++++ b/net/xdp/xsk.c +@@ -166,8 +166,10 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) + contd = XDP_PKT_CONTD; + + err = __xsk_rcv_zc(xs, xskb, len, contd); +- if (err || likely(!frags)) +- goto out; ++ if (err) ++ goto err; ++ if (likely(!frags)) ++ return 0; + + xskb_list = &xskb->pool->xskb_list; + list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { +@@ -176,11 +178,13 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) + len = pos->xdp.data_end - pos->xdp.data; + err = __xsk_rcv_zc(xs, pos, len, contd); + if (err) +- return err; ++ goto err; + list_del(&pos->xskb_list_node); + } + +-out: ++ return 0; ++err: ++ xsk_buff_free(xdp); + return err; + } + +-- +2.43.0 + -- 2.47.3