]> git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 6.6
authorSasha Levin <sashal@kernel.org>
Sat, 27 Jan 2024 12:47:05 +0000 (07:47 -0500)
committerSasha Levin <sashal@kernel.org>
Sat, 27 Jan 2024 12:47:05 +0000 (07:47 -0500)
Signed-off-by: Sasha Levin <sashal@kernel.org>
65 files changed:
queue-6.6/afs-hide-silly-rename-files-from-userspace.patch [new file with mode: 0644]
queue-6.6/bnxt_en-prevent-kernel-warning-when-running-offline-.patch [new file with mode: 0644]
queue-6.6/bnxt_en-wait-for-flr-to-complete-during-probe.patch [new file with mode: 0644]
queue-6.6/bpf-add-bpf_sock_addr_set_sun_path-to-allow-writing-.patch [new file with mode: 0644]
queue-6.6/bpf-propagate-modified-uaddrlen-from-cgroup-sockaddr.patch [new file with mode: 0644]
queue-6.6/btrfs-scrub-avoid-use-after-free-when-chunk-length-i.patch [new file with mode: 0644]
queue-6.6/fjes-fix-memleaks-in-fjes_hw_setup.patch [new file with mode: 0644]
queue-6.6/i40e-handle-multi-buffer-packets-that-are-shrunk-by-.patch [new file with mode: 0644]
queue-6.6/i40e-set-xdp_rxq_info-frag_size.patch [new file with mode: 0644]
queue-6.6/i40e-update-xdp_rxq_info-frag_size-for-zc-enabled-rx.patch [new file with mode: 0644]
queue-6.6/ice-remove-redundant-xdp_rxq_info-registration.patch [new file with mode: 0644]
queue-6.6/ice-update-xdp_rxq_info-frag_size-for-zc-enabled-rx-.patch [new file with mode: 0644]
queue-6.6/ice-work-on-pre-xdp-prog-frag-count.patch [new file with mode: 0644]
queue-6.6/intel-xsk-initialize-skb_frag_t-bv_offset-in-zc-driv.patch [new file with mode: 0644]
queue-6.6/ipv6-init-the-accept_queue-s-spinlocks-in-inet6_crea.patch [new file with mode: 0644]
queue-6.6/llc-drop-support-for-eth_p_tr_802_2.patch [new file with mode: 0644]
queue-6.6/llc-make-llc_ui_sendmsg-more-robust-against-bonding-.patch [new file with mode: 0644]
queue-6.6/net-fec-fix-the-unhandled-context-fault-from-smmu.patch [new file with mode: 0644]
queue-6.6/net-fix-removing-a-namespace-with-conflicting-altnam.patch [new file with mode: 0644]
queue-6.6/net-micrel-fix-ptp-frame-parsing-for-lan8814.patch [new file with mode: 0644]
queue-6.6/net-mlx5-bridge-enable-mcast-in-smfs-steering-mode.patch [new file with mode: 0644]
queue-6.6/net-mlx5-bridge-fix-multicast-packets-sent-to-uplink.patch [new file with mode: 0644]
queue-6.6/net-mlx5-dr-can-t-go-to-uplink-vport-on-rx-rule.patch [new file with mode: 0644]
queue-6.6/net-mlx5-dr-use-the-right-gvmi-number-for-drop-actio.patch [new file with mode: 0644]
queue-6.6/net-mlx5-fix-a-warn-upon-a-callback-command-failure.patch [new file with mode: 0644]
queue-6.6/net-mlx5-use-mlx5-device-constant-for-selecting-cq-p.patch [new file with mode: 0644]
queue-6.6/net-mlx5e-allow-software-parsing-when-ipsec-crypto-i.patch [new file with mode: 0644]
queue-6.6/net-mlx5e-fix-a-double-free-in-arfs_create_groups.patch [new file with mode: 0644]
queue-6.6/net-mlx5e-fix-a-potential-double-free-in-fs_any_crea.patch [new file with mode: 0644]
queue-6.6/net-mlx5e-fix-operation-precedence-bug-in-port-times.patch [new file with mode: 0644]
queue-6.6/net-mlx5e-fix-peer-flow-lists-handling.patch [new file with mode: 0644]
queue-6.6/net-mlx5e-ignore-ipsec-replay-window-values-on-sende.patch [new file with mode: 0644]
queue-6.6/net-mvpp2-clear-bm-pool-before-initialization.patch [new file with mode: 0644]
queue-6.6/net-rds-fix-ubsan-array-index-out-of-bounds-in-rds_c.patch [new file with mode: 0644]
queue-6.6/net-sched-flower-fix-chain-template-offload.patch [new file with mode: 0644]
queue-6.6/net-smc-fix-illegal-rmb_desc-access-in-smc-d-connect.patch [new file with mode: 0644]
queue-6.6/net-stmmac-wait-a-bit-for-the-reset-to-take-effect.patch [new file with mode: 0644]
queue-6.6/netfilter-nf_tables-restrict-anonymous-set-and-map-n.patch [new file with mode: 0644]
queue-6.6/netfilter-nf_tables-validate-nfproto_-family.patch [new file with mode: 0644]
queue-6.6/netfilter-nft_limit-reject-configurations-that-cause.patch [new file with mode: 0644]
queue-6.6/netfs-fscache-prevent-oops-in-fscache_put_cache.patch [new file with mode: 0644]
queue-6.6/netlink-fix-potential-sleeping-issue-in-mqueue_flush.patch [new file with mode: 0644]
queue-6.6/rcu-defer-rcu-kthreads-wakeup-when-cpu-is-dying.patch [new file with mode: 0644]
queue-6.6/selftest-don-t-reuse-port-for-so_incoming_cpu-test.patch [new file with mode: 0644]
queue-6.6/selftests-bonding-do-not-test-arp-ns-target-with-mod.patch [new file with mode: 0644]
queue-6.6/selftests-bonding-increase-timeout-to-1200s.patch [new file with mode: 0644]
queue-6.6/selftests-fill-in-some-missing-configs-for-net.patch [new file with mode: 0644]
queue-6.6/selftests-net-fix-rps_default_mask-with-32-cpus.patch [new file with mode: 0644]
queue-6.6/selftests-netdevsim-fix-the-udp_tunnel_nic-test.patch [new file with mode: 0644]
queue-6.6/series
queue-6.6/sunrpc-use-request-size-to-initialize-bio_vec-in-svc.patch [new file with mode: 0644]
queue-6.6/tcp-add-memory-barrier-to-tcp_push.patch [new file with mode: 0644]
queue-6.6/tcp-make-sure-init-the-accept_queue-s-spinlocks-once.patch [new file with mode: 0644]
queue-6.6/tracing-ensure-visibility-when-inserting-an-element-.patch [new file with mode: 0644]
queue-6.6/tsnep-fix-xdp_ring_need_wakeup-for-empty-fill-ring.patch [new file with mode: 0644]
queue-6.6/tsnep-remove-fcs-for-xdp-data-path.patch [new file with mode: 0644]
queue-6.6/tun-add-missing-rx-stats-accounting-in-tun_xdp_act.patch [new file with mode: 0644]
queue-6.6/tun-fix-missing-dropped-counter-in-tun_xdp_act.patch [new file with mode: 0644]
queue-6.6/udp-fix-busy-polling.patch [new file with mode: 0644]
queue-6.6/vlan-skip-nested-type-that-is-not-ifla_vlan_qos_mapp.patch [new file with mode: 0644]
queue-6.6/wifi-mac80211-fix-potential-sta-link-leak.patch [new file with mode: 0644]
queue-6.6/xdp-reflect-tail-increase-for-mem_type_xsk_buff_pool.patch [new file with mode: 0644]
queue-6.6/xsk-fix-usage-of-multi-buffer-bpf-helpers-for-zc-xdp.patch [new file with mode: 0644]
queue-6.6/xsk-make-xsk_buff_pool-responsible-for-clearing-xdp_.patch [new file with mode: 0644]
queue-6.6/xsk-recycle-buffer-in-case-rx-queue-was-full.patch [new file with mode: 0644]

diff --git a/queue-6.6/afs-hide-silly-rename-files-from-userspace.patch b/queue-6.6/afs-hide-silly-rename-files-from-userspace.patch
new file mode 100644 (file)
index 0000000..f11e8df
--- /dev/null
@@ -0,0 +1,54 @@
+From 0c016e7bd1183e0ed5b09bc1d81f8f8e0779831f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 8 Jan 2024 17:22:36 +0000
+Subject: afs: Hide silly-rename files from userspace
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 57e9d49c54528c49b8bffe6d99d782ea051ea534 ]
+
+There appears to be a race between silly-rename files being created/removed
+and various userspace tools iterating over the contents of a directory,
+leading to such errors as:
+
+       find: './kernel/.tmp_cpio_dir/include/dt-bindings/reset/.__afs2080': No such file or directory
+       tar: ./include/linux/greybus/.__afs3C95: File removed before we read it
+
+when building a kernel.
+
+Fix afs_readdir() so that it doesn't return .__afsXXXX silly-rename files
+to userspace.  This doesn't stop them being looked up directly by name as
+we need to be able to look them up from within the kernel as part of the
+silly-rename algorithm.
+
+Fixes: 79ddbfa500b3 ("afs: Implement sillyrename for unlink and rename")
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/dir.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+diff --git a/fs/afs/dir.c b/fs/afs/dir.c
+index 5219182e52e1..2df2e9ee130d 100644
+--- a/fs/afs/dir.c
++++ b/fs/afs/dir.c
+@@ -474,6 +474,14 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
+                       continue;
+               }
++              /* Don't expose silly rename entries to userspace. */
++              if (nlen > 6 &&
++                  dire->u.name[0] == '.' &&
++                  ctx->actor != afs_lookup_filldir &&
++                  ctx->actor != afs_lookup_one_filldir &&
++                  memcmp(dire->u.name, ".__afs", 6) == 0)
++                      continue;
++
+               /* found the next entry */
+               if (!dir_emit(ctx, dire->u.name, nlen,
+                             ntohl(dire->u.vnode),
+-- 
+2.43.0
+
diff --git a/queue-6.6/bnxt_en-prevent-kernel-warning-when-running-offline-.patch b/queue-6.6/bnxt_en-prevent-kernel-warning-when-running-offline-.patch
new file mode 100644 (file)
index 0000000..0775e05
--- /dev/null
@@ -0,0 +1,113 @@
+From 198096f917cac879e5cd7f5f65711b6ac2011576 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 15:45:14 -0800
+Subject: bnxt_en: Prevent kernel warning when running offline self test
+
+From: Michael Chan <michael.chan@broadcom.com>
+
+[ Upstream commit c20f482129a582455f02eb9a6dcb2a4215274599 ]
+
+We call bnxt_half_open_nic() to setup the chip partially to run
+loopback tests.  The rings and buffers are initialized normally
+so that we can transmit and receive packets in loopback mode.
+That means page pool buffers are allocated for the aggregation ring
+just like the normal case.  NAPI is not needed because we are just
+polling for the loopback packets.
+
+When we're done with the loopback tests, we call bnxt_half_close_nic()
+to clean up.  When freeing the page pools, we hit a WARN_ON()
+in page_pool_unlink_napi() because the NAPI state linked to the
+page pool is uninitialized.
+
+The simplest way to avoid this warning is just to initialize the
+NAPIs during half open and delete the NAPIs during half close.
+Trying to skip the page pool initialization or skip linking of
+NAPI during half open will be more complicated.
+
+This fix avoids this warning:
+
+WARNING: CPU: 4 PID: 46967 at net/core/page_pool.c:946 page_pool_unlink_napi+0x1f/0x30
+CPU: 4 PID: 46967 Comm: ethtool Tainted: G S      W          6.7.0-rc5+ #22
+Hardware name: Dell Inc. PowerEdge R750/06V45N, BIOS 1.3.8 08/31/2021
+RIP: 0010:page_pool_unlink_napi+0x1f/0x30
+Code: 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 48 8b 47 18 48 85 c0 74 1b 48 8b 50 10 83 e2 01 74 08 8b 40 34 83 f8 ff 74 02 <0f> 0b 48 c7 47 18 00 00 00 00 c3 cc cc cc cc 66 90 90 90 90 90 90
+RSP: 0018:ffa000003d0dfbe8 EFLAGS: 00010246
+RAX: ff110003607ce640 RBX: ff110010baf5d000 RCX: 0000000000000008
+RDX: 0000000000000000 RSI: ff110001e5e522c0 RDI: ff110010baf5d000
+RBP: ff11000145539b40 R08: 0000000000000001 R09: ffffffffc063f641
+R10: ff110001361eddb8 R11: 000000000040000f R12: 0000000000000001
+R13: 000000000000001c R14: ff1100014553a080 R15: 0000000000003fc0
+FS:  00007f9301c4f740(0000) GS:ff1100103fd00000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007f91344fa8f0 CR3: 00000003527cc005 CR4: 0000000000771ef0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+PKRU: 55555554
+Call Trace:
+ <TASK>
+ ? __warn+0x81/0x140
+ ? page_pool_unlink_napi+0x1f/0x30
+ ? report_bug+0x102/0x200
+ ? handle_bug+0x44/0x70
+ ? exc_invalid_op+0x13/0x60
+ ? asm_exc_invalid_op+0x16/0x20
+ ? bnxt_free_ring.isra.123+0xb1/0xd0 [bnxt_en]
+ ? page_pool_unlink_napi+0x1f/0x30
+ page_pool_destroy+0x3e/0x150
+ bnxt_free_mem+0x441/0x5e0 [bnxt_en]
+ bnxt_half_close_nic+0x2a/0x40 [bnxt_en]
+ bnxt_self_test+0x21d/0x450 [bnxt_en]
+ __dev_ethtool+0xeda/0x2e30
+ ? native_queued_spin_lock_slowpath+0x17f/0x2b0
+ ? __link_object+0xa1/0x160
+ ? _raw_spin_unlock_irqrestore+0x23/0x40
+ ? __create_object+0x5f/0x90
+ ? __kmem_cache_alloc_node+0x317/0x3c0
+ ? dev_ethtool+0x59/0x170
+ dev_ethtool+0xa7/0x170
+ dev_ioctl+0xc3/0x530
+ sock_do_ioctl+0xa8/0xf0
+ sock_ioctl+0x270/0x310
+ __x64_sys_ioctl+0x8c/0xc0
+ do_syscall_64+0x3e/0xf0
+ entry_SYSCALL_64_after_hwframe+0x6e/0x76
+
+Fixes: 294e39e0d034 ("bnxt: hook NAPIs to page pools")
+Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
+Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
+Signed-off-by: Michael Chan <michael.chan@broadcom.com>
+Link: https://lore.kernel.org/r/20240117234515.226944-5-michael.chan@broadcom.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+index 9e04db1273a5..dac4f9510c17 100644
+--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+@@ -10598,10 +10598,12 @@ int bnxt_half_open_nic(struct bnxt *bp)
+               netdev_err(bp->dev, "bnxt_alloc_mem err: %x\n", rc);
+               goto half_open_err;
+       }
++      bnxt_init_napi(bp);
+       set_bit(BNXT_STATE_HALF_OPEN, &bp->state);
+       rc = bnxt_init_nic(bp, true);
+       if (rc) {
+               clear_bit(BNXT_STATE_HALF_OPEN, &bp->state);
++              bnxt_del_napi(bp);
+               netdev_err(bp->dev, "bnxt_init_nic err: %x\n", rc);
+               goto half_open_err;
+       }
+@@ -10620,6 +10622,7 @@ int bnxt_half_open_nic(struct bnxt *bp)
+ void bnxt_half_close_nic(struct bnxt *bp)
+ {
+       bnxt_hwrm_resource_free(bp, false, true);
++      bnxt_del_napi(bp);
+       bnxt_free_skbs(bp);
+       bnxt_free_mem(bp, true);
+       clear_bit(BNXT_STATE_HALF_OPEN, &bp->state);
+-- 
+2.43.0
+
diff --git a/queue-6.6/bnxt_en-wait-for-flr-to-complete-during-probe.patch b/queue-6.6/bnxt_en-wait-for-flr-to-complete-during-probe.patch
new file mode 100644 (file)
index 0000000..39ddaeb
--- /dev/null
@@ -0,0 +1,43 @@
+From 9736c9e26022a20d1c33e799d7b117ca0d1b5f6f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 15:45:11 -0800
+Subject: bnxt_en: Wait for FLR to complete during probe
+
+From: Michael Chan <michael.chan@broadcom.com>
+
+[ Upstream commit 3c1069fa42872f95cf3c6fedf80723d391e12d57 ]
+
+The first message to firmware may fail if the device is undergoing FLR.
+The driver has some recovery logic for this failure scenario but we must
+wait 100 msec for FLR to complete before proceeding.  Otherwise the
+recovery will always fail.
+
+Fixes: ba02629ff6cb ("bnxt_en: log firmware status on firmware init failure")
+Reviewed-by: Damodharam Ammepalli <damodharam.ammepalli@broadcom.com>
+Signed-off-by: Michael Chan <michael.chan@broadcom.com>
+Link: https://lore.kernel.org/r/20240117234515.226944-2-michael.chan@broadcom.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+index 6039886a8544..9e04db1273a5 100644
+--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+@@ -12261,6 +12261,11 @@ static int bnxt_fw_init_one_p1(struct bnxt *bp)
+       bp->fw_cap = 0;
+       rc = bnxt_hwrm_ver_get(bp);
++      /* FW may be unresponsive after FLR. FLR must complete within 100 msec
++       * so wait before continuing with recovery.
++       */
++      if (rc)
++              msleep(100);
+       bnxt_try_map_fw_health_reg(bp);
+       if (rc) {
+               rc = bnxt_try_recover_fw(bp);
+-- 
+2.43.0
+
diff --git a/queue-6.6/bpf-add-bpf_sock_addr_set_sun_path-to-allow-writing-.patch b/queue-6.6/bpf-add-bpf_sock_addr_set_sun_path-to-allow-writing-.patch
new file mode 100644 (file)
index 0000000..248346c
--- /dev/null
@@ -0,0 +1,116 @@
+From a2bf332849250cb0b7a5a168a0675fc16ac0ffb8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 11 Oct 2023 20:51:05 +0200
+Subject: bpf: Add bpf_sock_addr_set_sun_path() to allow writing unix sockaddr
+ from bpf
+
+From: Daan De Meyer <daan.j.demeyer@gmail.com>
+
+[ Upstream commit 53e380d21441909b12b6e0782b77187ae4b971c4 ]
+
+As prep for adding unix socket support to the cgroup sockaddr hooks,
+let's add a kfunc bpf_sock_addr_set_sun_path() that allows modifying a unix
+sockaddr from bpf. While this is already possible for AF_INET and AF_INET6,
+we'll need this kfunc when we add unix socket support since modifying the
+address for those requires modifying both the address and the sockaddr
+length.
+
+Signed-off-by: Daan De Meyer <daan.j.demeyer@gmail.com>
+Link: https://lore.kernel.org/r/20231011185113.140426-4-daan.j.demeyer@gmail.com
+Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
+Stable-dep-of: c5114710c8ce ("xsk: fix usage of multi-buffer BPF helpers for ZC XDP")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/bpf/btf.c  |  1 +
+ net/core/filter.c | 35 ++++++++++++++++++++++++++++++++++-
+ 2 files changed, 35 insertions(+), 1 deletion(-)
+
+diff --git a/kernel/bpf/btf.c b/kernel/bpf/btf.c
+index 8090d7fb11ef..a31704a6bb61 100644
+--- a/kernel/bpf/btf.c
++++ b/kernel/bpf/btf.c
+@@ -7832,6 +7832,7 @@ static int bpf_prog_type_to_kfunc_hook(enum bpf_prog_type prog_type)
+       case BPF_PROG_TYPE_SYSCALL:
+               return BTF_KFUNC_HOOK_SYSCALL;
+       case BPF_PROG_TYPE_CGROUP_SKB:
++      case BPF_PROG_TYPE_CGROUP_SOCK_ADDR:
+               return BTF_KFUNC_HOOK_CGROUP_SKB;
+       case BPF_PROG_TYPE_SCHED_ACT:
+               return BTF_KFUNC_HOOK_SCHED_ACT;
+diff --git a/net/core/filter.c b/net/core/filter.c
+index 90fe3e754383..cbc395d96479 100644
+--- a/net/core/filter.c
++++ b/net/core/filter.c
+@@ -81,6 +81,7 @@
+ #include <net/xdp.h>
+ #include <net/mptcp.h>
+ #include <net/netfilter/nf_conntrack_bpf.h>
++#include <linux/un.h>
+ static const struct bpf_func_proto *
+ bpf_sk_base_func_proto(enum bpf_func_id func_id);
+@@ -11772,6 +11773,27 @@ __bpf_kfunc int bpf_dynptr_from_xdp(struct xdp_buff *xdp, u64 flags,
+       return 0;
+ }
++
++__bpf_kfunc int bpf_sock_addr_set_sun_path(struct bpf_sock_addr_kern *sa_kern,
++                                         const u8 *sun_path, u32 sun_path__sz)
++{
++      struct sockaddr_un *un;
++
++      if (sa_kern->sk->sk_family != AF_UNIX)
++              return -EINVAL;
++
++      /* We do not allow changing the address to unnamed or larger than the
++       * maximum allowed address size for a unix sockaddr.
++       */
++      if (sun_path__sz == 0 || sun_path__sz > UNIX_PATH_MAX)
++              return -EINVAL;
++
++      un = (struct sockaddr_un *)sa_kern->uaddr;
++      memcpy(un->sun_path, sun_path, sun_path__sz);
++      sa_kern->uaddrlen = offsetof(struct sockaddr_un, sun_path) + sun_path__sz;
++
++      return 0;
++}
+ __diag_pop();
+ int bpf_dynptr_from_skb_rdonly(struct sk_buff *skb, u64 flags,
+@@ -11796,6 +11818,10 @@ BTF_SET8_START(bpf_kfunc_check_set_xdp)
+ BTF_ID_FLAGS(func, bpf_dynptr_from_xdp)
+ BTF_SET8_END(bpf_kfunc_check_set_xdp)
++BTF_SET8_START(bpf_kfunc_check_set_sock_addr)
++BTF_ID_FLAGS(func, bpf_sock_addr_set_sun_path)
++BTF_SET8_END(bpf_kfunc_check_set_sock_addr)
++
+ static const struct btf_kfunc_id_set bpf_kfunc_set_skb = {
+       .owner = THIS_MODULE,
+       .set = &bpf_kfunc_check_set_skb,
+@@ -11806,6 +11832,11 @@ static const struct btf_kfunc_id_set bpf_kfunc_set_xdp = {
+       .set = &bpf_kfunc_check_set_xdp,
+ };
++static const struct btf_kfunc_id_set bpf_kfunc_set_sock_addr = {
++      .owner = THIS_MODULE,
++      .set = &bpf_kfunc_check_set_sock_addr,
++};
++
+ static int __init bpf_kfunc_init(void)
+ {
+       int ret;
+@@ -11820,7 +11851,9 @@ static int __init bpf_kfunc_init(void)
+       ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_XMIT, &bpf_kfunc_set_skb);
+       ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_LWT_SEG6LOCAL, &bpf_kfunc_set_skb);
+       ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_NETFILTER, &bpf_kfunc_set_skb);
+-      return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
++      ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &bpf_kfunc_set_xdp);
++      return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_CGROUP_SOCK_ADDR,
++                                              &bpf_kfunc_set_sock_addr);
+ }
+ late_initcall(bpf_kfunc_init);
+-- 
+2.43.0
+
diff --git a/queue-6.6/bpf-propagate-modified-uaddrlen-from-cgroup-sockaddr.patch b/queue-6.6/bpf-propagate-modified-uaddrlen-from-cgroup-sockaddr.patch
new file mode 100644 (file)
index 0000000..43f9283
--- /dev/null
@@ -0,0 +1,449 @@
+From 04d0f77b90682fb29a22a4567a54e12bedbe4f13 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 11 Oct 2023 20:51:04 +0200
+Subject: bpf: Propagate modified uaddrlen from cgroup sockaddr programs
+
+From: Daan De Meyer <daan.j.demeyer@gmail.com>
+
+[ Upstream commit fefba7d1ae198dcbf8b3b432de46a4e29f8dbd8c ]
+
+As prep for adding unix socket support to the cgroup sockaddr hooks,
+let's propagate the sockaddr length back to the caller after running
+a bpf cgroup sockaddr hook program. While not important for AF_INET or
+AF_INET6, the sockaddr length is important when working with AF_UNIX
+sockaddrs as the size of the sockaddr cannot be determined just from the
+address family or the sockaddr's contents.
+
+__cgroup_bpf_run_filter_sock_addr() is modified to take the uaddrlen as
+an input/output argument. After running the program, the modified sockaddr
+length is stored in the uaddrlen pointer.
+
+Signed-off-by: Daan De Meyer <daan.j.demeyer@gmail.com>
+Link: https://lore.kernel.org/r/20231011185113.140426-3-daan.j.demeyer@gmail.com
+Signed-off-by: Martin KaFai Lau <martin.lau@kernel.org>
+Stable-dep-of: c5114710c8ce ("xsk: fix usage of multi-buffer BPF helpers for ZC XDP")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/bpf-cgroup.h | 73 +++++++++++++++++++-------------------
+ include/linux/filter.h     |  1 +
+ kernel/bpf/cgroup.c        | 17 +++++++--
+ net/ipv4/af_inet.c         |  7 ++--
+ net/ipv4/ping.c            |  2 +-
+ net/ipv4/tcp_ipv4.c        |  2 +-
+ net/ipv4/udp.c             |  9 +++--
+ net/ipv6/af_inet6.c        |  9 ++---
+ net/ipv6/ping.c            |  2 +-
+ net/ipv6/tcp_ipv6.c        |  2 +-
+ net/ipv6/udp.c             |  6 ++--
+ 11 files changed, 76 insertions(+), 54 deletions(-)
+
+diff --git a/include/linux/bpf-cgroup.h b/include/linux/bpf-cgroup.h
+index 8506690dbb9c..31561e789715 100644
+--- a/include/linux/bpf-cgroup.h
++++ b/include/linux/bpf-cgroup.h
+@@ -120,6 +120,7 @@ int __cgroup_bpf_run_filter_sk(struct sock *sk,
+ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
+                                     struct sockaddr *uaddr,
++                                    int *uaddrlen,
+                                     enum cgroup_bpf_attach_type atype,
+                                     void *t_ctx,
+                                     u32 *flags);
+@@ -230,22 +231,22 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
+ #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk)                                      \
+       BPF_CGROUP_RUN_SK_PROG(sk, CGROUP_INET6_POST_BIND)
+-#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype)                                     \
++#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype)                   \
+ ({                                                                           \
+       int __ret = 0;                                                         \
+       if (cgroup_bpf_enabled(atype))                                         \
+-              __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
+-                                                        NULL, NULL);         \
++              __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
++                                                        atype, NULL, NULL);  \
+       __ret;                                                                 \
+ })
+-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx)                 \
++#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx)               \
+ ({                                                                           \
+       int __ret = 0;                                                         \
+       if (cgroup_bpf_enabled(atype))  {                                      \
+               lock_sock(sk);                                                 \
+-              __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
+-                                                        t_ctx, NULL);        \
++              __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
++                                                        atype, t_ctx, NULL); \
+               release_sock(sk);                                              \
+       }                                                                      \
+       __ret;                                                                 \
+@@ -256,14 +257,14 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
+  * (at bit position 0) is to indicate CAP_NET_BIND_SERVICE capability check
+  * should be bypassed (BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE).
+  */
+-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, bind_flags)             \
++#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, bind_flags) \
+ ({                                                                           \
+       u32 __flags = 0;                                                       \
+       int __ret = 0;                                                         \
+       if (cgroup_bpf_enabled(atype))  {                                      \
+               lock_sock(sk);                                                 \
+-              __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, atype,     \
+-                                                        NULL, &__flags);     \
++              __ret = __cgroup_bpf_run_filter_sock_addr(sk, uaddr, uaddrlen, \
++                                                        atype, NULL, &__flags); \
+               release_sock(sk);                                              \
+               if (__flags & BPF_RET_BIND_NO_CAP_NET_BIND_SERVICE)            \
+                       *bind_flags |= BIND_NO_CAP_NET_BIND_SERVICE;           \
+@@ -276,29 +277,29 @@ static inline bool cgroup_bpf_sock_enabled(struct sock *sk,
+         cgroup_bpf_enabled(CGROUP_INET6_CONNECT)) &&                 \
+        (sk)->sk_prot->pre_connect)
+-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr)                         \
+-      BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET4_CONNECT)
++#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen)                        \
++      BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT)
+-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr)                         \
+-      BPF_CGROUP_RUN_SA_PROG(sk, uaddr, CGROUP_INET6_CONNECT)
++#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen)                        \
++      BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT)
+-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr)                    \
+-      BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET4_CONNECT, NULL)
++#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen)           \
++      BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET4_CONNECT, NULL)
+-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr)                    \
+-      BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_INET6_CONNECT, NULL)
++#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen)           \
++      BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_INET6_CONNECT, NULL)
+-#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx)                      \
+-      BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_SENDMSG, t_ctx)
++#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)     \
++      BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_SENDMSG, t_ctx)
+-#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx)                      \
+-      BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_SENDMSG, t_ctx)
++#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx)     \
++      BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_SENDMSG, t_ctx)
+-#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr)                      \
+-      BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP4_RECVMSG, NULL)
++#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen)            \
++      BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP4_RECVMSG, NULL)
+-#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr)                      \
+-      BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, CGROUP_UDP6_RECVMSG, NULL)
++#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen)            \
++      BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, CGROUP_UDP6_RECVMSG, NULL)
+ /* The SOCK_OPS"_SK" macro should be used when sock_ops->sk is not a
+  * fullsock and its parent fullsock cannot be traced by
+@@ -477,24 +478,24 @@ static inline int bpf_percpu_cgroup_storage_update(struct bpf_map *map,
+ }
+ #define cgroup_bpf_enabled(atype) (0)
+-#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, atype, t_ctx) ({ 0; })
+-#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, atype) ({ 0; })
++#define BPF_CGROUP_RUN_SA_PROG_LOCK(sk, uaddr, uaddrlen, atype, t_ctx) ({ 0; })
++#define BPF_CGROUP_RUN_SA_PROG(sk, uaddr, uaddrlen, atype) ({ 0; })
+ #define BPF_CGROUP_PRE_CONNECT_ENABLED(sk) (0)
+ #define BPF_CGROUP_RUN_PROG_INET_INGRESS(sk,skb) ({ 0; })
+ #define BPF_CGROUP_RUN_PROG_INET_EGRESS(sk,skb) ({ 0; })
+ #define BPF_CGROUP_RUN_PROG_INET_SOCK(sk) ({ 0; })
+ #define BPF_CGROUP_RUN_PROG_INET_SOCK_RELEASE(sk) ({ 0; })
+-#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, atype, flags) ({ 0; })
++#define BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, uaddrlen, atype, flags) ({ 0; })
+ #define BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk) ({ 0; })
+ #define BPF_CGROUP_RUN_PROG_INET6_POST_BIND(sk) ({ 0; })
+-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr) ({ 0; })
+-#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr) ({ 0; })
+-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr) ({ 0; })
+-#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr) ({ 0; })
+-#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
+-#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, t_ctx) ({ 0; })
+-#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr) ({ 0; })
+-#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr) ({ 0; })
++#define BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, uaddrlen) ({ 0; })
++#define BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
++#define BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, uaddrlen) ({ 0; })
++#define BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, uaddrlen) ({ 0; })
++#define BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
++#define BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk, uaddr, uaddrlen, t_ctx) ({ 0; })
++#define BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
++#define BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk, uaddr, uaddrlen) ({ 0; })
+ #define BPF_CGROUP_RUN_PROG_SOCK_OPS(sock_ops) ({ 0; })
+ #define BPF_CGROUP_RUN_PROG_DEVICE_CGROUP(atype, major, minor, access) ({ 0; })
+ #define BPF_CGROUP_RUN_PROG_SYSCTL(head,table,write,buf,count,pos) ({ 0; })
+diff --git a/include/linux/filter.h b/include/linux/filter.h
+index 761af6b3cf2b..77db4263d68d 100644
+--- a/include/linux/filter.h
++++ b/include/linux/filter.h
+@@ -1285,6 +1285,7 @@ struct bpf_sock_addr_kern {
+        */
+       u64 tmp_reg;
+       void *t_ctx;    /* Attach type specific context. */
++      u32 uaddrlen;
+ };
+ struct bpf_sock_ops_kern {
+diff --git a/kernel/bpf/cgroup.c b/kernel/bpf/cgroup.c
+index 03b3d4492980..ac37bd53aee0 100644
+--- a/kernel/bpf/cgroup.c
++++ b/kernel/bpf/cgroup.c
+@@ -1450,6 +1450,9 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
+  *                                       provided by user sockaddr
+  * @sk: sock struct that will use sockaddr
+  * @uaddr: sockaddr struct provided by user
++ * @uaddrlen: Pointer to the size of the sockaddr struct provided by user. It is
++ *            read-only for AF_INET[6] uaddr but can be modified for AF_UNIX
++ *            uaddr.
+  * @atype: The type of program to be executed
+  * @t_ctx: Pointer to attach type specific context
+  * @flags: Pointer to u32 which contains higher bits of BPF program
+@@ -1462,6 +1465,7 @@ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sk);
+  */
+ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
+                                     struct sockaddr *uaddr,
++                                    int *uaddrlen,
+                                     enum cgroup_bpf_attach_type atype,
+                                     void *t_ctx,
+                                     u32 *flags)
+@@ -1473,6 +1477,7 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
+       };
+       struct sockaddr_storage unspec;
+       struct cgroup *cgrp;
++      int ret;
+       /* Check socket family since not all sockets represent network
+        * endpoint (e.g. AF_UNIX).
+@@ -1483,11 +1488,19 @@ int __cgroup_bpf_run_filter_sock_addr(struct sock *sk,
+       if (!ctx.uaddr) {
+               memset(&unspec, 0, sizeof(unspec));
+               ctx.uaddr = (struct sockaddr *)&unspec;
++              ctx.uaddrlen = 0;
++      } else {
++              ctx.uaddrlen = *uaddrlen;
+       }
+       cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data);
+-      return bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
+-                                   0, flags);
++      ret = bpf_prog_run_array_cg(&cgrp->bpf, atype, &ctx, bpf_prog_run,
++                                  0, flags);
++
++      if (!ret && uaddr)
++              *uaddrlen = ctx.uaddrlen;
++
++      return ret;
+ }
+ EXPORT_SYMBOL(__cgroup_bpf_run_filter_sock_addr);
+diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
+index b739ddbef0f0..7d4d625471f7 100644
+--- a/net/ipv4/af_inet.c
++++ b/net/ipv4/af_inet.c
+@@ -455,7 +455,7 @@ int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+       /* BPF prog is run before any checks are done so that if the prog
+        * changes context in a wrong way it will be caught.
+        */
+-      err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
++      err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
+                                                CGROUP_INET4_BIND, &flags);
+       if (err)
+               return err;
+@@ -797,6 +797,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
+       struct sock *sk         = sock->sk;
+       struct inet_sock *inet  = inet_sk(sk);
+       DECLARE_SOCKADDR(struct sockaddr_in *, sin, uaddr);
++      int sin_addr_len = sizeof(*sin);
+       sin->sin_family = AF_INET;
+       lock_sock(sk);
+@@ -809,7 +810,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
+               }
+               sin->sin_port = inet->inet_dport;
+               sin->sin_addr.s_addr = inet->inet_daddr;
+-              BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
++              BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
+                                      CGROUP_INET4_GETPEERNAME);
+       } else {
+               __be32 addr = inet->inet_rcv_saddr;
+@@ -817,7 +818,7 @@ int inet_getname(struct socket *sock, struct sockaddr *uaddr,
+                       addr = inet->inet_saddr;
+               sin->sin_port = inet->inet_sport;
+               sin->sin_addr.s_addr = addr;
+-              BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
++              BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
+                                      CGROUP_INET4_GETSOCKNAME);
+       }
+       release_sock(sk);
+diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
+index 75e0aee35eb7..4cb0c896caf9 100644
+--- a/net/ipv4/ping.c
++++ b/net/ipv4/ping.c
+@@ -301,7 +301,7 @@ static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+       if (addr_len < sizeof(struct sockaddr_in))
+               return -EINVAL;
+-      return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);
++      return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
+ }
+ /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */
+diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
+index 4167e8a48b60..c7ffab37a34c 100644
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -194,7 +194,7 @@ static int tcp_v4_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+       sock_owned_by_me(sk);
+-      return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr);
++      return BPF_CGROUP_RUN_PROG_INET4_CONNECT(sk, uaddr, &addr_len);
+ }
+ /* This will initiate an outgoing connection. */
+diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
+index 9cb22a6ae1dc..7be4ddc80d95 100644
+--- a/net/ipv4/udp.c
++++ b/net/ipv4/udp.c
+@@ -1143,7 +1143,9 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+       if (cgroup_bpf_enabled(CGROUP_UDP4_SENDMSG) && !connected) {
+               err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk,
+-                                          (struct sockaddr *)usin, &ipc.addr);
++                                          (struct sockaddr *)usin,
++                                          &msg->msg_namelen,
++                                          &ipc.addr);
+               if (err)
+                       goto out_free;
+               if (usin) {
+@@ -1865,7 +1867,8 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int flags,
+               *addr_len = sizeof(*sin);
+               BPF_CGROUP_RUN_PROG_UDP4_RECVMSG_LOCK(sk,
+-                                                    (struct sockaddr *)sin);
++                                                    (struct sockaddr *)sin,
++                                                    addr_len);
+       }
+       if (udp_test_bit(GRO_ENABLED, sk))
+@@ -1904,7 +1907,7 @@ int udp_pre_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+       if (addr_len < sizeof(struct sockaddr_in))
+               return -EINVAL;
+-      return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr);
++      return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr, &addr_len);
+ }
+ EXPORT_SYMBOL(udp_pre_connect);
+diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
+index b6c5b5e25a2f..4375bfa4f608 100644
+--- a/net/ipv6/af_inet6.c
++++ b/net/ipv6/af_inet6.c
+@@ -456,7 +456,7 @@ int inet6_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
+       /* BPF prog is run before any checks are done so that if the prog
+        * changes context in a wrong way it will be caught.
+        */
+-      err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr,
++      err = BPF_CGROUP_RUN_PROG_INET_BIND_LOCK(sk, uaddr, &addr_len,
+                                                CGROUP_INET6_BIND, &flags);
+       if (err)
+               return err;
+@@ -522,6 +522,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
+                 int peer)
+ {
+       struct sockaddr_in6 *sin = (struct sockaddr_in6 *)uaddr;
++      int sin_addr_len = sizeof(*sin);
+       struct sock *sk = sock->sk;
+       struct inet_sock *inet = inet_sk(sk);
+       struct ipv6_pinfo *np = inet6_sk(sk);
+@@ -541,7 +542,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
+               sin->sin6_addr = sk->sk_v6_daddr;
+               if (np->sndflow)
+                       sin->sin6_flowinfo = np->flow_label;
+-              BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
++              BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
+                                      CGROUP_INET6_GETPEERNAME);
+       } else {
+               if (ipv6_addr_any(&sk->sk_v6_rcv_saddr))
+@@ -549,13 +550,13 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr,
+               else
+                       sin->sin6_addr = sk->sk_v6_rcv_saddr;
+               sin->sin6_port = inet->inet_sport;
+-              BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin,
++              BPF_CGROUP_RUN_SA_PROG(sk, (struct sockaddr *)sin, &sin_addr_len,
+                                      CGROUP_INET6_GETSOCKNAME);
+       }
+       sin->sin6_scope_id = ipv6_iface_scope_id(&sin->sin6_addr,
+                                                sk->sk_bound_dev_if);
+       release_sock(sk);
+-      return sizeof(*sin);
++      return sin_addr_len;
+ }
+ EXPORT_SYMBOL(inet6_getname);
+diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
+index 5831aaa53d75..25243737fbc4 100644
+--- a/net/ipv6/ping.c
++++ b/net/ipv6/ping.c
+@@ -56,7 +56,7 @@ static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+       if (addr_len < SIN6_LEN_RFC2133)
+               return -EINVAL;
+-      return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr);
++      return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len);
+ }
+ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
+index 44b6949d72b2..3783334ef233 100644
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -135,7 +135,7 @@ static int tcp_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+       sock_owned_by_me(sk);
+-      return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr);
++      return BPF_CGROUP_RUN_PROG_INET6_CONNECT(sk, uaddr, &addr_len);
+ }
+ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
+diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
+index f1170dcc21d9..438476a31313 100644
+--- a/net/ipv6/udp.c
++++ b/net/ipv6/udp.c
+@@ -410,7 +410,8 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
+               *addr_len = sizeof(*sin6);
+               BPF_CGROUP_RUN_PROG_UDP6_RECVMSG_LOCK(sk,
+-                                                    (struct sockaddr *)sin6);
++                                                    (struct sockaddr *)sin6,
++                                                    addr_len);
+       }
+       if (udp_test_bit(GRO_ENABLED, sk))
+@@ -1157,7 +1158,7 @@ static int udpv6_pre_connect(struct sock *sk, struct sockaddr *uaddr,
+       if (addr_len < SIN6_LEN_RFC2133)
+               return -EINVAL;
+-      return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr);
++      return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr, &addr_len);
+ }
+ /**
+@@ -1510,6 +1511,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+       if (cgroup_bpf_enabled(CGROUP_UDP6_SENDMSG) && !connected) {
+               err = BPF_CGROUP_RUN_PROG_UDP6_SENDMSG_LOCK(sk,
+                                          (struct sockaddr *)sin6,
++                                         &addr_len,
+                                          &fl6->saddr);
+               if (err)
+                       goto out_no_dst;
+-- 
+2.43.0
+
diff --git a/queue-6.6/btrfs-scrub-avoid-use-after-free-when-chunk-length-i.patch b/queue-6.6/btrfs-scrub-avoid-use-after-free-when-chunk-length-i.patch
new file mode 100644 (file)
index 0000000..0b985ad
--- /dev/null
@@ -0,0 +1,161 @@
+From 4ef6700db3661b10beae452d02a6fc8b0e7570d4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 11:02:25 +1030
+Subject: btrfs: scrub: avoid use-after-free when chunk length is not 64K
+ aligned
+
+From: Qu Wenruo <wqu@suse.com>
+
+[ Upstream commit f546c4282673497a06ecb6190b50ae7f6c85b02f ]
+
+[BUG]
+There is a bug report that, on a ext4-converted btrfs, scrub leads to
+various problems, including:
+
+- "unable to find chunk map" errors
+  BTRFS info (device vdb): scrub: started on devid 1
+  BTRFS critical (device vdb): unable to find chunk map for logical 2214744064 length 4096
+  BTRFS critical (device vdb): unable to find chunk map for logical 2214744064 length 45056
+
+  This would lead to unrepariable errors.
+
+- Use-after-free KASAN reports:
+  ==================================================================
+  BUG: KASAN: slab-use-after-free in __blk_rq_map_sg+0x18f/0x7c0
+  Read of size 8 at addr ffff8881013c9040 by task btrfs/909
+  CPU: 0 PID: 909 Comm: btrfs Not tainted 6.7.0-x64v3-dbg #11 c50636e9419a8354555555245df535e380563b2b
+  Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 2023.11-2 12/24/2023
+  Call Trace:
+   <TASK>
+   dump_stack_lvl+0x43/0x60
+   print_report+0xcf/0x640
+   kasan_report+0xa6/0xd0
+   __blk_rq_map_sg+0x18f/0x7c0
+   virtblk_prep_rq.isra.0+0x215/0x6a0 [virtio_blk 19a65eeee9ae6fcf02edfad39bb9ddee07dcdaff]
+   virtio_queue_rqs+0xc4/0x310 [virtio_blk 19a65eeee9ae6fcf02edfad39bb9ddee07dcdaff]
+   blk_mq_flush_plug_list.part.0+0x780/0x860
+   __blk_flush_plug+0x1ba/0x220
+   blk_finish_plug+0x3b/0x60
+   submit_initial_group_read+0x10a/0x290 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+   flush_scrub_stripes+0x38e/0x430 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+   scrub_stripe+0x82a/0xae0 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+   scrub_chunk+0x178/0x200 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+   scrub_enumerate_chunks+0x4bc/0xa30 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+   btrfs_scrub_dev+0x398/0x810 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+   btrfs_ioctl+0x4b9/0x3020 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+   __x64_sys_ioctl+0xbd/0x100
+   do_syscall_64+0x5d/0xe0
+   entry_SYSCALL_64_after_hwframe+0x63/0x6b
+  RIP: 0033:0x7f47e5e0952b
+
+- Crash, mostly due to above use-after-free
+
+[CAUSE]
+The converted fs has the following data chunk layout:
+
+    item 2 key (FIRST_CHUNK_TREE CHUNK_ITEM 2214658048) itemoff 16025 itemsize 80
+        length 86016 owner 2 stripe_len 65536 type DATA|single
+
+For above logical bytenr 2214744064, it's at the chunk end
+(2214658048 + 86016 = 2214744064).
+
+This means btrfs_submit_bio() would split the bio, and trigger endio
+function for both of the two halves.
+
+However scrub_submit_initial_read() would only expect the endio function
+to be called once, not any more.
+This means the first endio function would already free the bbio::bio,
+leaving the bvec freed, thus the 2nd endio call would lead to
+use-after-free.
+
+[FIX]
+- Make sure scrub_read_endio() only updates bits in its range
+  Since we may read less than 64K at the end of the chunk, we should not
+  touch the bits beyond chunk boundary.
+
+- Make sure scrub_submit_initial_read() only to read the chunk range
+  This is done by calculating the real number of sectors we need to
+  read, and add sector-by-sector to the bio.
+
+Thankfully the scrub read repair path won't need extra fixes:
+
+- scrub_stripe_submit_repair_read()
+  With above fixes, we won't update error bit for range beyond chunk,
+  thus scrub_stripe_submit_repair_read() should never submit any read
+  beyond the chunk.
+
+Reported-by: Rongrong <i@rong.moe>
+Fixes: e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure")
+Tested-by: Rongrong <i@rong.moe>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/scrub.c | 29 ++++++++++++++++++++++-------
+ 1 file changed, 22 insertions(+), 7 deletions(-)
+
+diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
+index 4445a52a0707..12147d0f2805 100644
+--- a/fs/btrfs/scrub.c
++++ b/fs/btrfs/scrub.c
+@@ -1099,12 +1099,22 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
+ static void scrub_read_endio(struct btrfs_bio *bbio)
+ {
+       struct scrub_stripe *stripe = bbio->private;
++      struct bio_vec *bvec;
++      int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
++      int num_sectors;
++      u32 bio_size = 0;
++      int i;
++
++      ASSERT(sector_nr < stripe->nr_sectors);
++      bio_for_each_bvec_all(bvec, &bbio->bio, i)
++              bio_size += bvec->bv_len;
++      num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
+       if (bbio->bio.bi_status) {
+-              bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+-              bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
++              bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
++              bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
+       } else {
+-              bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
++              bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
+       }
+       bio_put(&bbio->bio);
+       if (atomic_dec_and_test(&stripe->pending_io)) {
+@@ -1640,6 +1650,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
+ {
+       struct btrfs_fs_info *fs_info = sctx->fs_info;
+       struct btrfs_bio *bbio;
++      unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
++                                    stripe->bg->length - stripe->logical) >>
++                                fs_info->sectorsize_bits;
+       int mirror = stripe->mirror_num;
+       ASSERT(stripe->bg);
+@@ -1649,14 +1662,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
+       bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
+                              scrub_read_endio, stripe);
+-      /* Read the whole stripe. */
+       bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
+-      for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
++      /* Read the whole range inside the chunk boundary. */
++      for (unsigned int cur = 0; cur < nr_sectors; cur++) {
++              struct page *page = scrub_stripe_get_page(stripe, cur);
++              unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
+               int ret;
+-              ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
++              ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+               /* We should have allocated enough bio vectors. */
+-              ASSERT(ret == PAGE_SIZE);
++              ASSERT(ret == fs_info->sectorsize);
+       }
+       atomic_inc(&stripe->pending_io);
+-- 
+2.43.0
+
diff --git a/queue-6.6/fjes-fix-memleaks-in-fjes_hw_setup.patch b/queue-6.6/fjes-fix-memleaks-in-fjes_hw_setup.patch
new file mode 100644 (file)
index 0000000..9d7928f
--- /dev/null
@@ -0,0 +1,109 @@
+From 026713060d1f9306327ab4782d38b8cc303dfdbd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 01:24:42 +0800
+Subject: fjes: fix memleaks in fjes_hw_setup
+
+From: Zhipeng Lu <alexious@zju.edu.cn>
+
+[ Upstream commit f6cc4b6a3ae53df425771000e9c9540cce9b7bb1 ]
+
+In fjes_hw_setup, it allocates several memory and delay the deallocation
+to the fjes_hw_exit in fjes_probe through the following call chain:
+
+fjes_probe
+  |-> fjes_hw_init
+        |-> fjes_hw_setup
+  |-> fjes_hw_exit
+
+However, when fjes_hw_setup fails, fjes_hw_exit won't be called and thus
+all the resources allocated in fjes_hw_setup will be leaked. In this
+patch, we free those resources in fjes_hw_setup and prevents such leaks.
+
+Fixes: 2fcbca687702 ("fjes: platform_driver's .probe and .remove routine")
+Signed-off-by: Zhipeng Lu <alexious@zju.edu.cn>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240122172445.3841883-1-alexious@zju.edu.cn
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/fjes/fjes_hw.c | 37 ++++++++++++++++++++++++++++++-------
+ 1 file changed, 30 insertions(+), 7 deletions(-)
+
+diff --git a/drivers/net/fjes/fjes_hw.c b/drivers/net/fjes/fjes_hw.c
+index 704e949484d0..b9b5554ea862 100644
+--- a/drivers/net/fjes/fjes_hw.c
++++ b/drivers/net/fjes/fjes_hw.c
+@@ -221,21 +221,25 @@ static int fjes_hw_setup(struct fjes_hw *hw)
+       mem_size = FJES_DEV_REQ_BUF_SIZE(hw->max_epid);
+       hw->hw_info.req_buf = kzalloc(mem_size, GFP_KERNEL);
+-      if (!(hw->hw_info.req_buf))
+-              return -ENOMEM;
++      if (!(hw->hw_info.req_buf)) {
++              result = -ENOMEM;
++              goto free_ep_info;
++      }
+       hw->hw_info.req_buf_size = mem_size;
+       mem_size = FJES_DEV_RES_BUF_SIZE(hw->max_epid);
+       hw->hw_info.res_buf = kzalloc(mem_size, GFP_KERNEL);
+-      if (!(hw->hw_info.res_buf))
+-              return -ENOMEM;
++      if (!(hw->hw_info.res_buf)) {
++              result = -ENOMEM;
++              goto free_req_buf;
++      }
+       hw->hw_info.res_buf_size = mem_size;
+       result = fjes_hw_alloc_shared_status_region(hw);
+       if (result)
+-              return result;
++              goto free_res_buf;
+       hw->hw_info.buffer_share_bit = 0;
+       hw->hw_info.buffer_unshare_reserve_bit = 0;
+@@ -246,11 +250,11 @@ static int fjes_hw_setup(struct fjes_hw *hw)
+                       result = fjes_hw_alloc_epbuf(&buf_pair->tx);
+                       if (result)
+-                              return result;
++                              goto free_epbuf;
+                       result = fjes_hw_alloc_epbuf(&buf_pair->rx);
+                       if (result)
+-                              return result;
++                              goto free_epbuf;
+                       spin_lock_irqsave(&hw->rx_status_lock, flags);
+                       fjes_hw_setup_epbuf(&buf_pair->tx, mac,
+@@ -273,6 +277,25 @@ static int fjes_hw_setup(struct fjes_hw *hw)
+       fjes_hw_init_command_registers(hw, &param);
+       return 0;
++
++free_epbuf:
++      for (epidx = 0; epidx < hw->max_epid ; epidx++) {
++              if (epidx == hw->my_epid)
++                      continue;
++              fjes_hw_free_epbuf(&hw->ep_shm_info[epidx].tx);
++              fjes_hw_free_epbuf(&hw->ep_shm_info[epidx].rx);
++      }
++      fjes_hw_free_shared_status_region(hw);
++free_res_buf:
++      kfree(hw->hw_info.res_buf);
++      hw->hw_info.res_buf = NULL;
++free_req_buf:
++      kfree(hw->hw_info.req_buf);
++      hw->hw_info.req_buf = NULL;
++free_ep_info:
++      kfree(hw->ep_shm_info);
++      hw->ep_shm_info = NULL;
++      return result;
+ }
+ static void fjes_hw_cleanup(struct fjes_hw *hw)
+-- 
+2.43.0
+
diff --git a/queue-6.6/i40e-handle-multi-buffer-packets-that-are-shrunk-by-.patch b/queue-6.6/i40e-handle-multi-buffer-packets-that-are-shrunk-by-.patch
new file mode 100644 (file)
index 0000000..bfb51a9
--- /dev/null
@@ -0,0 +1,170 @@
+From 815b14ad14c128be5bd96027884270db7cf46ee7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:56 +0100
+Subject: i40e: handle multi-buffer packets that are shrunk by xdp prog
+
+From: Tirthendu Sarkar <tirthendu.sarkar@intel.com>
+
+[ Upstream commit 83014323c642b8faa2d64a5f303b41c019322478 ]
+
+XDP programs can shrink packets by calling the bpf_xdp_adjust_tail()
+helper function. For multi-buffer packets this may lead to reduction of
+frag count stored in skb_shared_info area of the xdp_buff struct. This
+results in issues with the current handling of XDP_PASS and XDP_DROP
+cases.
+
+For XDP_PASS, currently skb is being built using frag count of
+xdp_buffer before it was processed by XDP prog and thus will result in
+an inconsistent skb when frag count gets reduced by XDP prog. To fix
+this, get correct frag count while building the skb instead of using
+pre-obtained frag count.
+
+For XDP_DROP, current page recycling logic will not reuse the page but
+instead will adjust the pagecnt_bias so that the page can be freed. This
+again results in inconsistent behavior as the page refcnt has already
+been changed by the helper while freeing the frag(s) as part of
+shrinking the packet. To fix this, only adjust pagecnt_bias for buffers
+that are stillpart of the packet post-xdp prog run.
+
+Fixes: e213ced19bef ("i40e: add support for XDP multi-buffer Rx")
+Reported-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Signed-off-by: Tirthendu Sarkar <tirthendu.sarkar@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-6-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_txrx.c | 40 ++++++++++++---------
+ 1 file changed, 23 insertions(+), 17 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+index b047c587629b..2e5546e549d9 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+@@ -2100,7 +2100,8 @@ static void i40e_put_rx_buffer(struct i40e_ring *rx_ring,
+ static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res,
+                                 struct xdp_buff *xdp)
+ {
+-      u32 next = rx_ring->next_to_clean;
++      u32 nr_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags;
++      u32 next = rx_ring->next_to_clean, i = 0;
+       struct i40e_rx_buffer *rx_buffer;
+       xdp->flags = 0;
+@@ -2113,10 +2114,10 @@ static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res,
+               if (!rx_buffer->page)
+                       continue;
+-              if (xdp_res == I40E_XDP_CONSUMED)
+-                      rx_buffer->pagecnt_bias++;
+-              else
++              if (xdp_res != I40E_XDP_CONSUMED)
+                       i40e_rx_buffer_flip(rx_buffer, xdp->frame_sz);
++              else if (i++ <= nr_frags)
++                      rx_buffer->pagecnt_bias++;
+               /* EOP buffer will be put in i40e_clean_rx_irq() */
+               if (next == rx_ring->next_to_process)
+@@ -2130,20 +2131,20 @@ static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res,
+  * i40e_construct_skb - Allocate skb and populate it
+  * @rx_ring: rx descriptor ring to transact packets on
+  * @xdp: xdp_buff pointing to the data
+- * @nr_frags: number of buffers for the packet
+  *
+  * This function allocates an skb.  It then populates it with the page
+  * data from the current receive descriptor, taking care to set up the
+  * skb correctly.
+  */
+ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
+-                                        struct xdp_buff *xdp,
+-                                        u32 nr_frags)
++                                        struct xdp_buff *xdp)
+ {
+       unsigned int size = xdp->data_end - xdp->data;
+       struct i40e_rx_buffer *rx_buffer;
++      struct skb_shared_info *sinfo;
+       unsigned int headlen;
+       struct sk_buff *skb;
++      u32 nr_frags = 0;
+       /* prefetch first cache line of first page */
+       net_prefetch(xdp->data);
+@@ -2181,6 +2182,10 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
+       memcpy(__skb_put(skb, headlen), xdp->data,
+              ALIGN(headlen, sizeof(long)));
++      if (unlikely(xdp_buff_has_frags(xdp))) {
++              sinfo = xdp_get_shared_info_from_buff(xdp);
++              nr_frags = sinfo->nr_frags;
++      }
+       rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
+       /* update all of the pointers */
+       size -= headlen;
+@@ -2200,9 +2205,8 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
+       }
+       if (unlikely(xdp_buff_has_frags(xdp))) {
+-              struct skb_shared_info *sinfo, *skinfo = skb_shinfo(skb);
++              struct skb_shared_info *skinfo = skb_shinfo(skb);
+-              sinfo = xdp_get_shared_info_from_buff(xdp);
+               memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0],
+                      sizeof(skb_frag_t) * nr_frags);
+@@ -2225,17 +2229,17 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
+  * i40e_build_skb - Build skb around an existing buffer
+  * @rx_ring: Rx descriptor ring to transact packets on
+  * @xdp: xdp_buff pointing to the data
+- * @nr_frags: number of buffers for the packet
+  *
+  * This function builds an skb around an existing Rx buffer, taking care
+  * to set up the skb correctly and avoid any memcpy overhead.
+  */
+ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
+-                                    struct xdp_buff *xdp,
+-                                    u32 nr_frags)
++                                    struct xdp_buff *xdp)
+ {
+       unsigned int metasize = xdp->data - xdp->data_meta;
++      struct skb_shared_info *sinfo;
+       struct sk_buff *skb;
++      u32 nr_frags;
+       /* Prefetch first cache line of first page. If xdp->data_meta
+        * is unused, this points exactly as xdp->data, otherwise we
+@@ -2244,6 +2248,11 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
+        */
+       net_prefetch(xdp->data_meta);
++      if (unlikely(xdp_buff_has_frags(xdp))) {
++              sinfo = xdp_get_shared_info_from_buff(xdp);
++              nr_frags = sinfo->nr_frags;
++      }
++
+       /* build an skb around the page buffer */
+       skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz);
+       if (unlikely(!skb))
+@@ -2256,9 +2265,6 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
+               skb_metadata_set(skb, metasize);
+       if (unlikely(xdp_buff_has_frags(xdp))) {
+-              struct skb_shared_info *sinfo;
+-
+-              sinfo = xdp_get_shared_info_from_buff(xdp);
+               xdp_update_skb_shared_info(skb, nr_frags,
+                                          sinfo->xdp_frags_size,
+                                          nr_frags * xdp->frame_sz,
+@@ -2603,9 +2609,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget,
+                       total_rx_bytes += size;
+               } else {
+                       if (ring_uses_build_skb(rx_ring))
+-                              skb = i40e_build_skb(rx_ring, xdp, nfrags);
++                              skb = i40e_build_skb(rx_ring, xdp);
+                       else
+-                              skb = i40e_construct_skb(rx_ring, xdp, nfrags);
++                              skb = i40e_construct_skb(rx_ring, xdp);
+                       /* drop if we failed to retrieve a buffer */
+                       if (!skb) {
+-- 
+2.43.0
+
diff --git a/queue-6.6/i40e-set-xdp_rxq_info-frag_size.patch b/queue-6.6/i40e-set-xdp_rxq_info-frag_size.patch
new file mode 100644 (file)
index 0000000..e2118c1
--- /dev/null
@@ -0,0 +1,130 @@
+From 1ea5026b3fb307201dd905d17d22c043163bd1fc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:16:01 +0100
+Subject: i40e: set xdp_rxq_info::frag_size
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit a045d2f2d03d23e7db6772dd83e0ba2705dfad93 ]
+
+i40e support XDP multi-buffer so it is supposed to use
+__xdp_rxq_info_reg() instead of xdp_rxq_info_reg() and set the
+frag_size. It can not be simply converted at existing callsite because
+rx_buf_len could be un-initialized, so let us register xdp_rxq_info
+within i40e_configure_rx_ring(), which happen to be called with already
+initialized rx_buf_len value.
+
+Commit 5180ff1364bc ("i40e: use int for i40e_status") converted 'err' to
+int, so two variables to deal with return codes are not needed within
+i40e_configure_rx_ring(). Remove 'ret' and use 'err' to handle status
+from xdp_rxq_info registration.
+
+Fixes: e213ced19bef ("i40e: add support for XDP multi-buffer Rx")
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-11-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_main.c | 40 ++++++++++++---------
+ drivers/net/ethernet/intel/i40e/i40e_txrx.c |  9 -----
+ 2 files changed, 24 insertions(+), 25 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
+index 5b20eba93d04..aadca7b3443c 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
+@@ -3578,40 +3578,48 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
+       struct i40e_hmc_obj_rxq rx_ctx;
+       int err = 0;
+       bool ok;
+-      int ret;
+       bitmap_zero(ring->state, __I40E_RING_STATE_NBITS);
+       /* clear the context structure first */
+       memset(&rx_ctx, 0, sizeof(rx_ctx));
+-      if (ring->vsi->type == I40E_VSI_MAIN)
+-              xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
++      ring->rx_buf_len = vsi->rx_buf_len;
++
++      /* XDP RX-queue info only needed for RX rings exposed to XDP */
++      if (ring->vsi->type != I40E_VSI_MAIN)
++              goto skip;
++
++      if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) {
++              err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
++                                       ring->queue_index,
++                                       ring->q_vector->napi.napi_id,
++                                       ring->rx_buf_len);
++              if (err)
++                      return err;
++      }
+       ring->xsk_pool = i40e_xsk_pool(ring);
+       if (ring->xsk_pool) {
+-              ring->rx_buf_len =
+-                xsk_pool_get_rx_frame_size(ring->xsk_pool);
+-              ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
++              ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool);
++              err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+                                                MEM_TYPE_XSK_BUFF_POOL,
+                                                NULL);
+-              if (ret)
+-                      return ret;
++              if (err)
++                      return err;
+               dev_info(&vsi->back->pdev->dev,
+                        "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
+                        ring->queue_index);
+       } else {
+-              ring->rx_buf_len = vsi->rx_buf_len;
+-              if (ring->vsi->type == I40E_VSI_MAIN) {
+-                      ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+-                                                       MEM_TYPE_PAGE_SHARED,
+-                                                       NULL);
+-                      if (ret)
+-                              return ret;
+-              }
++              err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
++                                               MEM_TYPE_PAGE_SHARED,
++                                               NULL);
++              if (err)
++                      return err;
+       }
++skip:
+       xdp_init_buff(&ring->xdp, i40e_rx_pg_size(ring) / 2, &ring->xdp_rxq);
+       rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len,
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+index 2e5546e549d9..1df2f9338812 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+@@ -1556,7 +1556,6 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring)
+ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
+ {
+       struct device *dev = rx_ring->dev;
+-      int err;
+       u64_stats_init(&rx_ring->syncp);
+@@ -1577,14 +1576,6 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
+       rx_ring->next_to_process = 0;
+       rx_ring->next_to_use = 0;
+-      /* XDP RX-queue info only needed for RX rings exposed to XDP */
+-      if (rx_ring->vsi->type == I40E_VSI_MAIN) {
+-              err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
+-                                     rx_ring->queue_index, rx_ring->q_vector->napi.napi_id);
+-              if (err < 0)
+-                      return err;
+-      }
+-
+       rx_ring->xdp_prog = rx_ring->vsi->xdp_prog;
+       rx_ring->rx_bi =
+-- 
+2.43.0
+
diff --git a/queue-6.6/i40e-update-xdp_rxq_info-frag_size-for-zc-enabled-rx.patch b/queue-6.6/i40e-update-xdp_rxq_info-frag_size-for-zc-enabled-rx.patch
new file mode 100644 (file)
index 0000000..1e42f1b
--- /dev/null
@@ -0,0 +1,46 @@
+From c2cb36664bc2bd6cd8915c9bd1e82b10d85c8a0c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:16:02 +0100
+Subject: i40e: update xdp_rxq_info::frag_size for ZC enabled Rx queue
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 0cbb08707c932b3f004bc1a8ec6200ef572c1f5f ]
+
+Now that i40e driver correctly sets up frag_size in xdp_rxq_info, let us
+make it work for ZC multi-buffer as well. i40e_ring::rx_buf_len for ZC
+is being set via xsk_pool_get_rx_frame_size() and this needs to be
+propagated up to xdp_rxq_info.
+
+Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-12-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_main.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
+index aadca7b3443c..aad39ebff4ab 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
+@@ -3601,7 +3601,14 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
+       ring->xsk_pool = i40e_xsk_pool(ring);
+       if (ring->xsk_pool) {
++              xdp_rxq_info_unreg(&ring->xdp_rxq);
+               ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool);
++              err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
++                                       ring->queue_index,
++                                       ring->q_vector->napi.napi_id,
++                                       ring->rx_buf_len);
++              if (err)
++                      return err;
+               err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+                                                MEM_TYPE_XSK_BUFF_POOL,
+                                                NULL);
+-- 
+2.43.0
+
diff --git a/queue-6.6/ice-remove-redundant-xdp_rxq_info-registration.patch b/queue-6.6/ice-remove-redundant-xdp_rxq_info-registration.patch
new file mode 100644 (file)
index 0000000..81d2c84
--- /dev/null
@@ -0,0 +1,58 @@
+From 0cdbd0273b7e38f65a667530e5aa5049a8a6634f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:57 +0100
+Subject: ice: remove redundant xdp_rxq_info registration
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 2ee788c06493d02ee85855414cca39825e768aaf ]
+
+xdp_rxq_info struct can be registered by drivers via two functions -
+xdp_rxq_info_reg() and __xdp_rxq_info_reg(). The latter one allows
+drivers that support XDP multi-buffer to set up xdp_rxq_info::frag_size
+which in turn will make it possible to grow the packet via
+bpf_xdp_adjust_tail() BPF helper.
+
+Currently, ice registers xdp_rxq_info in two spots:
+1) ice_setup_rx_ring() // via xdp_rxq_info_reg(), BUG
+2) ice_vsi_cfg_rxq()   // via __xdp_rxq_info_reg(), OK
+
+Cited commit under fixes tag took care of setting up frag_size and
+updated registration scheme in 2) but it did not help as
+1) is called before 2) and as shown above it uses old registration
+function. This means that 2) sees that xdp_rxq_info is already
+registered and never calls __xdp_rxq_info_reg() which leaves us with
+xdp_rxq_info::frag_size being set to 0.
+
+To fix this misbehavior, simply remove xdp_rxq_info_reg() call from
+ice_setup_rx_ring().
+
+Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-7-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_txrx.c | 5 -----
+ 1 file changed, 5 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
+index 5b0f9e53f6b4..24c914015973 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
+@@ -513,11 +513,6 @@ int ice_setup_rx_ring(struct ice_rx_ring *rx_ring)
+       if (ice_is_xdp_ena_vsi(rx_ring->vsi))
+               WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog);
+-      if (rx_ring->vsi->type == ICE_VSI_PF &&
+-          !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
+-              if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
+-                                   rx_ring->q_index, rx_ring->q_vector->napi.napi_id))
+-                      goto err;
+       return 0;
+ err:
+-- 
+2.43.0
+
diff --git a/queue-6.6/ice-update-xdp_rxq_info-frag_size-for-zc-enabled-rx-.patch b/queue-6.6/ice-update-xdp_rxq_info-frag_size-for-zc-enabled-rx-.patch
new file mode 100644 (file)
index 0000000..775ff6b
--- /dev/null
@@ -0,0 +1,91 @@
+From f38782f1f2c6fc2c8e5b7c7f66b5ec2ca703bea6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:59 +0100
+Subject: ice: update xdp_rxq_info::frag_size for ZC enabled Rx queue
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 3de38c87174225487fc93befeea7d380db80aef6 ]
+
+Now that ice driver correctly sets up frag_size in xdp_rxq_info, let us
+make it work for ZC multi-buffer as well. ice_rx_ring::rx_buf_len for ZC
+is being set via xsk_pool_get_rx_frame_size() and this needs to be
+propagated up to xdp_rxq_info.
+
+Use a bigger hammer and instead of unregistering only xdp_rxq_info's
+memory model, unregister it altogether and register it again and have
+xdp_rxq_info with correct frag_size value.
+
+Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support")
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-9-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_base.c | 37 ++++++++++++++---------
+ 1 file changed, 23 insertions(+), 14 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c
+index 7fa43827a3f0..4f3e65b47cdc 100644
+--- a/drivers/net/ethernet/intel/ice/ice_base.c
++++ b/drivers/net/ethernet/intel/ice/ice_base.c
+@@ -534,19 +534,27 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring)
+       ring->rx_buf_len = ring->vsi->rx_buf_len;
+       if (ring->vsi->type == ICE_VSI_PF) {
+-              if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
+-                      /* coverity[check_return] */
+-                      __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
+-                                         ring->q_index,
+-                                         ring->q_vector->napi.napi_id,
+-                                         ring->vsi->rx_buf_len);
++              if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) {
++                      err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
++                                               ring->q_index,
++                                               ring->q_vector->napi.napi_id,
++                                               ring->rx_buf_len);
++                      if (err)
++                              return err;
++              }
+               ring->xsk_pool = ice_xsk_pool(ring);
+               if (ring->xsk_pool) {
+-                      xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
++                      xdp_rxq_info_unreg(&ring->xdp_rxq);
+                       ring->rx_buf_len =
+                               xsk_pool_get_rx_frame_size(ring->xsk_pool);
++                      err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
++                                               ring->q_index,
++                                               ring->q_vector->napi.napi_id,
++                                               ring->rx_buf_len);
++                      if (err)
++                              return err;
+                       err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+                                                        MEM_TYPE_XSK_BUFF_POOL,
+                                                        NULL);
+@@ -557,13 +565,14 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring)
+                       dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
+                                ring->q_index);
+               } else {
+-                      if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
+-                              /* coverity[check_return] */
+-                              __xdp_rxq_info_reg(&ring->xdp_rxq,
+-                                                 ring->netdev,
+-                                                 ring->q_index,
+-                                                 ring->q_vector->napi.napi_id,
+-                                                 ring->vsi->rx_buf_len);
++                      if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) {
++                              err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
++                                                       ring->q_index,
++                                                       ring->q_vector->napi.napi_id,
++                                                       ring->rx_buf_len);
++                              if (err)
++                                      return err;
++                      }
+                       err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+                                                        MEM_TYPE_PAGE_SHARED,
+-- 
+2.43.0
+
diff --git a/queue-6.6/ice-work-on-pre-xdp-prog-frag-count.patch b/queue-6.6/ice-work-on-pre-xdp-prog-frag-count.patch
new file mode 100644 (file)
index 0000000..835b358
--- /dev/null
@@ -0,0 +1,170 @@
+From f73ea006fb6cee5b55577c94988f0e702012fe2c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:55 +0100
+Subject: ice: work on pre-XDP prog frag count
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit ad2047cf5d9313200e308612aed516548873d124 ]
+
+Fix an OOM panic in XDP_DRV mode when a XDP program shrinks a
+multi-buffer packet by 4k bytes and then redirects it to an AF_XDP
+socket.
+
+Since support for handling multi-buffer frames was added to XDP, usage
+of bpf_xdp_adjust_tail() helper within XDP program can free the page
+that given fragment occupies and in turn decrease the fragment count
+within skb_shared_info that is embedded in xdp_buff struct. In current
+ice driver codebase, it can become problematic when page recycling logic
+decides not to reuse the page. In such case, __page_frag_cache_drain()
+is used with ice_rx_buf::pagecnt_bias that was not adjusted after
+refcount of page was changed by XDP prog which in turn does not drain
+the refcount to 0 and page is never freed.
+
+To address this, let us store the count of frags before the XDP program
+was executed on Rx ring struct. This will be used to compare with
+current frag count from skb_shared_info embedded in xdp_buff. A smaller
+value in the latter indicates that XDP prog freed frag(s). Then, for
+given delta decrement pagecnt_bias for XDP_DROP verdict.
+
+While at it, let us also handle the EOP frag within
+ice_set_rx_bufs_act() to make our life easier, so all of the adjustments
+needed to be applied against freed frags are performed in the single
+place.
+
+Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-5-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_txrx.c     | 14 ++++++---
+ drivers/net/ethernet/intel/ice/ice_txrx.h     |  1 +
+ drivers/net/ethernet/intel/ice/ice_txrx_lib.h | 31 +++++++++++++------
+ 3 files changed, 32 insertions(+), 14 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
+index 52d0a126eb61..5b0f9e53f6b4 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
+@@ -600,9 +600,7 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
+               ret = ICE_XDP_CONSUMED;
+       }
+ exit:
+-      rx_buf->act = ret;
+-      if (unlikely(xdp_buff_has_frags(xdp)))
+-              ice_set_rx_bufs_act(xdp, rx_ring, ret);
++      ice_set_rx_bufs_act(xdp, rx_ring, ret);
+ }
+ /**
+@@ -890,14 +888,17 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
+       }
+       if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) {
+-              if (unlikely(xdp_buff_has_frags(xdp)))
+-                      ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED);
++              ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED);
+               return -ENOMEM;
+       }
+       __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page,
+                                  rx_buf->page_offset, size);
+       sinfo->xdp_frags_size += size;
++      /* remember frag count before XDP prog execution; bpf_xdp_adjust_tail()
++       * can pop off frags but driver has to handle it on its own
++       */
++      rx_ring->nr_frags = sinfo->nr_frags;
+       if (page_is_pfmemalloc(rx_buf->page))
+               xdp_buff_set_frag_pfmemalloc(xdp);
+@@ -1249,6 +1250,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+               xdp->data = NULL;
+               rx_ring->first_desc = ntc;
++              rx_ring->nr_frags = 0;
+               continue;
+ construct_skb:
+               if (likely(ice_ring_uses_build_skb(rx_ring)))
+@@ -1264,10 +1266,12 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+                                                   ICE_XDP_CONSUMED);
+                       xdp->data = NULL;
+                       rx_ring->first_desc = ntc;
++                      rx_ring->nr_frags = 0;
+                       break;
+               }
+               xdp->data = NULL;
+               rx_ring->first_desc = ntc;
++              rx_ring->nr_frags = 0;
+               stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
+               if (unlikely(ice_test_staterr(rx_desc->wb.status_error0,
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
+index 166413fc33f4..407d4c320097 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
+@@ -333,6 +333,7 @@ struct ice_rx_ring {
+       struct ice_channel *ch;
+       struct ice_tx_ring *xdp_ring;
+       struct xsk_buff_pool *xsk_pool;
++      u32 nr_frags;
+       dma_addr_t dma;                 /* physical address of ring */
+       u64 cached_phctime;
+       u16 rx_buf_len;
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h
+index 115969ecdf7b..b0e56675f98b 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h
++++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h
+@@ -12,26 +12,39 @@
+  * act: action to store onto Rx buffers related to XDP buffer parts
+  *
+  * Set action that should be taken before putting Rx buffer from first frag
+- * to one before last. Last one is handled by caller of this function as it
+- * is the EOP frag that is currently being processed. This function is
+- * supposed to be called only when XDP buffer contains frags.
++ * to the last.
+  */
+ static inline void
+ ice_set_rx_bufs_act(struct xdp_buff *xdp, const struct ice_rx_ring *rx_ring,
+                   const unsigned int act)
+ {
+-      const struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+-      u32 first = rx_ring->first_desc;
+-      u32 nr_frags = sinfo->nr_frags;
++      u32 sinfo_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags;
++      u32 nr_frags = rx_ring->nr_frags + 1;
++      u32 idx = rx_ring->first_desc;
+       u32 cnt = rx_ring->count;
+       struct ice_rx_buf *buf;
+       for (int i = 0; i < nr_frags; i++) {
+-              buf = &rx_ring->rx_buf[first];
++              buf = &rx_ring->rx_buf[idx];
+               buf->act = act;
+-              if (++first == cnt)
+-                      first = 0;
++              if (++idx == cnt)
++                      idx = 0;
++      }
++
++      /* adjust pagecnt_bias on frags freed by XDP prog */
++      if (sinfo_frags < rx_ring->nr_frags && act == ICE_XDP_CONSUMED) {
++              u32 delta = rx_ring->nr_frags - sinfo_frags;
++
++              while (delta) {
++                      if (idx == 0)
++                              idx = cnt - 1;
++                      else
++                              idx--;
++                      buf = &rx_ring->rx_buf[idx];
++                      buf->pagecnt_bias--;
++                      delta--;
++              }
+       }
+ }
+-- 
+2.43.0
+
diff --git a/queue-6.6/intel-xsk-initialize-skb_frag_t-bv_offset-in-zc-driv.patch b/queue-6.6/intel-xsk-initialize-skb_frag_t-bv_offset-in-zc-driv.patch
new file mode 100644 (file)
index 0000000..a4686ed
--- /dev/null
@@ -0,0 +1,60 @@
+From 419c1ea790a3ac9d23f0cb06f5d86f0f6e4b2608 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:58 +0100
+Subject: intel: xsk: initialize skb_frag_t::bv_offset in ZC drivers
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 290779905d09d5fdf6caa4f58ddefc3f4db0c0a9 ]
+
+Ice and i40e ZC drivers currently set offset of a frag within
+skb_shared_info to 0, which is incorrect. xdp_buffs that come from
+xsk_buff_pool always have 256 bytes of a headroom, so they need to be
+taken into account to retrieve xdp_buff::data via skb_frag_address().
+Otherwise, bpf_xdp_frags_increase_tail() would be starting its job from
+xdp_buff::data_hard_start which would result in overwriting existing
+payload.
+
+Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support")
+Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-8-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_xsk.c | 3 ++-
+ drivers/net/ethernet/intel/ice/ice_xsk.c   | 3 ++-
+ 2 files changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+index b75e6b6d317c..1f8ae6f5d980 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+@@ -418,7 +418,8 @@ i40e_add_xsk_frag(struct i40e_ring *rx_ring, struct xdp_buff *first,
+       }
+       __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++,
+-                                 virt_to_page(xdp->data_hard_start), 0, size);
++                                 virt_to_page(xdp->data_hard_start),
++                                 XDP_PACKET_HEADROOM, size);
+       sinfo->xdp_frags_size += size;
+       xsk_buff_add_frag(xdp);
+diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
+index 33f194c870bb..307c609137bd 100644
+--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
++++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
+@@ -826,7 +826,8 @@ ice_add_xsk_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *first,
+       }
+       __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++,
+-                                 virt_to_page(xdp->data_hard_start), 0, size);
++                                 virt_to_page(xdp->data_hard_start),
++                                 XDP_PACKET_HEADROOM, size);
+       sinfo->xdp_frags_size += size;
+       xsk_buff_add_frag(xdp);
+-- 
+2.43.0
+
diff --git a/queue-6.6/ipv6-init-the-accept_queue-s-spinlocks-in-inet6_crea.patch b/queue-6.6/ipv6-init-the-accept_queue-s-spinlocks-in-inet6_crea.patch
new file mode 100644 (file)
index 0000000..5d0ec48
--- /dev/null
@@ -0,0 +1,70 @@
+From aaeffb5c99d462e2f6f3a27a6ce9171c60ed2741 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 18:20:01 +0800
+Subject: ipv6: init the accept_queue's spinlocks in inet6_create
+
+From: Zhengchao Shao <shaozhengchao@huawei.com>
+
+[ Upstream commit 435e202d645c197dcfd39d7372eb2a56529b6640 ]
+
+In commit 198bc90e0e73("tcp: make sure init the accept_queue's spinlocks
+once"), the spinlocks of accept_queue are initialized only when socket is
+created in the inet4 scenario. The locks are not initialized when socket
+is created in the inet6 scenario. The kernel reports the following error:
+INFO: trying to register non-static key.
+The code is fine but needs lockdep annotation, or maybe
+you didn't initialize this object before use?
+turning off the locking correctness validator.
+Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
+Call Trace:
+<TASK>
+       dump_stack_lvl (lib/dump_stack.c:107)
+       register_lock_class (kernel/locking/lockdep.c:1289)
+       __lock_acquire (kernel/locking/lockdep.c:5015)
+       lock_acquire.part.0 (kernel/locking/lockdep.c:5756)
+       _raw_spin_lock_bh (kernel/locking/spinlock.c:178)
+       inet_csk_listen_stop (net/ipv4/inet_connection_sock.c:1386)
+       tcp_disconnect (net/ipv4/tcp.c:2981)
+       inet_shutdown (net/ipv4/af_inet.c:935)
+       __sys_shutdown (./include/linux/file.h:32 net/socket.c:2438)
+       __x64_sys_shutdown (net/socket.c:2445)
+       do_syscall_64 (arch/x86/entry/common.c:52)
+       entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129)
+RIP: 0033:0x7f52ecd05a3d
+Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7
+48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff
+ff 73 01 c3 48 8b 0d ab a3 0e 00 f7 d8 64 89 01 48
+RSP: 002b:00007f52ecf5dde8 EFLAGS: 00000293 ORIG_RAX: 0000000000000030
+RAX: ffffffffffffffda RBX: 00007f52ecf5e640 RCX: 00007f52ecd05a3d
+RDX: 00007f52ecc8b188 RSI: 0000000000000000 RDI: 0000000000000004
+RBP: 00007f52ecf5de20 R08: 00007ffdae45c69f R09: 0000000000000000
+R10: 0000000000000000 R11: 0000000000000293 R12: 00007f52ecf5e640
+R13: 0000000000000000 R14: 00007f52ecc8b060 R15: 00007ffdae45c6e0
+
+Fixes: 198bc90e0e73 ("tcp: make sure init the accept_queue's spinlocks once")
+Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20240122102001.2851701-1-shaozhengchao@huawei.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv6/af_inet6.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
+index 368824fe9719..b6c5b5e25a2f 100644
+--- a/net/ipv6/af_inet6.c
++++ b/net/ipv6/af_inet6.c
+@@ -199,6 +199,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
+       if (INET_PROTOSW_REUSE & answer_flags)
+               sk->sk_reuse = SK_CAN_REUSE;
++      if (INET_PROTOSW_ICSK & answer_flags)
++              inet_init_csk_locks(sk);
++
+       inet = inet_sk(sk);
+       inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags);
+-- 
+2.43.0
+
diff --git a/queue-6.6/llc-drop-support-for-eth_p_tr_802_2.patch b/queue-6.6/llc-drop-support-for-eth_p_tr_802_2.patch
new file mode 100644 (file)
index 0000000..ea3b04d
--- /dev/null
@@ -0,0 +1,130 @@
+From c75db2d6fe464bfca6de917ba19e9eb3c5e80953 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 17:55:15 -0800
+Subject: llc: Drop support for ETH_P_TR_802_2.
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+[ Upstream commit e3f9bed9bee261e3347131764e42aeedf1ffea61 ]
+
+syzbot reported an uninit-value bug below. [0]
+
+llc supports ETH_P_802_2 (0x0004) and used to support ETH_P_TR_802_2
+(0x0011), and syzbot abused the latter to trigger the bug.
+
+  write$tun(r0, &(0x7f0000000040)={@val={0x0, 0x11}, @val, @mpls={[], @llc={@snap={0xaa, 0x1, ')', "90e5dd"}}}}, 0x16)
+
+llc_conn_handler() initialises local variables {saddr,daddr}.mac
+based on skb in llc_pdu_decode_sa()/llc_pdu_decode_da() and passes
+them to __llc_lookup().
+
+However, the initialisation is done only when skb->protocol is
+htons(ETH_P_802_2), otherwise, __llc_lookup_established() and
+__llc_lookup_listener() will read garbage.
+
+The missing initialisation existed prior to commit 211ed865108e
+("net: delete all instances of special processing for token ring").
+
+It removed the part to kick out the token ring stuff but forgot to
+close the door allowing ETH_P_TR_802_2 packets to sneak into llc_rcv().
+
+Let's remove llc_tr_packet_type and complete the deprecation.
+
+[0]:
+BUG: KMSAN: uninit-value in __llc_lookup_established+0xe9d/0xf90
+ __llc_lookup_established+0xe9d/0xf90
+ __llc_lookup net/llc/llc_conn.c:611 [inline]
+ llc_conn_handler+0x4bd/0x1360 net/llc/llc_conn.c:791
+ llc_rcv+0xfbb/0x14a0 net/llc/llc_input.c:206
+ __netif_receive_skb_one_core net/core/dev.c:5527 [inline]
+ __netif_receive_skb+0x1a6/0x5a0 net/core/dev.c:5641
+ netif_receive_skb_internal net/core/dev.c:5727 [inline]
+ netif_receive_skb+0x58/0x660 net/core/dev.c:5786
+ tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1555
+ tun_get_user+0x53af/0x66d0 drivers/net/tun.c:2002
+ tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048
+ call_write_iter include/linux/fs.h:2020 [inline]
+ new_sync_write fs/read_write.c:491 [inline]
+ vfs_write+0x8ef/0x1490 fs/read_write.c:584
+ ksys_write+0x20f/0x4c0 fs/read_write.c:637
+ __do_sys_write fs/read_write.c:649 [inline]
+ __se_sys_write fs/read_write.c:646 [inline]
+ __x64_sys_write+0x93/0xd0 fs/read_write.c:646
+ do_syscall_x64 arch/x86/entry/common.c:51 [inline]
+ do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82
+ entry_SYSCALL_64_after_hwframe+0x63/0x6b
+
+Local variable daddr created at:
+ llc_conn_handler+0x53/0x1360 net/llc/llc_conn.c:783
+ llc_rcv+0xfbb/0x14a0 net/llc/llc_input.c:206
+
+CPU: 1 PID: 5004 Comm: syz-executor994 Not tainted 6.6.0-syzkaller-14500-g1c41041124bd #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023
+
+Fixes: 211ed865108e ("net: delete all instances of special processing for token ring")
+Reported-by: syzbot+b5ad66046b913bc04c6f@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=b5ad66046b913bc04c6f
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20240119015515.61898-1-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/llc_pdu.h | 6 ++----
+ net/llc/llc_core.c    | 7 -------
+ 2 files changed, 2 insertions(+), 11 deletions(-)
+
+diff --git a/include/net/llc_pdu.h b/include/net/llc_pdu.h
+index 7e73f8e5e497..1d55ba7c45be 100644
+--- a/include/net/llc_pdu.h
++++ b/include/net/llc_pdu.h
+@@ -262,8 +262,7 @@ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type,
+  */
+ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa)
+ {
+-      if (skb->protocol == htons(ETH_P_802_2))
+-              memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN);
++      memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN);
+ }
+ /**
+@@ -275,8 +274,7 @@ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa)
+  */
+ static inline void llc_pdu_decode_da(struct sk_buff *skb, u8 *da)
+ {
+-      if (skb->protocol == htons(ETH_P_802_2))
+-              memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN);
++      memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN);
+ }
+ /**
+diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
+index 6e387aadffce..4f16d9c88350 100644
+--- a/net/llc/llc_core.c
++++ b/net/llc/llc_core.c
+@@ -135,22 +135,15 @@ static struct packet_type llc_packet_type __read_mostly = {
+       .func = llc_rcv,
+ };
+-static struct packet_type llc_tr_packet_type __read_mostly = {
+-      .type = cpu_to_be16(ETH_P_TR_802_2),
+-      .func = llc_rcv,
+-};
+-
+ static int __init llc_init(void)
+ {
+       dev_add_pack(&llc_packet_type);
+-      dev_add_pack(&llc_tr_packet_type);
+       return 0;
+ }
+ static void __exit llc_exit(void)
+ {
+       dev_remove_pack(&llc_packet_type);
+-      dev_remove_pack(&llc_tr_packet_type);
+ }
+ module_init(llc_init);
+-- 
+2.43.0
+
diff --git a/queue-6.6/llc-make-llc_ui_sendmsg-more-robust-against-bonding-.patch b/queue-6.6/llc-make-llc_ui_sendmsg-more-robust-against-bonding-.patch
new file mode 100644 (file)
index 0000000..7510b54
--- /dev/null
@@ -0,0 +1,154 @@
+From 944594f7e0e1ca09dc19ebf731dd411ee07a1690 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 18:36:25 +0000
+Subject: llc: make llc_ui_sendmsg() more robust against bonding changes
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit dad555c816a50c6a6a8a86be1f9177673918c647 ]
+
+syzbot was able to trick llc_ui_sendmsg(), allocating an skb with no
+headroom, but subsequently trying to push 14 bytes of Ethernet header [1]
+
+Like some others, llc_ui_sendmsg() releases the socket lock before
+calling sock_alloc_send_skb().
+Then it acquires it again, but does not redo all the sanity checks
+that were performed.
+
+This fix:
+
+- Uses LL_RESERVED_SPACE() to reserve space.
+- Check all conditions again after socket lock is held again.
+- Do not account Ethernet header for mtu limitation.
+
+[1]
+
+skbuff: skb_under_panic: text:ffff800088baa334 len:1514 put:14 head:ffff0000c9c37000 data:ffff0000c9c36ff2 tail:0x5dc end:0x6c0 dev:bond0
+
+ kernel BUG at net/core/skbuff.c:193 !
+Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP
+Modules linked in:
+CPU: 0 PID: 6875 Comm: syz-executor.0 Not tainted 6.7.0-rc8-syzkaller-00101-g0802e17d9aca-dirty #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023
+pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
+ pc : skb_panic net/core/skbuff.c:189 [inline]
+ pc : skb_under_panic+0x13c/0x140 net/core/skbuff.c:203
+ lr : skb_panic net/core/skbuff.c:189 [inline]
+ lr : skb_under_panic+0x13c/0x140 net/core/skbuff.c:203
+sp : ffff800096f97000
+x29: ffff800096f97010 x28: ffff80008cc8d668 x27: dfff800000000000
+x26: ffff0000cb970c90 x25: 00000000000005dc x24: ffff0000c9c36ff2
+x23: ffff0000c9c37000 x22: 00000000000005ea x21: 00000000000006c0
+x20: 000000000000000e x19: ffff800088baa334 x18: 1fffe000368261ce
+x17: ffff80008e4ed000 x16: ffff80008a8310f8 x15: 0000000000000001
+x14: 1ffff00012df2d58 x13: 0000000000000000 x12: 0000000000000000
+x11: 0000000000000001 x10: 0000000000ff0100 x9 : e28a51f1087e8400
+x8 : e28a51f1087e8400 x7 : ffff80008028f8d0 x6 : 0000000000000000
+x5 : 0000000000000001 x4 : 0000000000000001 x3 : ffff800082b78714
+x2 : 0000000000000001 x1 : 0000000100000000 x0 : 0000000000000089
+Call trace:
+  skb_panic net/core/skbuff.c:189 [inline]
+  skb_under_panic+0x13c/0x140 net/core/skbuff.c:203
+  skb_push+0xf0/0x108 net/core/skbuff.c:2451
+  eth_header+0x44/0x1f8 net/ethernet/eth.c:83
+  dev_hard_header include/linux/netdevice.h:3188 [inline]
+  llc_mac_hdr_init+0x110/0x17c net/llc/llc_output.c:33
+  llc_sap_action_send_xid_c+0x170/0x344 net/llc/llc_s_ac.c:85
+  llc_exec_sap_trans_actions net/llc/llc_sap.c:153 [inline]
+  llc_sap_next_state net/llc/llc_sap.c:182 [inline]
+  llc_sap_state_process+0x1ec/0x774 net/llc/llc_sap.c:209
+  llc_build_and_send_xid_pkt+0x12c/0x1c0 net/llc/llc_sap.c:270
+  llc_ui_sendmsg+0x7bc/0xb1c net/llc/af_llc.c:997
+  sock_sendmsg_nosec net/socket.c:730 [inline]
+  __sock_sendmsg net/socket.c:745 [inline]
+  sock_sendmsg+0x194/0x274 net/socket.c:767
+  splice_to_socket+0x7cc/0xd58 fs/splice.c:881
+  do_splice_from fs/splice.c:933 [inline]
+  direct_splice_actor+0xe4/0x1c0 fs/splice.c:1142
+  splice_direct_to_actor+0x2a0/0x7e4 fs/splice.c:1088
+  do_splice_direct+0x20c/0x348 fs/splice.c:1194
+  do_sendfile+0x4bc/0xc70 fs/read_write.c:1254
+  __do_sys_sendfile64 fs/read_write.c:1322 [inline]
+  __se_sys_sendfile64 fs/read_write.c:1308 [inline]
+  __arm64_sys_sendfile64+0x160/0x3b4 fs/read_write.c:1308
+  __invoke_syscall arch/arm64/kernel/syscall.c:37 [inline]
+  invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:51
+  el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:136
+  do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:155
+  el0_svc+0x54/0x158 arch/arm64/kernel/entry-common.c:678
+  el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:696
+  el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:595
+Code: aa1803e6 aa1903e7 a90023f5 94792f6a (d4210000)
+
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Reported-and-tested-by: syzbot+2a7024e9502df538e8ef@syzkaller.appspotmail.com
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240118183625.4007013-1-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/llc/af_llc.c | 24 ++++++++++++++++--------
+ 1 file changed, 16 insertions(+), 8 deletions(-)
+
+diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
+index 9b06c380866b..20551cfb7da6 100644
+--- a/net/llc/af_llc.c
++++ b/net/llc/af_llc.c
+@@ -928,14 +928,15 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+  */
+ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+ {
++      DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name);
+       struct sock *sk = sock->sk;
+       struct llc_sock *llc = llc_sk(sk);
+-      DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name);
+       int flags = msg->msg_flags;
+       int noblock = flags & MSG_DONTWAIT;
++      int rc = -EINVAL, copied = 0, hdrlen, hh_len;
+       struct sk_buff *skb = NULL;
++      struct net_device *dev;
+       size_t size = 0;
+-      int rc = -EINVAL, copied = 0, hdrlen;
+       dprintk("%s: sending from %02X to %02X\n", __func__,
+               llc->laddr.lsap, llc->daddr.lsap);
+@@ -955,22 +956,29 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+               if (rc)
+                       goto out;
+       }
+-      hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr);
++      dev = llc->dev;
++      hh_len = LL_RESERVED_SPACE(dev);
++      hdrlen = llc_ui_header_len(sk, addr);
+       size = hdrlen + len;
+-      if (size > llc->dev->mtu)
+-              size = llc->dev->mtu;
++      size = min_t(size_t, size, READ_ONCE(dev->mtu));
+       copied = size - hdrlen;
+       rc = -EINVAL;
+       if (copied < 0)
+               goto out;
+       release_sock(sk);
+-      skb = sock_alloc_send_skb(sk, size, noblock, &rc);
++      skb = sock_alloc_send_skb(sk, hh_len + size, noblock, &rc);
+       lock_sock(sk);
+       if (!skb)
+               goto out;
+-      skb->dev      = llc->dev;
++      if (sock_flag(sk, SOCK_ZAPPED) ||
++          llc->dev != dev ||
++          hdrlen != llc_ui_header_len(sk, addr) ||
++          hh_len != LL_RESERVED_SPACE(dev) ||
++          size > READ_ONCE(dev->mtu))
++              goto out;
++      skb->dev      = dev;
+       skb->protocol = llc_proto_type(addr->sllc_arphrd);
+-      skb_reserve(skb, hdrlen);
++      skb_reserve(skb, hh_len + hdrlen);
+       rc = memcpy_from_msg(skb_put(skb, copied), msg, copied);
+       if (rc)
+               goto out;
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-fec-fix-the-unhandled-context-fault-from-smmu.patch b/queue-6.6/net-fec-fix-the-unhandled-context-fault-from-smmu.patch
new file mode 100644 (file)
index 0000000..ff8706c
--- /dev/null
@@ -0,0 +1,58 @@
+From 1c045bea0adc7b49aa3ff4b77a482810aca37ccb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 10:51:41 -0600
+Subject: net: fec: fix the unhandled context fault from smmu
+
+From: Shenwei Wang <shenwei.wang@nxp.com>
+
+[ Upstream commit 5e344807735023cd3a67c37a1852b849caa42620 ]
+
+When repeatedly changing the interface link speed using the command below:
+
+ethtool -s eth0 speed 100 duplex full
+ethtool -s eth0 speed 1000 duplex full
+
+The following errors may sometimes be reported by the ARM SMMU driver:
+
+[ 5395.035364] fec 5b040000.ethernet eth0: Link is Down
+[ 5395.039255] arm-smmu 51400000.iommu: Unhandled context fault:
+fsr=0x402, iova=0x00000000, fsynr=0x100001, cbfrsynra=0x852, cb=2
+[ 5398.108460] fec 5b040000.ethernet eth0: Link is Up - 100Mbps/Full -
+flow control off
+
+It is identified that the FEC driver does not properly stop the TX queue
+during the link speed transitions, and this results in the invalid virtual
+I/O address translations from the SMMU and causes the context faults.
+
+Fixes: dbc64a8ea231 ("net: fec: move calls to quiesce/resume packet processing out of fec_restart()")
+Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com>
+Link: https://lore.kernel.org/r/20240123165141.2008104-1-shenwei.wang@nxp.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/freescale/fec_main.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
+index 35c95f07fd6d..54da59286df4 100644
+--- a/drivers/net/ethernet/freescale/fec_main.c
++++ b/drivers/net/ethernet/freescale/fec_main.c
+@@ -2011,6 +2011,7 @@ static void fec_enet_adjust_link(struct net_device *ndev)
+               /* if any of the above changed restart the FEC */
+               if (status_change) {
++                      netif_stop_queue(ndev);
+                       napi_disable(&fep->napi);
+                       netif_tx_lock_bh(ndev);
+                       fec_restart(ndev);
+@@ -2020,6 +2021,7 @@ static void fec_enet_adjust_link(struct net_device *ndev)
+               }
+       } else {
+               if (fep->link) {
++                      netif_stop_queue(ndev);
+                       napi_disable(&fep->napi);
+                       netif_tx_lock_bh(ndev);
+                       fec_stop(ndev);
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-fix-removing-a-namespace-with-conflicting-altnam.patch b/queue-6.6/net-fix-removing-a-namespace-with-conflicting-altnam.patch
new file mode 100644 (file)
index 0000000..2fbce0f
--- /dev/null
@@ -0,0 +1,81 @@
+From 43571c969a161b4d3602a3f69d022be34180c33f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 16:58:59 -0800
+Subject: net: fix removing a namespace with conflicting altnames
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit d09486a04f5da0a812c26217213b89a3b1acf836 ]
+
+Mark reports a BUG() when a net namespace is removed.
+
+    kernel BUG at net/core/dev.c:11520!
+
+Physical interfaces moved outside of init_net get "refunded"
+to init_net when that namespace disappears. The main interface
+name may get overwritten in the process if it would have
+conflicted. We need to also discard all conflicting altnames.
+Recent fixes addressed ensuring that altnames get moved
+with the main interface, which surfaced this problem.
+
+Reported-by: Марк Коренберг <socketpair@gmail.com>
+Link: https://lore.kernel.org/all/CAEmTpZFZ4Sv3KwqFOY2WKDHeZYdi0O7N5H1nTvcGp=SAEavtDg@mail.gmail.com/
+Fixes: 7663d522099e ("net: check for altname conflicts when changing netdev's netns")
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Reviewed-by: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/core/dev.c | 9 +++++++++
+ net/core/dev.h | 3 +++
+ 2 files changed, 12 insertions(+)
+
+diff --git a/net/core/dev.c b/net/core/dev.c
+index e480afb50d4c..d72a4ff689ca 100644
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -11491,6 +11491,7 @@ static struct pernet_operations __net_initdata netdev_net_ops = {
+ static void __net_exit default_device_exit_net(struct net *net)
+ {
++      struct netdev_name_node *name_node, *tmp;
+       struct net_device *dev, *aux;
+       /*
+        * Push all migratable network devices back to the
+@@ -11513,6 +11514,14 @@ static void __net_exit default_device_exit_net(struct net *net)
+               snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
+               if (netdev_name_in_use(&init_net, fb_name))
+                       snprintf(fb_name, IFNAMSIZ, "dev%%d");
++
++              netdev_for_each_altname_safe(dev, name_node, tmp)
++                      if (netdev_name_in_use(&init_net, name_node->name)) {
++                              netdev_name_node_del(name_node);
++                              synchronize_rcu();
++                              __netdev_name_node_alt_destroy(name_node);
++                      }
++
+               err = dev_change_net_namespace(dev, &init_net, fb_name);
+               if (err) {
+                       pr_emerg("%s: failed to move %s to init_net: %d\n",
+diff --git a/net/core/dev.h b/net/core/dev.h
+index fa2e9c5c4122..f2037d402144 100644
+--- a/net/core/dev.h
++++ b/net/core/dev.h
+@@ -64,6 +64,9 @@ int dev_change_name(struct net_device *dev, const char *newname);
+ #define netdev_for_each_altname(dev, namenode)                                \
+       list_for_each_entry((namenode), &(dev)->name_node->list, list)
++#define netdev_for_each_altname_safe(dev, namenode, next)             \
++      list_for_each_entry_safe((namenode), (next), &(dev)->name_node->list, \
++                               list)
+ int netdev_name_node_alt_create(struct net_device *dev, const char *name);
+ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name);
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-micrel-fix-ptp-frame-parsing-for-lan8814.patch b/queue-6.6/net-micrel-fix-ptp-frame-parsing-for-lan8814.patch
new file mode 100644 (file)
index 0000000..f6c06fd
--- /dev/null
@@ -0,0 +1,61 @@
+From fa78e95cef05ac17e903022c324f296ede7b9f45 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 11:47:50 +0100
+Subject: net: micrel: Fix PTP frame parsing for lan8814
+
+From: Horatiu Vultur <horatiu.vultur@microchip.com>
+
+[ Upstream commit aaf632f7ab6dec57bc9329a438f94504fe8034b9 ]
+
+The HW has the capability to check each frame if it is a PTP frame,
+which domain it is, which ptp frame type it is, different ip address in
+the frame. And if one of these checks fail then the frame is not
+timestamp. Most of these checks were disabled except checking the field
+minorVersionPTP inside the PTP header. Meaning that once a partner sends
+a frame compliant to 8021AS which has minorVersionPTP set to 1, then the
+frame was not timestamp because the HW expected by default a value of 0
+in minorVersionPTP. This is exactly the same issue as on lan8841.
+Fix this issue by removing this check so the userspace can decide on this.
+
+Fixes: ece19502834d ("net: phy: micrel: 1588 support for LAN8814 phy")
+Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
+Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
+Reviewed-by: Divya Koppera <divya.koppera@microchip.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/phy/micrel.c | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
+index dfd5f8e78e29..27ca25bbd141 100644
+--- a/drivers/net/phy/micrel.c
++++ b/drivers/net/phy/micrel.c
+@@ -120,6 +120,11 @@
+  */
+ #define LAN8814_1PPM_FORMAT                   17179
++#define PTP_RX_VERSION                                0x0248
++#define PTP_TX_VERSION                                0x0288
++#define PTP_MAX_VERSION(x)                    (((x) & GENMASK(7, 0)) << 8)
++#define PTP_MIN_VERSION(x)                    ((x) & GENMASK(7, 0))
++
+ #define PTP_RX_MOD                            0x024F
+ #define PTP_RX_MOD_BAD_UDPV4_CHKSUM_FORCE_FCS_DIS_ BIT(3)
+ #define PTP_RX_TIMESTAMP_EN                   0x024D
+@@ -3125,6 +3130,12 @@ static void lan8814_ptp_init(struct phy_device *phydev)
+       lanphy_write_page_reg(phydev, 5, PTP_TX_PARSE_IP_ADDR_EN, 0);
+       lanphy_write_page_reg(phydev, 5, PTP_RX_PARSE_IP_ADDR_EN, 0);
++      /* Disable checking for minorVersionPTP field */
++      lanphy_write_page_reg(phydev, 5, PTP_RX_VERSION,
++                            PTP_MAX_VERSION(0xff) | PTP_MIN_VERSION(0x0));
++      lanphy_write_page_reg(phydev, 5, PTP_TX_VERSION,
++                            PTP_MAX_VERSION(0xff) | PTP_MIN_VERSION(0x0));
++
+       skb_queue_head_init(&ptp_priv->tx_queue);
+       skb_queue_head_init(&ptp_priv->rx_queue);
+       INIT_LIST_HEAD(&ptp_priv->rx_ts_list);
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-mlx5-bridge-enable-mcast-in-smfs-steering-mode.patch b/queue-6.6/net-mlx5-bridge-enable-mcast-in-smfs-steering-mode.patch
new file mode 100644 (file)
index 0000000..fc3a774
--- /dev/null
@@ -0,0 +1,77 @@
+From 25ad4a55df0ff2b5837783caa8ab9befc0635bd1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 28 Aug 2023 14:20:00 +0300
+Subject: net/mlx5: Bridge, Enable mcast in smfs steering mode
+
+From: Erez Shitrit <erezsh@nvidia.com>
+
+[ Upstream commit 653b7eb9d74426397c95061fd57da3063625af65 ]
+
+In order to have mcast offloads the driver needs the following:
+It should know if that mcast comes from wire port, in addition the flow
+should not be marked as any specific source, that way it will give the
+flexibility for the driver not to be depended on the way iterator
+implemented in the FW.
+
+Signed-off-by: Erez Shitrit <erezsh@nvidia.com>
+Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
+Reviewed-by: Vlad Buslov <vladbu@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Stable-dep-of: ec7cc38ef9f8 ("net/mlx5: Bridge, fix multicast packets sent to uplink")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../ethernet/mellanox/mlx5/core/esw/bridge_mcast.c    | 11 ++---------
+ include/linux/mlx5/fs.h                               |  1 +
+ 2 files changed, 3 insertions(+), 9 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
+index 7a01714b3780..a7ed87e9d842 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
+@@ -78,6 +78,8 @@ mlx5_esw_bridge_mdb_flow_create(u16 esw_owner_vhca_id, struct mlx5_esw_bridge_md
+       xa_for_each(&entry->ports, idx, port) {
+               dests[i].type = MLX5_FLOW_DESTINATION_TYPE_FLOW_TABLE;
+               dests[i].ft = port->mcast.ft;
++              if (port->vport_num == MLX5_VPORT_UPLINK)
++                      dests[i].ft->flags |= MLX5_FLOW_TABLE_UPLINK_VPORT;
+               i++;
+       }
+@@ -585,10 +587,6 @@ mlx5_esw_bridge_mcast_vlan_flow_create(u16 vlan_proto, struct mlx5_esw_bridge_po
+       if (!rule_spec)
+               return ERR_PTR(-ENOMEM);
+-      if (MLX5_CAP_ESW_FLOWTABLE(bridge->br_offloads->esw->dev, flow_source) &&
+-          port->vport_num == MLX5_VPORT_UPLINK)
+-              rule_spec->flow_context.flow_source =
+-                      MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT;
+       rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+       flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
+@@ -660,11 +658,6 @@ mlx5_esw_bridge_mcast_fwd_flow_create(struct mlx5_esw_bridge_port *port)
+       if (!rule_spec)
+               return ERR_PTR(-ENOMEM);
+-      if (MLX5_CAP_ESW_FLOWTABLE(bridge->br_offloads->esw->dev, flow_source) &&
+-          port->vport_num == MLX5_VPORT_UPLINK)
+-              rule_spec->flow_context.flow_source =
+-                      MLX5_FLOW_CONTEXT_FLOW_SOURCE_LOCAL_VPORT;
+-
+       if (MLX5_CAP_ESW(bridge->br_offloads->esw->dev, merged_eswitch)) {
+               dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID;
+               dest.vport.vhca_id = port->esw_owner_vhca_id;
+diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
+index 1e00c2436377..6f7725238abc 100644
+--- a/include/linux/mlx5/fs.h
++++ b/include/linux/mlx5/fs.h
+@@ -67,6 +67,7 @@ enum {
+       MLX5_FLOW_TABLE_TERMINATION = BIT(2),
+       MLX5_FLOW_TABLE_UNMANAGED = BIT(3),
+       MLX5_FLOW_TABLE_OTHER_VPORT = BIT(4),
++      MLX5_FLOW_TABLE_UPLINK_VPORT = BIT(5),
+ };
+ #define LEFTOVERS_RULE_NUM     2
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-mlx5-bridge-fix-multicast-packets-sent-to-uplink.patch b/queue-6.6/net-mlx5-bridge-fix-multicast-packets-sent-to-uplink.patch
new file mode 100644 (file)
index 0000000..e773122
--- /dev/null
@@ -0,0 +1,94 @@
+From 3e9a05b0b360985ccfc992da823351a870910b36 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 30 Dec 2023 22:40:37 +0200
+Subject: net/mlx5: Bridge, fix multicast packets sent to uplink
+
+From: Moshe Shemesh <moshe@nvidia.com>
+
+[ Upstream commit ec7cc38ef9f83553102e84c82536971a81630739 ]
+
+To enable multicast packets which are offloaded in bridge multicast
+offload mode to be sent also to uplink, FTE bit uplink_hairpin_en should
+be set. Add this bit to FTE for the bridge multicast offload rules.
+
+Fixes: 18c2916cee12 ("net/mlx5: Bridge, snoop igmp/mld packets")
+Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
+Reviewed-by: Gal Pressman <gal@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c | 3 +++
+ drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c           | 2 ++
+ include/linux/mlx5/fs.h                                    | 1 +
+ include/linux/mlx5/mlx5_ifc.h                              | 2 +-
+ 4 files changed, 7 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
+index a7ed87e9d842..22dd30cf8033 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
+@@ -83,6 +83,7 @@ mlx5_esw_bridge_mdb_flow_create(u16 esw_owner_vhca_id, struct mlx5_esw_bridge_md
+               i++;
+       }
++      rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN;
+       rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+       dmac_v = MLX5_ADDR_OF(fte_match_param, rule_spec->match_value, outer_headers.dmac_47_16);
+       ether_addr_copy(dmac_v, entry->key.addr);
+@@ -587,6 +588,7 @@ mlx5_esw_bridge_mcast_vlan_flow_create(u16 vlan_proto, struct mlx5_esw_bridge_po
+       if (!rule_spec)
+               return ERR_PTR(-ENOMEM);
++      rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN;
+       rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+       flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
+@@ -662,6 +664,7 @@ mlx5_esw_bridge_mcast_fwd_flow_create(struct mlx5_esw_bridge_port *port)
+               dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID;
+               dest.vport.vhca_id = port->esw_owner_vhca_id;
+       }
++      rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN;
+       handle = mlx5_add_flow_rules(port->mcast.ft, rule_spec, &flow_act, &dest, 1);
+       kvfree(rule_spec);
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+index a4b925331661..b29299c49ab3 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+@@ -566,6 +566,8 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
+                fte->flow_context.flow_tag);
+       MLX5_SET(flow_context, in_flow_context, flow_source,
+                fte->flow_context.flow_source);
++      MLX5_SET(flow_context, in_flow_context, uplink_hairpin_en,
++               !!(fte->flow_context.flags & FLOW_CONTEXT_UPLINK_HAIRPIN_EN));
+       MLX5_SET(flow_context, in_flow_context, extended_destination,
+                extended_dest);
+diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
+index 6f7725238abc..3fb428ce7d1c 100644
+--- a/include/linux/mlx5/fs.h
++++ b/include/linux/mlx5/fs.h
+@@ -132,6 +132,7 @@ struct mlx5_flow_handle;
+ enum {
+       FLOW_CONTEXT_HAS_TAG = BIT(0),
++      FLOW_CONTEXT_UPLINK_HAIRPIN_EN = BIT(1),
+ };
+ struct mlx5_flow_context {
+diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
+index 8ac6ae79e083..51eb83f77938 100644
+--- a/include/linux/mlx5/mlx5_ifc.h
++++ b/include/linux/mlx5/mlx5_ifc.h
+@@ -3536,7 +3536,7 @@ struct mlx5_ifc_flow_context_bits {
+       u8         action[0x10];
+       u8         extended_destination[0x1];
+-      u8         reserved_at_81[0x1];
++      u8         uplink_hairpin_en[0x1];
+       u8         flow_source[0x2];
+       u8         encrypt_decrypt_type[0x4];
+       u8         destination_list_size[0x18];
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-mlx5-dr-can-t-go-to-uplink-vport-on-rx-rule.patch b/queue-6.6/net-mlx5-dr-can-t-go-to-uplink-vport-on-rx-rule.patch
new file mode 100644 (file)
index 0000000..db273b8
--- /dev/null
@@ -0,0 +1,51 @@
+From 3f533d9b6010bac88a223b8ebc3d9db43dec4cf5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 17 Dec 2023 13:20:36 +0200
+Subject: net/mlx5: DR, Can't go to uplink vport on RX rule
+
+From: Yevgeny Kliteynik <kliteyn@nvidia.com>
+
+[ Upstream commit 5b2a2523eeea5f03d39a9d1ff1bad2e9f8eb98d2 ]
+
+Go-To-Vport action on RX is not allowed when the vport is uplink.
+In such case, the packet should be dropped.
+
+Fixes: 9db810ed2d37 ("net/mlx5: DR, Expose steering action functionality")
+Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
+Reviewed-by: Erez Shitrit <erezsh@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../mellanox/mlx5/core/steering/dr_action.c      | 16 +++++++++++-----
+ 1 file changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+index 1a5aee8a7f13..90c38cbbde18 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+@@ -867,11 +867,17 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher,
+                                                       action->sampler->tx_icm_addr;
+                       break;
+               case DR_ACTION_TYP_VPORT:
+-                      attr.hit_gvmi = action->vport->caps->vhca_gvmi;
+-                      dest_action = action;
+-                      attr.final_icm_addr = rx_rule ?
+-                              action->vport->caps->icm_address_rx :
+-                              action->vport->caps->icm_address_tx;
++                      if (unlikely(rx_rule && action->vport->caps->num == MLX5_VPORT_UPLINK)) {
++                              /* can't go to uplink on RX rule - dropping instead */
++                              attr.final_icm_addr = nic_dmn->drop_icm_addr;
++                              attr.hit_gvmi = nic_dmn->drop_icm_addr >> 48;
++                      } else {
++                              attr.hit_gvmi = action->vport->caps->vhca_gvmi;
++                              dest_action = action;
++                              attr.final_icm_addr = rx_rule ?
++                                                    action->vport->caps->icm_address_rx :
++                                                    action->vport->caps->icm_address_tx;
++                      }
+                       break;
+               case DR_ACTION_TYP_POP_VLAN:
+                       if (!rx_rule && !(dmn->ste_ctx->actions_caps &
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-mlx5-dr-use-the-right-gvmi-number-for-drop-actio.patch b/queue-6.6/net-mlx5-dr-use-the-right-gvmi-number-for-drop-actio.patch
new file mode 100644 (file)
index 0000000..dab133f
--- /dev/null
@@ -0,0 +1,39 @@
+From 7970f7db5b50822e26e6e1c43136dace5779369f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 17 Dec 2023 11:24:08 +0200
+Subject: net/mlx5: DR, Use the right GVMI number for drop action
+
+From: Yevgeny Kliteynik <kliteyn@nvidia.com>
+
+[ Upstream commit 5665954293f13642f9c052ead83c1e9d8cff186f ]
+
+When FW provides ICM addresses for drop RX/TX, the provided capability
+is 64 bits that contain its GVMI as well as the ICM address itself.
+In case of TX DROP this GVMI is different from the GVMI that the
+domain is operating on.
+
+This patch fixes the action to use these GVMI IDs, as provided by FW.
+
+Fixes: 9db810ed2d37 ("net/mlx5: DR, Expose steering action functionality")
+Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+index 5b83da08692d..1a5aee8a7f13 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+@@ -781,6 +781,7 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher,
+               switch (action_type) {
+               case DR_ACTION_TYP_DROP:
+                       attr.final_icm_addr = nic_dmn->drop_icm_addr;
++                      attr.hit_gvmi = nic_dmn->drop_icm_addr >> 48;
+                       break;
+               case DR_ACTION_TYP_FT:
+                       dest_action = action;
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-mlx5-fix-a-warn-upon-a-callback-command-failure.patch b/queue-6.6/net-mlx5-fix-a-warn-upon-a-callback-command-failure.patch
new file mode 100644 (file)
index 0000000..e346ee3
--- /dev/null
@@ -0,0 +1,149 @@
+From 578d3902c8cede082b4623c2bb915387b622b49d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 31 Dec 2023 15:19:50 +0200
+Subject: net/mlx5: Fix a WARN upon a callback command failure
+
+From: Yishai Hadas <yishaih@nvidia.com>
+
+[ Upstream commit cc8091587779cfaddb6b29c9e9edb9079a282cad ]
+
+The below WARN [1] is reported once a callback command failed.
+
+As a callback runs under an interrupt context, needs to use the IRQ
+save/restore variant.
+
+[1]
+DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context())
+WARNING: CPU: 15 PID: 0 at kernel/locking/lockdep.c:4353
+              lockdep_hardirqs_on_prepare+0x11b/0x180
+Modules linked in: vhost_net vhost tap mlx5_vfio_pci
+vfio_pci vfio_pci_core vfio_iommu_type1 vfio mlx5_vdpa vringh
+vhost_iotlb vdpa nfnetlink_cttimeout openvswitch nsh ip6table_mangle
+ip6table_nat ip6table_filter ip6_tables iptable_mangle
+xt_conntrackxt_MASQUERADE nf_conntrack_netlink nfnetlink
+xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5
+auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi
+scsi_transport_iscsi rdma_cm iw_cm ib_umad ib_ipoib ib_cm
+mlx5_ib ib_uverbs ib_core fuse mlx5_core
+CPU: 15 PID: 0 Comm: swapper/15 Tainted: G        W 6.7.0-rc4+ #1587
+Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
+rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
+RIP: 0010:lockdep_hardirqs_on_prepare+0x11b/0x180
+Code: 00 5b c3 c3 e8 e6 0d 58 00 85 c0 74 d6 8b 15 f0 c3
+      76 01 85 d2 75 cc 48 c7 c6 04 a5 3b 82 48 c7 c7 f1
+      e9 39 82 e8 95 12 f9 ff <0f> 0b 5b c3 e8 bc 0d 58 00
+      85 c0 74 ac 8b 3d c6 c3 76 01 85 ff 75
+RSP: 0018:ffffc900003ecd18 EFLAGS: 00010086
+RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000027
+RDX: 0000000000000000 RSI: ffff88885fbdb880 RDI: ffff88885fbdb888
+RBP: 00000000ffffff87 R08: 0000000000000000 R09: 0000000000000001
+R10: 0000000000000000 R11: 284e4f5f4e524157 R12: 00000000002c9aa1
+R13: ffff88810aace980 R14: ffff88810aace9b8 R15: 0000000000000003
+FS:  0000000000000000(0000) GS:ffff88885fbc0000(0000)
+knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007f731436f4c8 CR3: 000000010aae6001 CR4: 0000000000372eb0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+Call Trace:
+ <IRQ>
+? __warn+0x81/0x170
+? lockdep_hardirqs_on_prepare+0x11b/0x180
+? report_bug+0xf8/0x1c0
+? handle_bug+0x3f/0x70
+? exc_invalid_op+0x13/0x60
+? asm_exc_invalid_op+0x16/0x20
+? lockdep_hardirqs_on_prepare+0x11b/0x180
+? lockdep_hardirqs_on_prepare+0x11b/0x180
+trace_hardirqs_on+0x4a/0xa0
+raw_spin_unlock_irq+0x24/0x30
+cmd_status_err+0xc0/0x1a0 [mlx5_core]
+cmd_status_err+0x1a0/0x1a0 [mlx5_core]
+mlx5_cmd_exec_cb_handler+0x24/0x40 [mlx5_core]
+mlx5_cmd_comp_handler+0x129/0x4b0 [mlx5_core]
+cmd_comp_notifier+0x1a/0x20 [mlx5_core]
+notifier_call_chain+0x3e/0xe0
+atomic_notifier_call_chain+0x5f/0x130
+mlx5_eq_async_int+0xe7/0x200 [mlx5_core]
+notifier_call_chain+0x3e/0xe0
+atomic_notifier_call_chain+0x5f/0x130
+irq_int_handler+0x11/0x20 [mlx5_core]
+__handle_irq_event_percpu+0x99/0x220
+? tick_irq_enter+0x5d/0x80
+handle_irq_event_percpu+0xf/0x40
+handle_irq_event+0x3a/0x60
+handle_edge_irq+0xa2/0x1c0
+__common_interrupt+0x55/0x140
+common_interrupt+0x7d/0xa0
+</IRQ>
+<TASK>
+asm_common_interrupt+0x22/0x40
+RIP: 0010:default_idle+0x13/0x20
+Code: c0 08 00 00 00 4d 29 c8 4c 01 c7 4c 29 c2 e9 72 ff
+ff ff cc cc cc cc 8b 05 ea 08 25 01 85 c0 7e 07 0f 00 2d 7f b0 26 00 fb
+f4 <fa> c3 90 66 2e 0f 1f 84 00 00 00 00 00 65 48 8b 04 25 80 d0 02 00
+RSP: 0018:ffffc9000010fec8 EFLAGS: 00000242
+RAX: 0000000000000001 RBX: 000000000000000f RCX: 4000000000000000
+RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff811c410c
+RBP: ffffffff829478c0 R08: 0000000000000001 R09: 0000000000000001
+R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
+R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
+? do_idle+0x1ec/0x210
+default_idle_call+0x6c/0x90
+do_idle+0x1ec/0x210
+cpu_startup_entry+0x26/0x30
+start_secondary+0x11b/0x150
+secondary_startup_64_no_verify+0x165/0x16b
+</TASK>
+irq event stamp: 833284
+hardirqs last  enabled at (833283): [<ffffffff811c410c>]
+do_idle+0x1ec/0x210
+hardirqs last disabled at (833284): [<ffffffff81daf9ef>]
+common_interrupt+0xf/0xa0
+softirqs last  enabled at (833224): [<ffffffff81dc199f>]
+__do_softirq+0x2bf/0x40e
+softirqs last disabled at (833177): [<ffffffff81178ddf>]
+irq_exit_rcu+0x7f/0xa0
+
+Fixes: 34f46ae0d4b3 ("net/mlx5: Add command failures data to debugfs")
+Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
+Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+index 7013e1c8741a..55efb932ab2c 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+@@ -1921,6 +1921,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status,
+ {
+       const char *namep = mlx5_command_str(opcode);
+       struct mlx5_cmd_stats *stats;
++      unsigned long flags;
+       if (!err || !(strcmp(namep, "unknown command opcode")))
+               return;
+@@ -1928,7 +1929,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status,
+       stats = xa_load(&dev->cmd.stats, opcode);
+       if (!stats)
+               return;
+-      spin_lock_irq(&stats->lock);
++      spin_lock_irqsave(&stats->lock, flags);
+       stats->failed++;
+       if (err < 0)
+               stats->last_failed_errno = -err;
+@@ -1937,7 +1938,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status,
+               stats->last_failed_mbox_status = status;
+               stats->last_failed_syndrome = syndrome;
+       }
+-      spin_unlock_irq(&stats->lock);
++      spin_unlock_irqrestore(&stats->lock, flags);
+ }
+ /* preserve -EREMOTEIO for outbox.status != OK, otherwise return err as is */
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-mlx5-use-mlx5-device-constant-for-selecting-cq-p.patch b/queue-6.6/net-mlx5-use-mlx5-device-constant-for-selecting-cq-p.patch
new file mode 100644 (file)
index 0000000..8df4369
--- /dev/null
@@ -0,0 +1,39 @@
+From 5a6c2772e477fb69792c2f463d174d6e3960cb54 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 28 Nov 2023 14:01:54 -0800
+Subject: net/mlx5: Use mlx5 device constant for selecting CQ period mode for
+ ASO
+
+From: Rahul Rameshbabu <rrameshbabu@nvidia.com>
+
+[ Upstream commit 20cbf8cbb827094197f3b17db60d71449415db1e ]
+
+mlx5 devices have specific constants for choosing the CQ period mode. These
+constants do not have to match the constants used by the kernel software
+API for DIM period mode selection.
+
+Fixes: cdd04f4d4d71 ("net/mlx5: Add support to create SQ and CQ for ASO")
+Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
+Reviewed-by: Jianbo Liu <jianbol@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c
+index 40c7be124041..58bd749b5e4d 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c
+@@ -98,7 +98,7 @@ static int create_aso_cq(struct mlx5_aso_cq *cq, void *cqc_data)
+       mlx5_fill_page_frag_array(&cq->wq_ctrl.buf,
+                                 (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas));
+-      MLX5_SET(cqc,   cqc, cq_period_mode, DIM_CQ_PERIOD_MODE_START_FROM_EQE);
++      MLX5_SET(cqc,   cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
+       MLX5_SET(cqc,   cqc, c_eqn_or_apu_element, eqn);
+       MLX5_SET(cqc,   cqc, uar_page,      mdev->priv.uar->index);
+       MLX5_SET(cqc,   cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-mlx5e-allow-software-parsing-when-ipsec-crypto-i.patch b/queue-6.6/net-mlx5e-allow-software-parsing-when-ipsec-crypto-i.patch
new file mode 100644 (file)
index 0000000..86dbe34
--- /dev/null
@@ -0,0 +1,39 @@
+From 7c90bd2ab1fafff65de6ffe9d396b9d712ecda24 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Dec 2023 13:52:55 +0200
+Subject: net/mlx5e: Allow software parsing when IPsec crypto is enabled
+
+From: Leon Romanovsky <leonro@nvidia.com>
+
+[ Upstream commit 20f5468a7988dedd94a57ba8acd65ebda6a59723 ]
+
+All ConnectX devices have software parsing capability enabled, but it is
+more correct to set allow_swp only if capability exists, which for IPsec
+means that crypto offload is supported.
+
+Fixes: 2451da081a34 ("net/mlx5: Unify device IPsec capabilities check")
+Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en/params.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+index e097f336e1c4..30507b7c2fb1 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+@@ -1062,8 +1062,8 @@ void mlx5e_build_sq_param(struct mlx5_core_dev *mdev,
+       void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
+       bool allow_swp;
+-      allow_swp =
+-              mlx5_geneve_tx_allowed(mdev) || !!mlx5_ipsec_device_caps(mdev);
++      allow_swp = mlx5_geneve_tx_allowed(mdev) ||
++                  (mlx5_ipsec_device_caps(mdev) & MLX5_IPSEC_CAP_CRYPTO);
+       mlx5e_build_sq_param_common(mdev, param);
+       MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size);
+       MLX5_SET(sqc, sqc, allow_swp, allow_swp);
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-mlx5e-fix-a-double-free-in-arfs_create_groups.patch b/queue-6.6/net-mlx5e-fix-a-double-free-in-arfs_create_groups.patch
new file mode 100644 (file)
index 0000000..eabb175
--- /dev/null
@@ -0,0 +1,100 @@
+From a999382790f95d77671c82459ded8a59f3b43b45 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 15:17:36 +0800
+Subject: net/mlx5e: fix a double-free in arfs_create_groups
+
+From: Zhipeng Lu <alexious@zju.edu.cn>
+
+[ Upstream commit 3c6d5189246f590e4e1f167991558bdb72a4738b ]
+
+When `in` allocated by kvzalloc fails, arfs_create_groups will free
+ft->g and return an error. However, arfs_create_table, the only caller of
+arfs_create_groups, will hold this error and call to
+mlx5e_destroy_flow_table, in which the ft->g will be freed again.
+
+Fixes: 1cabe6b0965e ("net/mlx5e: Create aRFS flow tables")
+Signed-off-by: Zhipeng Lu <alexious@zju.edu.cn>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../net/ethernet/mellanox/mlx5/core/en_arfs.c | 26 +++++++++++--------
+ 1 file changed, 15 insertions(+), 11 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+index bb7f86c993e5..e66f486faafe 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+@@ -254,11 +254,13 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft,
+       ft->g = kcalloc(MLX5E_ARFS_NUM_GROUPS,
+                       sizeof(*ft->g), GFP_KERNEL);
+-      in = kvzalloc(inlen, GFP_KERNEL);
+-      if  (!in || !ft->g) {
+-              kfree(ft->g);
+-              kvfree(in);
++      if (!ft->g)
+               return -ENOMEM;
++
++      in = kvzalloc(inlen, GFP_KERNEL);
++      if (!in) {
++              err = -ENOMEM;
++              goto err_free_g;
+       }
+       mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
+@@ -278,7 +280,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft,
+               break;
+       default:
+               err = -EINVAL;
+-              goto out;
++              goto err_free_in;
+       }
+       switch (type) {
+@@ -300,7 +302,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft,
+               break;
+       default:
+               err = -EINVAL;
+-              goto out;
++              goto err_free_in;
+       }
+       MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+@@ -309,7 +311,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft,
+       MLX5_SET_CFG(in, end_flow_index, ix - 1);
+       ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+       if (IS_ERR(ft->g[ft->num_groups]))
+-              goto err;
++              goto err_clean_group;
+       ft->num_groups++;
+       memset(in, 0, inlen);
+@@ -318,18 +320,20 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft,
+       MLX5_SET_CFG(in, end_flow_index, ix - 1);
+       ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+       if (IS_ERR(ft->g[ft->num_groups]))
+-              goto err;
++              goto err_clean_group;
+       ft->num_groups++;
+       kvfree(in);
+       return 0;
+-err:
++err_clean_group:
+       err = PTR_ERR(ft->g[ft->num_groups]);
+       ft->g[ft->num_groups] = NULL;
+-out:
++err_free_in:
+       kvfree(in);
+-
++err_free_g:
++      kfree(ft->g);
++      ft->g = NULL;
+       return err;
+ }
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-mlx5e-fix-a-potential-double-free-in-fs_any_crea.patch b/queue-6.6/net-mlx5e-fix-a-potential-double-free-in-fs_any_crea.patch
new file mode 100644 (file)
index 0000000..dcab677
--- /dev/null
@@ -0,0 +1,40 @@
+From d7ccda1acf1057d69981953e66087f55dc0daf9e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 28 Nov 2023 17:29:01 +0800
+Subject: net/mlx5e: fix a potential double-free in fs_any_create_groups
+
+From: Dinghao Liu <dinghao.liu@zju.edu.cn>
+
+[ Upstream commit aef855df7e1bbd5aa4484851561211500b22707e ]
+
+When kcalloc() for ft->g succeeds but kvzalloc() for in fails,
+fs_any_create_groups() will free ft->g. However, its caller
+fs_any_create_table() will free ft->g again through calling
+mlx5e_destroy_flow_table(), which will lead to a double-free.
+Fix this by setting ft->g to NULL in fs_any_create_groups().
+
+Fixes: 0f575c20bf06 ("net/mlx5e: Introduce Flow Steering ANY API")
+Signed-off-by: Dinghao Liu <dinghao.liu@zju.edu.cn>
+Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c b/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c
+index e1283531e0b8..671adbad0a40 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c
+@@ -436,6 +436,7 @@ static int fs_any_create_groups(struct mlx5e_flow_table *ft)
+       in = kvzalloc(inlen, GFP_KERNEL);
+       if  (!in || !ft->g) {
+               kfree(ft->g);
++              ft->g = NULL;
+               kvfree(in);
+               return -ENOMEM;
+       }
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-mlx5e-fix-operation-precedence-bug-in-port-times.patch b/queue-6.6/net-mlx5e-fix-operation-precedence-bug-in-port-times.patch
new file mode 100644 (file)
index 0000000..3aa24f8
--- /dev/null
@@ -0,0 +1,41 @@
+From 7393f91797c248e3fd72d418ad4692e02bba7058 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 22 Nov 2023 18:32:11 -0800
+Subject: net/mlx5e: Fix operation precedence bug in port timestamping
+ napi_poll context
+
+From: Rahul Rameshbabu <rrameshbabu@nvidia.com>
+
+[ Upstream commit 3876638b2c7ebb2c9d181de1191db0de8cac143a ]
+
+Indirection (*) is of lower precedence than postfix increment (++). Logic
+in napi_poll context would cause an out-of-bound read by first increment
+the pointer address by byte address space and then dereference the value.
+Rather, the intended logic was to dereference first and then increment the
+underlying value.
+
+Fixes: 92214be5979c ("net/mlx5e: Update doorbell for port timestamping CQ before the software counter")
+Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
+Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+index af3928eddafd..803035d4e597 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+@@ -213,7 +213,7 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq,
+       mlx5e_ptpsq_mark_ts_cqes_undelivered(ptpsq, hwtstamp);
+ out:
+       napi_consume_skb(skb, budget);
+-      md_buff[*md_buff_sz++] = metadata_id;
++      md_buff[(*md_buff_sz)++] = metadata_id;
+       if (unlikely(mlx5e_ptp_metadata_map_unhealthy(&ptpsq->metadata_map)) &&
+           !test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
+               queue_work(ptpsq->txqsq.priv->wq, &ptpsq->report_unhealthy_work);
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-mlx5e-fix-peer-flow-lists-handling.patch b/queue-6.6/net-mlx5e-fix-peer-flow-lists-handling.patch
new file mode 100644 (file)
index 0000000..6090d43
--- /dev/null
@@ -0,0 +1,126 @@
+From fc863a43647a9252fc4207aae912d7822ced707d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 10 Nov 2023 11:10:22 +0100
+Subject: net/mlx5e: Fix peer flow lists handling
+
+From: Vlad Buslov <vladbu@nvidia.com>
+
+[ Upstream commit d76fdd31f953ac5046555171620f2562715e9b71 ]
+
+The cited change refactored mlx5e_tc_del_fdb_peer_flow() to only clear DUP
+flag when list of peer flows has become empty. However, if any concurrent
+user holds a reference to a peer flow (for example, the neighbor update
+workqueue task is updating peer flow's parent encap entry concurrently),
+then the flow will not be removed from the peer list and, consecutively,
+DUP flag will remain set. Since mlx5e_tc_del_fdb_peers_flow() calls
+mlx5e_tc_del_fdb_peer_flow() for every possible peer index the algorithm
+will try to remove the flow from eswitch instances that it has never peered
+with causing either NULL pointer dereference when trying to remove the flow
+peer list head of peer_index that was never initialized or a warning if the
+list debug config is enabled[0].
+
+Fix the issue by always removing the peer flow from the list even when not
+releasing the last reference to it.
+
+[0]:
+
+[ 3102.985806] ------------[ cut here ]------------
+[ 3102.986223] list_del corruption, ffff888139110698->next is NULL
+[ 3102.986757] WARNING: CPU: 2 PID: 22109 at lib/list_debug.c:53 __list_del_entry_valid_or_report+0x4f/0xc0
+[ 3102.987561] Modules linked in: act_ct nf_flow_table bonding act_tunnel_key act_mirred act_skbedit vxlan cls_matchall nfnetlink_cttimeout act_gact cls_flower sch_ingress mlx5_vdpa vringh vhost_iotlb vdpa openvswitch nsh xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcg
+ss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5_core [last unloaded: bonding]
+[ 3102.991113] CPU: 2 PID: 22109 Comm: revalidator28 Not tainted 6.6.0-rc6+ #3
+[ 3102.991695] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
+[ 3102.992605] RIP: 0010:__list_del_entry_valid_or_report+0x4f/0xc0
+[ 3102.993122] Code: 39 c2 74 56 48 8b 32 48 39 fe 75 62 48 8b 51 08 48 39 f2 75 73 b8 01 00 00 00 c3 48 89 fe 48 c7 c7 48 fd 0a 82 e8 41 0b ad ff <0f> 0b 31 c0 c3 48 89 fe 48 c7 c7 70 fd 0a 82 e8 2d 0b ad ff 0f 0b
+[ 3102.994615] RSP: 0018:ffff8881383e7710 EFLAGS: 00010286
+[ 3102.995078] RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000000
+[ 3102.995670] RDX: 0000000000000001 RSI: ffff88885f89b640 RDI: ffff88885f89b640
+[ 3102.997188] DEL flow 00000000be367878 on port 0
+[ 3102.998594] RBP: dead000000000122 R08: 0000000000000000 R09: c0000000ffffdfff
+[ 3102.999604] R10: 0000000000000008 R11: ffff8881383e7598 R12: dead000000000100
+[ 3103.000198] R13: 0000000000000002 R14: ffff888139110000 R15: ffff888101901240
+[ 3103.000790] FS:  00007f424cde4700(0000) GS:ffff88885f880000(0000) knlGS:0000000000000000
+[ 3103.001486] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 3103.001986] CR2: 00007fd42e8dcb70 CR3: 000000011e68a003 CR4: 0000000000370ea0
+[ 3103.002596] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+[ 3103.003190] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+[ 3103.003787] Call Trace:
+[ 3103.004055]  <TASK>
+[ 3103.004297]  ? __warn+0x7d/0x130
+[ 3103.004623]  ? __list_del_entry_valid_or_report+0x4f/0xc0
+[ 3103.005094]  ? report_bug+0xf1/0x1c0
+[ 3103.005439]  ? console_unlock+0x4a/0xd0
+[ 3103.005806]  ? handle_bug+0x3f/0x70
+[ 3103.006149]  ? exc_invalid_op+0x13/0x60
+[ 3103.006531]  ? asm_exc_invalid_op+0x16/0x20
+[ 3103.007430]  ? __list_del_entry_valid_or_report+0x4f/0xc0
+[ 3103.007910]  mlx5e_tc_del_fdb_peers_flow+0xcf/0x240 [mlx5_core]
+[ 3103.008463]  mlx5e_tc_del_flow+0x46/0x270 [mlx5_core]
+[ 3103.008944]  mlx5e_flow_put+0x26/0x50 [mlx5_core]
+[ 3103.009401]  mlx5e_delete_flower+0x25f/0x380 [mlx5_core]
+[ 3103.009901]  tc_setup_cb_destroy+0xab/0x180
+[ 3103.010292]  fl_hw_destroy_filter+0x99/0xc0 [cls_flower]
+[ 3103.010779]  __fl_delete+0x2d4/0x2f0 [cls_flower]
+[ 3103.011207]  fl_delete+0x36/0x80 [cls_flower]
+[ 3103.011614]  tc_del_tfilter+0x56f/0x750
+[ 3103.011982]  rtnetlink_rcv_msg+0xff/0x3a0
+[ 3103.012362]  ? netlink_ack+0x1c7/0x4e0
+[ 3103.012719]  ? rtnl_calcit.isra.44+0x130/0x130
+[ 3103.013134]  netlink_rcv_skb+0x54/0x100
+[ 3103.013533]  netlink_unicast+0x1ca/0x2b0
+[ 3103.013902]  netlink_sendmsg+0x361/0x4d0
+[ 3103.014269]  __sock_sendmsg+0x38/0x60
+[ 3103.014643]  ____sys_sendmsg+0x1f2/0x200
+[ 3103.015018]  ? copy_msghdr_from_user+0x72/0xa0
+[ 3103.015265]  ___sys_sendmsg+0x87/0xd0
+[ 3103.016608]  ? copy_msghdr_from_user+0x72/0xa0
+[ 3103.017014]  ? ___sys_recvmsg+0x9b/0xd0
+[ 3103.017381]  ? ttwu_do_activate.isra.137+0x58/0x180
+[ 3103.017821]  ? wake_up_q+0x49/0x90
+[ 3103.018157]  ? futex_wake+0x137/0x160
+[ 3103.018521]  ? __sys_sendmsg+0x51/0x90
+[ 3103.018882]  __sys_sendmsg+0x51/0x90
+[ 3103.019230]  ? exit_to_user_mode_prepare+0x56/0x130
+[ 3103.019670]  do_syscall_64+0x3c/0x80
+[ 3103.020017]  entry_SYSCALL_64_after_hwframe+0x46/0xb0
+[ 3103.020469] RIP: 0033:0x7f4254811ef4
+[ 3103.020816] Code: 89 f3 48 83 ec 10 48 89 7c 24 08 48 89 14 24 e8 42 eb ff ff 48 8b 14 24 41 89 c0 48 89 de 48 8b 7c 24 08 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 30 44 89 c7 48 89 04 24 e8 78 eb ff ff 48 8b
+[ 3103.022290] RSP: 002b:00007f424cdd9480 EFLAGS: 00000293 ORIG_RAX: 000000000000002e
+[ 3103.022970] RAX: ffffffffffffffda RBX: 00007f424cdd9510 RCX: 00007f4254811ef4
+[ 3103.023564] RDX: 0000000000000000 RSI: 00007f424cdd9510 RDI: 0000000000000012
+[ 3103.024158] RBP: 00007f424cdda238 R08: 0000000000000000 R09: 00007f41d801a4b0
+[ 3103.024748] R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000001
+[ 3103.025341] R13: 00007f424cdd9510 R14: 00007f424cdda240 R15: 00007f424cdd99a0
+[ 3103.025931]  </TASK>
+[ 3103.026182] ---[ end trace 0000000000000000 ]---
+[ 3103.027033] ------------[ cut here ]------------
+
+Fixes: 9be6c21fdcf8 ("net/mlx5e: Handle offloads flows per peer")
+Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
+Reviewed-by: Mark Bloch <mbloch@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+index 25e44ee5121a..dc9b157a4499 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+@@ -2012,9 +2012,10 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow,
+       list_for_each_entry_safe(peer_flow, tmp, &flow->peer_flows, peer_flows) {
+               if (peer_index != mlx5_get_dev_index(peer_flow->priv->mdev))
+                       continue;
++
++              list_del(&peer_flow->peer_flows);
+               if (refcount_dec_and_test(&peer_flow->refcnt)) {
+                       mlx5e_tc_del_fdb_flow(peer_flow->priv, peer_flow);
+-                      list_del(&peer_flow->peer_flows);
+                       kfree(peer_flow);
+               }
+       }
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-mlx5e-ignore-ipsec-replay-window-values-on-sende.patch b/queue-6.6/net-mlx5e-ignore-ipsec-replay-window-values-on-sende.patch
new file mode 100644 (file)
index 0000000..f5cf883
--- /dev/null
@@ -0,0 +1,68 @@
+From fe6083067bcb4589a237bb79bfe1c9821384c328 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 26 Nov 2023 11:08:10 +0200
+Subject: net/mlx5e: Ignore IPsec replay window values on sender side
+
+From: Leon Romanovsky <leonro@nvidia.com>
+
+[ Upstream commit 315a597f9bcfe7fe9980985031413457bee95510 ]
+
+XFRM stack doesn't prevent from users to configure replay window
+in TX side and strongswan sets replay_window to be 1. It causes
+to failures in validation logic when trying to offload the SA.
+
+Replay window is not relevant in TX side and should be ignored.
+
+Fixes: cded6d80129b ("net/mlx5e: Store replay window in XFRM attributes")
+Signed-off-by: Aya Levin <ayal@nvidia.com>
+Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c   | 10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
+index 5834e47e72d8..e2ffc572de18 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
+@@ -336,12 +336,17 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry,
+       /* iv len */
+       aes_gcm->icv_len = x->aead->alg_icv_len;
++      attrs->dir = x->xso.dir;
++
+       /* esn */
+       if (x->props.flags & XFRM_STATE_ESN) {
+               attrs->replay_esn.trigger = true;
+               attrs->replay_esn.esn = sa_entry->esn_state.esn;
+               attrs->replay_esn.esn_msb = sa_entry->esn_state.esn_msb;
+               attrs->replay_esn.overlap = sa_entry->esn_state.overlap;
++              if (attrs->dir == XFRM_DEV_OFFLOAD_OUT)
++                      goto skip_replay_window;
++
+               switch (x->replay_esn->replay_window) {
+               case 32:
+                       attrs->replay_esn.replay_window =
+@@ -365,7 +370,7 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry,
+               }
+       }
+-      attrs->dir = x->xso.dir;
++skip_replay_window:
+       /* spi */
+       attrs->spi = be32_to_cpu(x->id.spi);
+@@ -501,7 +506,8 @@ static int mlx5e_xfrm_validate_state(struct mlx5_core_dev *mdev,
+                       return -EINVAL;
+               }
+-              if (x->replay_esn && x->replay_esn->replay_window != 32 &&
++              if (x->replay_esn && x->xso.dir == XFRM_DEV_OFFLOAD_IN &&
++                  x->replay_esn->replay_window != 32 &&
+                   x->replay_esn->replay_window != 64 &&
+                   x->replay_esn->replay_window != 128 &&
+                   x->replay_esn->replay_window != 256) {
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-mvpp2-clear-bm-pool-before-initialization.patch b/queue-6.6/net-mvpp2-clear-bm-pool-before-initialization.patch
new file mode 100644 (file)
index 0000000..43fb96d
--- /dev/null
@@ -0,0 +1,77 @@
+From fd80834d784fc993055f1ff5e1194a7e02c0b83a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 19:59:14 -0800
+Subject: net: mvpp2: clear BM pool before initialization
+
+From: Jenishkumar Maheshbhai Patel <jpatel2@marvell.com>
+
+[ Upstream commit 9f538b415db862e74b8c5d3abbccfc1b2b6caa38 ]
+
+Register value persist after booting the kernel using
+kexec which results in kernel panic. Thus clear the
+BM pool registers before initialisation to fix the issue.
+
+Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit")
+Signed-off-by: Jenishkumar Maheshbhai Patel <jpatel2@marvell.com>
+Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
+Link: https://lore.kernel.org/r/20240119035914.2595665-1-jpatel2@marvell.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../net/ethernet/marvell/mvpp2/mvpp2_main.c   | 27 ++++++++++++++++++-
+ 1 file changed, 26 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+index 21c3f9b015c8..aca17082b9ec 100644
+--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
++++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+@@ -614,12 +614,38 @@ static void mvpp23_bm_set_8pool_mode(struct mvpp2 *priv)
+       mvpp2_write(priv, MVPP22_BM_POOL_BASE_ADDR_HIGH_REG, val);
+ }
++/* Cleanup pool before actual initialization in the OS */
++static void mvpp2_bm_pool_cleanup(struct mvpp2 *priv, int pool_id)
++{
++      unsigned int thread = mvpp2_cpu_to_thread(priv, get_cpu());
++      u32 val;
++      int i;
++
++      /* Drain the BM from all possible residues left by firmware */
++      for (i = 0; i < MVPP2_BM_POOL_SIZE_MAX; i++)
++              mvpp2_thread_read(priv, thread, MVPP2_BM_PHY_ALLOC_REG(pool_id));
++
++      put_cpu();
++
++      /* Stop the BM pool */
++      val = mvpp2_read(priv, MVPP2_BM_POOL_CTRL_REG(pool_id));
++      val |= MVPP2_BM_STOP_MASK;
++      mvpp2_write(priv, MVPP2_BM_POOL_CTRL_REG(pool_id), val);
++}
++
+ static int mvpp2_bm_init(struct device *dev, struct mvpp2 *priv)
+ {
+       enum dma_data_direction dma_dir = DMA_FROM_DEVICE;
+       int i, err, poolnum = MVPP2_BM_POOLS_NUM;
+       struct mvpp2_port *port;
++      if (priv->percpu_pools)
++              poolnum = mvpp2_get_nrxqs(priv) * 2;
++
++      /* Clean up the pool state in case it contains stale state */
++      for (i = 0; i < poolnum; i++)
++              mvpp2_bm_pool_cleanup(priv, i);
++
+       if (priv->percpu_pools) {
+               for (i = 0; i < priv->port_count; i++) {
+                       port = priv->port_list[i];
+@@ -629,7 +655,6 @@ static int mvpp2_bm_init(struct device *dev, struct mvpp2 *priv)
+                       }
+               }
+-              poolnum = mvpp2_get_nrxqs(priv) * 2;
+               for (i = 0; i < poolnum; i++) {
+                       /* the pool in use */
+                       int pn = i / (poolnum / 2);
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-rds-fix-ubsan-array-index-out-of-bounds-in-rds_c.patch b/queue-6.6/net-rds-fix-ubsan-array-index-out-of-bounds-in-rds_c.patch
new file mode 100644 (file)
index 0000000..2bb1bc3
--- /dev/null
@@ -0,0 +1,71 @@
+From a767ed016555274d24d06e30826c54ba74e0c0a8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 17:48:39 -0800
+Subject: net/rds: Fix UBSAN: array-index-out-of-bounds in rds_cmsg_recv
+
+From: Sharath Srinivasan <sharath.srinivasan@oracle.com>
+
+[ Upstream commit 13e788deb7348cc88df34bed736c3b3b9927ea52 ]
+
+Syzcaller UBSAN crash occurs in rds_cmsg_recv(),
+which reads inc->i_rx_lat_trace[j + 1] with index 4 (3 + 1),
+but with array size of 4 (RDS_RX_MAX_TRACES).
+Here 'j' is assigned from rs->rs_rx_trace[i] and in-turn from
+trace.rx_trace_pos[i] in rds_recv_track_latency(),
+with both arrays sized 3 (RDS_MSG_RX_DGRAM_TRACE_MAX). So fix the
+off-by-one bounds check in rds_recv_track_latency() to prevent
+a potential crash in rds_cmsg_recv().
+
+Found by syzcaller:
+=================================================================
+UBSAN: array-index-out-of-bounds in net/rds/recv.c:585:39
+index 4 is out of range for type 'u64 [4]'
+CPU: 1 PID: 8058 Comm: syz-executor228 Not tainted 6.6.0-gd2f51b3516da #1
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
+BIOS 1.15.0-1 04/01/2014
+Call Trace:
+ <TASK>
+ __dump_stack lib/dump_stack.c:88 [inline]
+ dump_stack_lvl+0x136/0x150 lib/dump_stack.c:106
+ ubsan_epilogue lib/ubsan.c:217 [inline]
+ __ubsan_handle_out_of_bounds+0xd5/0x130 lib/ubsan.c:348
+ rds_cmsg_recv+0x60d/0x700 net/rds/recv.c:585
+ rds_recvmsg+0x3fb/0x1610 net/rds/recv.c:716
+ sock_recvmsg_nosec net/socket.c:1044 [inline]
+ sock_recvmsg+0xe2/0x160 net/socket.c:1066
+ __sys_recvfrom+0x1b6/0x2f0 net/socket.c:2246
+ __do_sys_recvfrom net/socket.c:2264 [inline]
+ __se_sys_recvfrom net/socket.c:2260 [inline]
+ __x64_sys_recvfrom+0xe0/0x1b0 net/socket.c:2260
+ do_syscall_x64 arch/x86/entry/common.c:51 [inline]
+ do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82
+ entry_SYSCALL_64_after_hwframe+0x63/0x6b
+==================================================================
+
+Fixes: 3289025aedc0 ("RDS: add receive message trace used by application")
+Reported-by: Chenyuan Yang <chenyuan0y@gmail.com>
+Closes: https://lore.kernel.org/linux-rdma/CALGdzuoVdq-wtQ4Az9iottBqC5cv9ZhcE5q8N7LfYFvkRsOVcw@mail.gmail.com/
+Signed-off-by: Sharath Srinivasan <sharath.srinivasan@oracle.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/rds/af_rds.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
+index 01c4cdfef45d..8435a20968ef 100644
+--- a/net/rds/af_rds.c
++++ b/net/rds/af_rds.c
+@@ -419,7 +419,7 @@ static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval,
+       rs->rs_rx_traces = trace.rx_traces;
+       for (i = 0; i < rs->rs_rx_traces; i++) {
+-              if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
++              if (trace.rx_trace_pos[i] >= RDS_MSG_RX_DGRAM_TRACE_MAX) {
+                       rs->rs_rx_traces = 0;
+                       return -EFAULT;
+               }
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-sched-flower-fix-chain-template-offload.patch b/queue-6.6/net-sched-flower-fix-chain-template-offload.patch
new file mode 100644 (file)
index 0000000..43250e9
--- /dev/null
@@ -0,0 +1,190 @@
+From d67e18be087db2c13dd8a5330ae3b06720b9b591 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 15:28:43 +0200
+Subject: net/sched: flower: Fix chain template offload
+
+From: Ido Schimmel <idosch@nvidia.com>
+
+[ Upstream commit 32f2a0afa95fae0d1ceec2ff06e0e816939964b8 ]
+
+When a qdisc is deleted from a net device the stack instructs the
+underlying driver to remove its flow offload callback from the
+associated filter block using the 'FLOW_BLOCK_UNBIND' command. The stack
+then continues to replay the removal of the filters in the block for
+this driver by iterating over the chains in the block and invoking the
+'reoffload' operation of the classifier being used. In turn, the
+classifier in its 'reoffload' operation prepares and emits a
+'FLOW_CLS_DESTROY' command for each filter.
+
+However, the stack does not do the same for chain templates and the
+underlying driver never receives a 'FLOW_CLS_TMPLT_DESTROY' command when
+a qdisc is deleted. This results in a memory leak [1] which can be
+reproduced using [2].
+
+Fix by introducing a 'tmplt_reoffload' operation and have the stack
+invoke it with the appropriate arguments as part of the replay.
+Implement the operation in the sole classifier that supports chain
+templates (flower) by emitting the 'FLOW_CLS_TMPLT_{CREATE,DESTROY}'
+command based on whether a flow offload callback is being bound to a
+filter block or being unbound from one.
+
+As far as I can tell, the issue happens since cited commit which
+reordered tcf_block_offload_unbind() before tcf_block_flush_all_chains()
+in __tcf_block_put(). The order cannot be reversed as the filter block
+is expected to be freed after flushing all the chains.
+
+[1]
+unreferenced object 0xffff888107e28800 (size 2048):
+  comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s)
+  hex dump (first 32 bytes):
+    b1 a6 7c 11 81 88 ff ff e0 5b b3 10 81 88 ff ff  ..|......[......
+    01 00 00 00 00 00 00 00 e0 aa b0 84 ff ff ff ff  ................
+  backtrace:
+    [<ffffffff81c06a68>] __kmem_cache_alloc_node+0x1e8/0x320
+    [<ffffffff81ab374e>] __kmalloc+0x4e/0x90
+    [<ffffffff832aec6d>] mlxsw_sp_acl_ruleset_get+0x34d/0x7a0
+    [<ffffffff832bc195>] mlxsw_sp_flower_tmplt_create+0x145/0x180
+    [<ffffffff832b2e1a>] mlxsw_sp_flow_block_cb+0x1ea/0x280
+    [<ffffffff83a10613>] tc_setup_cb_call+0x183/0x340
+    [<ffffffff83a9f85a>] fl_tmplt_create+0x3da/0x4c0
+    [<ffffffff83a22435>] tc_ctl_chain+0xa15/0x1170
+    [<ffffffff838a863c>] rtnetlink_rcv_msg+0x3cc/0xed0
+    [<ffffffff83ac87f0>] netlink_rcv_skb+0x170/0x440
+    [<ffffffff83ac6270>] netlink_unicast+0x540/0x820
+    [<ffffffff83ac6e28>] netlink_sendmsg+0x8d8/0xda0
+    [<ffffffff83793def>] ____sys_sendmsg+0x30f/0xa80
+    [<ffffffff8379d29a>] ___sys_sendmsg+0x13a/0x1e0
+    [<ffffffff8379d50c>] __sys_sendmsg+0x11c/0x1f0
+    [<ffffffff843b9ce0>] do_syscall_64+0x40/0xe0
+unreferenced object 0xffff88816d2c0400 (size 1024):
+  comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s)
+  hex dump (first 32 bytes):
+    40 00 00 00 00 00 00 00 57 f6 38 be 00 00 00 00  @.......W.8.....
+    10 04 2c 6d 81 88 ff ff 10 04 2c 6d 81 88 ff ff  ..,m......,m....
+  backtrace:
+    [<ffffffff81c06a68>] __kmem_cache_alloc_node+0x1e8/0x320
+    [<ffffffff81ab36c1>] __kmalloc_node+0x51/0x90
+    [<ffffffff81a8ed96>] kvmalloc_node+0xa6/0x1f0
+    [<ffffffff82827d03>] bucket_table_alloc.isra.0+0x83/0x460
+    [<ffffffff82828d2b>] rhashtable_init+0x43b/0x7c0
+    [<ffffffff832aed48>] mlxsw_sp_acl_ruleset_get+0x428/0x7a0
+    [<ffffffff832bc195>] mlxsw_sp_flower_tmplt_create+0x145/0x180
+    [<ffffffff832b2e1a>] mlxsw_sp_flow_block_cb+0x1ea/0x280
+    [<ffffffff83a10613>] tc_setup_cb_call+0x183/0x340
+    [<ffffffff83a9f85a>] fl_tmplt_create+0x3da/0x4c0
+    [<ffffffff83a22435>] tc_ctl_chain+0xa15/0x1170
+    [<ffffffff838a863c>] rtnetlink_rcv_msg+0x3cc/0xed0
+    [<ffffffff83ac87f0>] netlink_rcv_skb+0x170/0x440
+    [<ffffffff83ac6270>] netlink_unicast+0x540/0x820
+    [<ffffffff83ac6e28>] netlink_sendmsg+0x8d8/0xda0
+    [<ffffffff83793def>] ____sys_sendmsg+0x30f/0xa80
+
+[2]
+ # tc qdisc add dev swp1 clsact
+ # tc chain add dev swp1 ingress proto ip chain 1 flower dst_ip 0.0.0.0/32
+ # tc qdisc del dev swp1 clsact
+ # devlink dev reload pci/0000:06:00.0
+
+Fixes: bbf73830cd48 ("net: sched: traverse chains in block with tcf_get_next_chain()")
+Signed-off-by: Ido Schimmel <idosch@nvidia.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/sch_generic.h |  4 ++++
+ net/sched/cls_api.c       |  9 ++++++++-
+ net/sched/cls_flower.c    | 23 +++++++++++++++++++++++
+ 3 files changed, 35 insertions(+), 1 deletion(-)
+
+diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
+index f232512505f8..e940debac400 100644
+--- a/include/net/sch_generic.h
++++ b/include/net/sch_generic.h
+@@ -376,6 +376,10 @@ struct tcf_proto_ops {
+                                               struct nlattr **tca,
+                                               struct netlink_ext_ack *extack);
+       void                    (*tmplt_destroy)(void *tmplt_priv);
++      void                    (*tmplt_reoffload)(struct tcf_chain *chain,
++                                                 bool add,
++                                                 flow_setup_cb_t *cb,
++                                                 void *cb_priv);
+       struct tcf_exts *       (*get_exts)(const struct tcf_proto *tp,
+                                           u32 handle);
+diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
+index a193cc7b3241..84e18b5f72a3 100644
+--- a/net/sched/cls_api.c
++++ b/net/sched/cls_api.c
+@@ -1536,6 +1536,9 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb,
+            chain_prev = chain,
+                    chain = __tcf_get_next_chain(block, chain),
+                    tcf_chain_put(chain_prev)) {
++              if (chain->tmplt_ops && add)
++                      chain->tmplt_ops->tmplt_reoffload(chain, true, cb,
++                                                        cb_priv);
+               for (tp = __tcf_get_next_proto(chain, NULL); tp;
+                    tp_prev = tp,
+                            tp = __tcf_get_next_proto(chain, tp),
+@@ -1551,6 +1554,9 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb,
+                               goto err_playback_remove;
+                       }
+               }
++              if (chain->tmplt_ops && !add)
++                      chain->tmplt_ops->tmplt_reoffload(chain, false, cb,
++                                                        cb_priv);
+       }
+       return 0;
+@@ -2950,7 +2956,8 @@ static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net,
+       ops = tcf_proto_lookup_ops(name, true, extack);
+       if (IS_ERR(ops))
+               return PTR_ERR(ops);
+-      if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) {
++      if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump ||
++          !ops->tmplt_reoffload) {
+               NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier");
+               module_put(ops->owner);
+               return -EOPNOTSUPP;
+diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
+index e5314a31f75a..efb9d2811b73 100644
+--- a/net/sched/cls_flower.c
++++ b/net/sched/cls_flower.c
+@@ -2721,6 +2721,28 @@ static void fl_tmplt_destroy(void *tmplt_priv)
+       kfree(tmplt);
+ }
++static void fl_tmplt_reoffload(struct tcf_chain *chain, bool add,
++                             flow_setup_cb_t *cb, void *cb_priv)
++{
++      struct fl_flow_tmplt *tmplt = chain->tmplt_priv;
++      struct flow_cls_offload cls_flower = {};
++
++      cls_flower.rule = flow_rule_alloc(0);
++      if (!cls_flower.rule)
++              return;
++
++      cls_flower.common.chain_index = chain->index;
++      cls_flower.command = add ? FLOW_CLS_TMPLT_CREATE :
++                                 FLOW_CLS_TMPLT_DESTROY;
++      cls_flower.cookie = (unsigned long) tmplt;
++      cls_flower.rule->match.dissector = &tmplt->dissector;
++      cls_flower.rule->match.mask = &tmplt->mask;
++      cls_flower.rule->match.key = &tmplt->dummy_key;
++
++      cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv);
++      kfree(cls_flower.rule);
++}
++
+ static int fl_dump_key_val(struct sk_buff *skb,
+                          void *val, int val_type,
+                          void *mask, int mask_type, int len)
+@@ -3628,6 +3650,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
+       .bind_class     = fl_bind_class,
+       .tmplt_create   = fl_tmplt_create,
+       .tmplt_destroy  = fl_tmplt_destroy,
++      .tmplt_reoffload = fl_tmplt_reoffload,
+       .tmplt_dump     = fl_tmplt_dump,
+       .get_exts       = fl_get_exts,
+       .owner          = THIS_MODULE,
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-smc-fix-illegal-rmb_desc-access-in-smc-d-connect.patch b/queue-6.6/net-smc-fix-illegal-rmb_desc-access-in-smc-d-connect.patch
new file mode 100644 (file)
index 0000000..677e016
--- /dev/null
@@ -0,0 +1,87 @@
+From fff92e91ba328439aad1ad95a571f9d01628d9cd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 12:32:10 +0800
+Subject: net/smc: fix illegal rmb_desc access in SMC-D connection dump
+
+From: Wen Gu <guwen@linux.alibaba.com>
+
+[ Upstream commit dbc153fd3c142909e564bb256da087e13fbf239c ]
+
+A crash was found when dumping SMC-D connections. It can be reproduced
+by following steps:
+
+- run nginx/wrk test:
+  smc_run nginx
+  smc_run wrk -t 16 -c 1000 -d <duration> -H 'Connection: Close' <URL>
+
+- continuously dump SMC-D connections in parallel:
+  watch -n 1 'smcss -D'
+
+ BUG: kernel NULL pointer dereference, address: 0000000000000030
+ CPU: 2 PID: 7204 Comm: smcss Kdump: loaded Tainted: G E      6.7.0+ #55
+ RIP: 0010:__smc_diag_dump.constprop.0+0x5e5/0x620 [smc_diag]
+ Call Trace:
+  <TASK>
+  ? __die+0x24/0x70
+  ? page_fault_oops+0x66/0x150
+  ? exc_page_fault+0x69/0x140
+  ? asm_exc_page_fault+0x26/0x30
+  ? __smc_diag_dump.constprop.0+0x5e5/0x620 [smc_diag]
+  ? __kmalloc_node_track_caller+0x35d/0x430
+  ? __alloc_skb+0x77/0x170
+  smc_diag_dump_proto+0xd0/0xf0 [smc_diag]
+  smc_diag_dump+0x26/0x60 [smc_diag]
+  netlink_dump+0x19f/0x320
+  __netlink_dump_start+0x1dc/0x300
+  smc_diag_handler_dump+0x6a/0x80 [smc_diag]
+  ? __pfx_smc_diag_dump+0x10/0x10 [smc_diag]
+  sock_diag_rcv_msg+0x121/0x140
+  ? __pfx_sock_diag_rcv_msg+0x10/0x10
+  netlink_rcv_skb+0x5a/0x110
+  sock_diag_rcv+0x28/0x40
+  netlink_unicast+0x22a/0x330
+  netlink_sendmsg+0x1f8/0x420
+  __sock_sendmsg+0xb0/0xc0
+  ____sys_sendmsg+0x24e/0x300
+  ? copy_msghdr_from_user+0x62/0x80
+  ___sys_sendmsg+0x7c/0xd0
+  ? __do_fault+0x34/0x160
+  ? do_read_fault+0x5f/0x100
+  ? do_fault+0xb0/0x110
+  ? __handle_mm_fault+0x2b0/0x6c0
+  __sys_sendmsg+0x4d/0x80
+  do_syscall_64+0x69/0x180
+  entry_SYSCALL_64_after_hwframe+0x6e/0x76
+
+It is possible that the connection is in process of being established
+when we dump it. Assumed that the connection has been registered in a
+link group by smc_conn_create() but the rmb_desc has not yet been
+initialized by smc_buf_create(), thus causing the illegal access to
+conn->rmb_desc. So fix it by checking before dump.
+
+Fixes: 4b1b7d3b30a6 ("net/smc: add SMC-D diag support")
+Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
+Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
+Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/smc/smc_diag.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
+index 2c464d76b06c..37833b96b508 100644
+--- a/net/smc/smc_diag.c
++++ b/net/smc/smc_diag.c
+@@ -163,7 +163,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
+       }
+       if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd &&
+           (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) &&
+-          !list_empty(&smc->conn.lgr->list)) {
++          !list_empty(&smc->conn.lgr->list) && smc->conn.rmb_desc) {
+               struct smc_connection *conn = &smc->conn;
+               struct smcd_diag_dmbinfo dinfo;
+               struct smcd_dev *smcd = conn->lgr->smcd;
+-- 
+2.43.0
+
diff --git a/queue-6.6/net-stmmac-wait-a-bit-for-the-reset-to-take-effect.patch b/queue-6.6/net-stmmac-wait-a-bit-for-the-reset-to-take-effect.patch
new file mode 100644 (file)
index 0000000..3e7263d
--- /dev/null
@@ -0,0 +1,63 @@
+From 264fa9d0041aa2c4c7ce85b9734ad9883997d136 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 19:19:09 +0100
+Subject: net: stmmac: Wait a bit for the reset to take effect
+
+From: Bernd Edlinger <bernd.edlinger@hotmail.de>
+
+[ Upstream commit a5f5eee282a0aae80227697e1d9c811b1726d31d ]
+
+otherwise the synopsys_id value may be read out wrong,
+because the GMAC_VERSION register might still be in reset
+state, for at least 1 us after the reset is de-asserted.
+
+Add a wait for 10 us before continuing to be on the safe side.
+
+> From what have you got that delay value?
+
+Just try and error, with very old linux versions and old gcc versions
+the synopsys_id was read out correctly most of the time (but not always),
+with recent linux versions and recnet gcc versions it was read out
+wrongly most of the time, but again not always.
+I don't have access to the VHDL code in question, so I cannot
+tell why it takes so long to get the correct values, I also do not
+have more than a few hardware samples, so I cannot tell how long
+this timeout must be in worst case.
+Experimentally I can tell that the register is read several times
+as zero immediately after the reset is de-asserted, also adding several
+no-ops is not enough, adding a printk is enough, also udelay(1) seems to
+be enough but I tried that not very often, and I have not access to many
+hardware samples to be 100% sure about the necessary delay.
+And since the udelay here is only executed once per device instance,
+it seems acceptable to delay the boot for 10 us.
+
+BTW: my hardware's synopsys id is 0x37.
+
+Fixes: c5e4ddbdfa11 ("net: stmmac: Add support for optional reset control")
+Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
+Link: https://lore.kernel.org/r/AS8P193MB1285A810BD78C111E7F6AA34E4752@AS8P193MB1285.EURP193.PROD.OUTLOOK.COM
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+index 684ec7058c82..292857c0e601 100644
+--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
++++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+@@ -7441,6 +7441,9 @@ int stmmac_dvr_probe(struct device *device,
+               dev_err(priv->device, "unable to bring out of ahb reset: %pe\n",
+                       ERR_PTR(ret));
++      /* Wait a bit for the reset to take effect */
++      udelay(10);
++
+       /* Init MAC and get the capabilities */
+       ret = stmmac_hw_init(priv);
+       if (ret)
+-- 
+2.43.0
+
diff --git a/queue-6.6/netfilter-nf_tables-restrict-anonymous-set-and-map-n.patch b/queue-6.6/netfilter-nf_tables-restrict-anonymous-set-and-map-n.patch
new file mode 100644 (file)
index 0000000..f0752c3
--- /dev/null
@@ -0,0 +1,60 @@
+From 7c981514c167ba49ad7ee5dd97a3e0e67b119874 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 13:34:32 +0100
+Subject: netfilter: nf_tables: restrict anonymous set and map names to 16
+ bytes
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit b462579b2b86a8f5230543cadd3a4836be27baf7 ]
+
+nftables has two types of sets/maps, one where userspace defines the
+name, and anonymous sets/maps, where userspace defines a template name.
+
+For the latter, kernel requires presence of exactly one "%d".
+nftables uses "__set%d" and "__map%d" for this.  The kernel will
+expand the format specifier and replaces it with the smallest unused
+number.
+
+As-is, userspace could define a template name that allows to move
+the set name past the 256 bytes upperlimit (post-expansion).
+
+I don't see how this could be a problem, but I would prefer if userspace
+cannot do this, so add a limit of 16 bytes for the '%d' template name.
+
+16 bytes is the old total upper limit for set names that existed when
+nf_tables was merged initially.
+
+Fixes: 387454901bd6 ("netfilter: nf_tables: Allow set names of up to 255 chars")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_tables_api.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
+index b28fbcb86e94..bad58df478a7 100644
+--- a/net/netfilter/nf_tables_api.c
++++ b/net/netfilter/nf_tables_api.c
+@@ -24,6 +24,7 @@
+ #include <net/sock.h>
+ #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
++#define NFT_SET_MAX_ANONLEN 16
+ unsigned int nf_tables_net_id __read_mostly;
+@@ -4351,6 +4352,9 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
+               if (p[1] != 'd' || strchr(p + 2, '%'))
+                       return -EINVAL;
++              if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN)
++                      return -EINVAL;
++
+               inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL);
+               if (inuse == NULL)
+                       return -ENOMEM;
+-- 
+2.43.0
+
diff --git a/queue-6.6/netfilter-nf_tables-validate-nfproto_-family.patch b/queue-6.6/netfilter-nf_tables-validate-nfproto_-family.patch
new file mode 100644 (file)
index 0000000..8006b23
--- /dev/null
@@ -0,0 +1,196 @@
+From b425af5f70851260303b3d9c5d0d89f4ed55c6b0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 16:38:25 +0100
+Subject: netfilter: nf_tables: validate NFPROTO_* family
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+[ Upstream commit d0009effa8862c20a13af4cb7475d9771b905693 ]
+
+Several expressions explicitly refer to NF_INET_* hook definitions
+from expr->ops->validate, however, family is not validated.
+
+Bail out with EOPNOTSUPP in case they are used from unsupported
+families.
+
+Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables")
+Fixes: a3c90f7a2323 ("netfilter: nf_tables: flow offload expression")
+Fixes: 2fa841938c64 ("netfilter: nf_tables: introduce routing expression")
+Fixes: 554ced0a6e29 ("netfilter: nf_tables: add support for native socket matching")
+Fixes: ad49d86e07a4 ("netfilter: nf_tables: Add synproxy support")
+Fixes: 4ed8eb6570a4 ("netfilter: nf_tables: Add native tproxy support")
+Fixes: 6c47260250fc ("netfilter: nf_tables: add xfrm expression")
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nft_compat.c       | 12 ++++++++++++
+ net/netfilter/nft_flow_offload.c |  5 +++++
+ net/netfilter/nft_nat.c          |  5 +++++
+ net/netfilter/nft_rt.c           |  5 +++++
+ net/netfilter/nft_socket.c       |  5 +++++
+ net/netfilter/nft_synproxy.c     |  7 +++++--
+ net/netfilter/nft_tproxy.c       |  5 +++++
+ net/netfilter/nft_xfrm.c         |  5 +++++
+ 8 files changed, 47 insertions(+), 2 deletions(-)
+
+diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
+index 5284cd2ad532..f0eeda97bfcd 100644
+--- a/net/netfilter/nft_compat.c
++++ b/net/netfilter/nft_compat.c
+@@ -350,6 +350,12 @@ static int nft_target_validate(const struct nft_ctx *ctx,
+       unsigned int hook_mask = 0;
+       int ret;
++      if (ctx->family != NFPROTO_IPV4 &&
++          ctx->family != NFPROTO_IPV6 &&
++          ctx->family != NFPROTO_BRIDGE &&
++          ctx->family != NFPROTO_ARP)
++              return -EOPNOTSUPP;
++
+       if (nft_is_base_chain(ctx->chain)) {
+               const struct nft_base_chain *basechain =
+                                               nft_base_chain(ctx->chain);
+@@ -595,6 +601,12 @@ static int nft_match_validate(const struct nft_ctx *ctx,
+       unsigned int hook_mask = 0;
+       int ret;
++      if (ctx->family != NFPROTO_IPV4 &&
++          ctx->family != NFPROTO_IPV6 &&
++          ctx->family != NFPROTO_BRIDGE &&
++          ctx->family != NFPROTO_ARP)
++              return -EOPNOTSUPP;
++
+       if (nft_is_base_chain(ctx->chain)) {
+               const struct nft_base_chain *basechain =
+                                               nft_base_chain(ctx->chain);
+diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
+index ab3362c483b4..397351fa4d5f 100644
+--- a/net/netfilter/nft_flow_offload.c
++++ b/net/netfilter/nft_flow_offload.c
+@@ -384,6 +384,11 @@ static int nft_flow_offload_validate(const struct nft_ctx *ctx,
+ {
+       unsigned int hook_mask = (1 << NF_INET_FORWARD);
++      if (ctx->family != NFPROTO_IPV4 &&
++          ctx->family != NFPROTO_IPV6 &&
++          ctx->family != NFPROTO_INET)
++              return -EOPNOTSUPP;
++
+       return nft_chain_validate_hooks(ctx->chain, hook_mask);
+ }
+diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
+index 583885ce7232..808f5802c270 100644
+--- a/net/netfilter/nft_nat.c
++++ b/net/netfilter/nft_nat.c
+@@ -143,6 +143,11 @@ static int nft_nat_validate(const struct nft_ctx *ctx,
+       struct nft_nat *priv = nft_expr_priv(expr);
+       int err;
++      if (ctx->family != NFPROTO_IPV4 &&
++          ctx->family != NFPROTO_IPV6 &&
++          ctx->family != NFPROTO_INET)
++              return -EOPNOTSUPP;
++
+       err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
+       if (err < 0)
+               return err;
+diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
+index 35a2c28caa60..24d977138572 100644
+--- a/net/netfilter/nft_rt.c
++++ b/net/netfilter/nft_rt.c
+@@ -166,6 +166,11 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp
+       const struct nft_rt *priv = nft_expr_priv(expr);
+       unsigned int hooks;
++      if (ctx->family != NFPROTO_IPV4 &&
++          ctx->family != NFPROTO_IPV6 &&
++          ctx->family != NFPROTO_INET)
++              return -EOPNOTSUPP;
++
+       switch (priv->key) {
+       case NFT_RT_NEXTHOP4:
+       case NFT_RT_NEXTHOP6:
+diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
+index 9ed85be79452..f30163e2ca62 100644
+--- a/net/netfilter/nft_socket.c
++++ b/net/netfilter/nft_socket.c
+@@ -242,6 +242,11 @@ static int nft_socket_validate(const struct nft_ctx *ctx,
+                              const struct nft_expr *expr,
+                              const struct nft_data **data)
+ {
++      if (ctx->family != NFPROTO_IPV4 &&
++          ctx->family != NFPROTO_IPV6 &&
++          ctx->family != NFPROTO_INET)
++              return -EOPNOTSUPP;
++
+       return nft_chain_validate_hooks(ctx->chain,
+                                       (1 << NF_INET_PRE_ROUTING) |
+                                       (1 << NF_INET_LOCAL_IN) |
+diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
+index 13da882669a4..1d737f89dfc1 100644
+--- a/net/netfilter/nft_synproxy.c
++++ b/net/netfilter/nft_synproxy.c
+@@ -186,7 +186,6 @@ static int nft_synproxy_do_init(const struct nft_ctx *ctx,
+               break;
+ #endif
+       case NFPROTO_INET:
+-      case NFPROTO_BRIDGE:
+               err = nf_synproxy_ipv4_init(snet, ctx->net);
+               if (err)
+                       goto nf_ct_failure;
+@@ -219,7 +218,6 @@ static void nft_synproxy_do_destroy(const struct nft_ctx *ctx)
+               break;
+ #endif
+       case NFPROTO_INET:
+-      case NFPROTO_BRIDGE:
+               nf_synproxy_ipv4_fini(snet, ctx->net);
+               nf_synproxy_ipv6_fini(snet, ctx->net);
+               break;
+@@ -253,6 +251,11 @@ static int nft_synproxy_validate(const struct nft_ctx *ctx,
+                                const struct nft_expr *expr,
+                                const struct nft_data **data)
+ {
++      if (ctx->family != NFPROTO_IPV4 &&
++          ctx->family != NFPROTO_IPV6 &&
++          ctx->family != NFPROTO_INET)
++              return -EOPNOTSUPP;
++
+       return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) |
+                                                   (1 << NF_INET_FORWARD));
+ }
+diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
+index ae15cd693f0e..71412adb73d4 100644
+--- a/net/netfilter/nft_tproxy.c
++++ b/net/netfilter/nft_tproxy.c
+@@ -316,6 +316,11 @@ static int nft_tproxy_validate(const struct nft_ctx *ctx,
+                              const struct nft_expr *expr,
+                              const struct nft_data **data)
+ {
++      if (ctx->family != NFPROTO_IPV4 &&
++          ctx->family != NFPROTO_IPV6 &&
++          ctx->family != NFPROTO_INET)
++              return -EOPNOTSUPP;
++
+       return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING);
+ }
+diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
+index 452f8587adda..1c866757db55 100644
+--- a/net/netfilter/nft_xfrm.c
++++ b/net/netfilter/nft_xfrm.c
+@@ -235,6 +235,11 @@ static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *e
+       const struct nft_xfrm *priv = nft_expr_priv(expr);
+       unsigned int hooks;
++      if (ctx->family != NFPROTO_IPV4 &&
++          ctx->family != NFPROTO_IPV6 &&
++          ctx->family != NFPROTO_INET)
++              return -EOPNOTSUPP;
++
+       switch (priv->dir) {
+       case XFRM_POLICY_IN:
+               hooks = (1 << NF_INET_FORWARD) |
+-- 
+2.43.0
+
diff --git a/queue-6.6/netfilter-nft_limit-reject-configurations-that-cause.patch b/queue-6.6/netfilter-nft_limit-reject-configurations-that-cause.patch
new file mode 100644 (file)
index 0000000..b34eb8c
--- /dev/null
@@ -0,0 +1,83 @@
+From 998d4aae3b7603145fb0d6e462c4b7677e2c634b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 13:11:32 +0100
+Subject: netfilter: nft_limit: reject configurations that cause integer
+ overflow
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit c9d9eb9c53d37cdebbad56b91e40baf42d5a97aa ]
+
+Reject bogus configs where internal token counter wraps around.
+This only occurs with very very large requests, such as 17gbyte/s.
+
+Its better to reject this rather than having incorrect ratelimit.
+
+Fixes: d2168e849ebf ("netfilter: nft_limit: add per-byte limiting")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nft_limit.c | 23 ++++++++++++++++-------
+ 1 file changed, 16 insertions(+), 7 deletions(-)
+
+diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
+index 79039afde34e..cefa25e0dbb0 100644
+--- a/net/netfilter/nft_limit.c
++++ b/net/netfilter/nft_limit.c
+@@ -58,17 +58,19 @@ static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost)
+ static int nft_limit_init(struct nft_limit_priv *priv,
+                         const struct nlattr * const tb[], bool pkts)
+ {
++      u64 unit, tokens, rate_with_burst;
+       bool invert = false;
+-      u64 unit, tokens;
+       if (tb[NFTA_LIMIT_RATE] == NULL ||
+           tb[NFTA_LIMIT_UNIT] == NULL)
+               return -EINVAL;
+       priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
++      if (priv->rate == 0)
++              return -EINVAL;
++
+       unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
+-      priv->nsecs = unit * NSEC_PER_SEC;
+-      if (priv->rate == 0 || priv->nsecs < unit)
++      if (check_mul_overflow(unit, NSEC_PER_SEC, &priv->nsecs))
+               return -EOVERFLOW;
+       if (tb[NFTA_LIMIT_BURST])
+@@ -77,18 +79,25 @@ static int nft_limit_init(struct nft_limit_priv *priv,
+       if (pkts && priv->burst == 0)
+               priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT;
+-      if (priv->rate + priv->burst < priv->rate)
++      if (check_add_overflow(priv->rate, priv->burst, &rate_with_burst))
+               return -EOVERFLOW;
+       if (pkts) {
+-              tokens = div64_u64(priv->nsecs, priv->rate) * priv->burst;
++              u64 tmp = div64_u64(priv->nsecs, priv->rate);
++
++              if (check_mul_overflow(tmp, priv->burst, &tokens))
++                      return -EOVERFLOW;
+       } else {
++              u64 tmp;
++
+               /* The token bucket size limits the number of tokens can be
+                * accumulated. tokens_max specifies the bucket size.
+                * tokens_max = unit * (rate + burst) / rate.
+                */
+-              tokens = div64_u64(priv->nsecs * (priv->rate + priv->burst),
+-                               priv->rate);
++              if (check_mul_overflow(priv->nsecs, rate_with_burst, &tmp))
++                      return -EOVERFLOW;
++
++              tokens = div64_u64(tmp, priv->rate);
+       }
+       if (tb[NFTA_LIMIT_FLAGS]) {
+-- 
+2.43.0
+
diff --git a/queue-6.6/netfs-fscache-prevent-oops-in-fscache_put_cache.patch b/queue-6.6/netfs-fscache-prevent-oops-in-fscache_put_cache.patch
new file mode 100644 (file)
index 0000000..1954665
--- /dev/null
@@ -0,0 +1,44 @@
+From c64fd15dd673caa6944140c91c582526022281a7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 12 Jan 2024 09:59:41 +0300
+Subject: netfs, fscache: Prevent Oops in fscache_put_cache()
+
+From: Dan Carpenter <dan.carpenter@linaro.org>
+
+[ Upstream commit 3be0b3ed1d76c6703b9ee482b55f7e01c369cc68 ]
+
+This function dereferences "cache" and then checks if it's
+IS_ERR_OR_NULL().  Check first, then dereference.
+
+Fixes: 9549332df4ed ("fscache: Implement cache registration")
+Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/e84bc740-3502-4f16-982a-a40d5676615c@moroto.mountain/ # v2
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/fscache/cache.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
+index d645f8b302a2..9397ed39b0b4 100644
+--- a/fs/fscache/cache.c
++++ b/fs/fscache/cache.c
+@@ -179,13 +179,14 @@ EXPORT_SYMBOL(fscache_acquire_cache);
+ void fscache_put_cache(struct fscache_cache *cache,
+                      enum fscache_cache_trace where)
+ {
+-      unsigned int debug_id = cache->debug_id;
++      unsigned int debug_id;
+       bool zero;
+       int ref;
+       if (IS_ERR_OR_NULL(cache))
+               return;
++      debug_id = cache->debug_id;
+       zero = __refcount_dec_and_test(&cache->ref, &ref);
+       trace_fscache_cache(debug_id, ref - 1, where);
+-- 
+2.43.0
+
diff --git a/queue-6.6/netlink-fix-potential-sleeping-issue-in-mqueue_flush.patch b/queue-6.6/netlink-fix-potential-sleeping-issue-in-mqueue_flush.patch
new file mode 100644 (file)
index 0000000..d091288
--- /dev/null
@@ -0,0 +1,76 @@
+From 1212fe5426423b34e0f08d1c34182d9395c23de8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 09:18:07 +0800
+Subject: netlink: fix potential sleeping issue in mqueue_flush_file
+
+From: Zhengchao Shao <shaozhengchao@huawei.com>
+
+[ Upstream commit 234ec0b6034b16869d45128b8cd2dc6ffe596f04 ]
+
+I analyze the potential sleeping issue of the following processes:
+Thread A                                Thread B
+...                                     netlink_create  //ref = 1
+do_mq_notify                            ...
+  sock = netlink_getsockbyfilp          ...     //ref = 2
+  info->notify_sock = sock;             ...
+...                                     netlink_sendmsg
+...                                       skb = netlink_alloc_large_skb  //skb->head is vmalloced
+...                                       netlink_unicast
+...                                         sk = netlink_getsockbyportid //ref = 3
+...                                         netlink_sendskb
+...                                           __netlink_sendskb
+...                                             skb_queue_tail //put skb to sk_receive_queue
+...                                         sock_put //ref = 2
+...                                     ...
+...                                     netlink_release
+...                                       deferred_put_nlk_sk //ref = 1
+mqueue_flush_file
+  spin_lock
+  remove_notification
+    netlink_sendskb
+      sock_put  //ref = 0
+        sk_free
+          ...
+          __sk_destruct
+            netlink_sock_destruct
+              skb_queue_purge  //get skb from sk_receive_queue
+                ...
+                __skb_queue_purge_reason
+                  kfree_skb_reason
+                    __kfree_skb
+                    ...
+                    skb_release_all
+                      skb_release_head_state
+                        netlink_skb_destructor
+                          vfree(skb->head)  //sleeping while holding spinlock
+
+In netlink_sendmsg, if the memory pointed to by skb->head is allocated by
+vmalloc, and is put to sk_receive_queue queue, also the skb is not freed.
+When the mqueue executes flush, the sleeping bug will occur. Use
+vfree_atomic instead of vfree in netlink_skb_destructor to solve the issue.
+
+Fixes: c05cdb1b864f ("netlink: allow large data transfers from user-space")
+Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
+Link: https://lore.kernel.org/r/20240122011807.2110357-1-shaozhengchao@huawei.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netlink/af_netlink.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
+index eb086b06d60d..d9107b545d36 100644
+--- a/net/netlink/af_netlink.c
++++ b/net/netlink/af_netlink.c
+@@ -374,7 +374,7 @@ static void netlink_skb_destructor(struct sk_buff *skb)
+       if (is_vmalloc_addr(skb->head)) {
+               if (!skb->cloned ||
+                   !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
+-                      vfree(skb->head);
++                      vfree_atomic(skb->head);
+               skb->head = NULL;
+       }
+-- 
+2.43.0
+
diff --git a/queue-6.6/rcu-defer-rcu-kthreads-wakeup-when-cpu-is-dying.patch b/queue-6.6/rcu-defer-rcu-kthreads-wakeup-when-cpu-is-dying.patch
new file mode 100644 (file)
index 0000000..d01daef
--- /dev/null
@@ -0,0 +1,141 @@
+From 25442abcd36f14a6891e57e93be65b28cb2cb890 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 19 Dec 2023 00:19:15 +0100
+Subject: rcu: Defer RCU kthreads wakeup when CPU is dying
+
+From: Frederic Weisbecker <frederic@kernel.org>
+
+[ Upstream commit e787644caf7628ad3269c1fbd321c3255cf51710 ]
+
+When the CPU goes idle for the last time during the CPU down hotplug
+process, RCU reports a final quiescent state for the current CPU. If
+this quiescent state propagates up to the top, some tasks may then be
+woken up to complete the grace period: the main grace period kthread
+and/or the expedited main workqueue (or kworker).
+
+If those kthreads have a SCHED_FIFO policy, the wake up can indirectly
+arm the RT bandwith timer to the local offline CPU. Since this happens
+after hrtimers have been migrated at CPUHP_AP_HRTIMERS_DYING stage, the
+timer gets ignored. Therefore if the RCU kthreads are waiting for RT
+bandwidth to be available, they may never be actually scheduled.
+
+This triggers TREE03 rcutorture hangs:
+
+        rcu: INFO: rcu_preempt self-detected stall on CPU
+        rcu:     4-...!: (1 GPs behind) idle=9874/1/0x4000000000000000 softirq=0/0 fqs=20 rcuc=21071 jiffies(starved)
+        rcu:     (t=21035 jiffies g=938281 q=40787 ncpus=6)
+        rcu: rcu_preempt kthread starved for 20964 jiffies! g938281 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x0 ->cpu=0
+        rcu:     Unless rcu_preempt kthread gets sufficient CPU time, OOM is now expected behavior.
+        rcu: RCU grace-period kthread stack dump:
+        task:rcu_preempt     state:R  running task     stack:14896 pid:14    tgid:14    ppid:2      flags:0x00004000
+        Call Trace:
+         <TASK>
+         __schedule+0x2eb/0xa80
+         schedule+0x1f/0x90
+         schedule_timeout+0x163/0x270
+         ? __pfx_process_timeout+0x10/0x10
+         rcu_gp_fqs_loop+0x37c/0x5b0
+         ? __pfx_rcu_gp_kthread+0x10/0x10
+         rcu_gp_kthread+0x17c/0x200
+         kthread+0xde/0x110
+         ? __pfx_kthread+0x10/0x10
+         ret_from_fork+0x2b/0x40
+         ? __pfx_kthread+0x10/0x10
+         ret_from_fork_asm+0x1b/0x30
+         </TASK>
+
+The situation can't be solved with just unpinning the timer. The hrtimer
+infrastructure and the nohz heuristics involved in finding the best
+remote target for an unpinned timer would then also need to handle
+enqueues from an offline CPU in the most horrendous way.
+
+So fix this on the RCU side instead and defer the wake up to an online
+CPU if it's too late for the local one.
+
+Reported-by: Paul E. McKenney <paulmck@kernel.org>
+Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
+Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
+Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
+Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/rcu/tree.c     | 34 +++++++++++++++++++++++++++++++++-
+ kernel/rcu/tree_exp.h |  3 +--
+ 2 files changed, 34 insertions(+), 3 deletions(-)
+
+diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
+index 9af42eae1ba3..4fe47ed95eeb 100644
+--- a/kernel/rcu/tree.c
++++ b/kernel/rcu/tree.c
+@@ -1013,6 +1013,38 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
+       return needmore;
+ }
++static void swake_up_one_online_ipi(void *arg)
++{
++      struct swait_queue_head *wqh = arg;
++
++      swake_up_one(wqh);
++}
++
++static void swake_up_one_online(struct swait_queue_head *wqh)
++{
++      int cpu = get_cpu();
++
++      /*
++       * If called from rcutree_report_cpu_starting(), wake up
++       * is dangerous that late in the CPU-down hotplug process. The
++       * scheduler might queue an ignored hrtimer. Defer the wake up
++       * to an online CPU instead.
++       */
++      if (unlikely(cpu_is_offline(cpu))) {
++              int target;
++
++              target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU),
++                                       cpu_online_mask);
++
++              smp_call_function_single(target, swake_up_one_online_ipi,
++                                       wqh, 0);
++              put_cpu();
++      } else {
++              put_cpu();
++              swake_up_one(wqh);
++      }
++}
++
+ /*
+  * Awaken the grace-period kthread.  Don't do a self-awaken (unless in an
+  * interrupt or softirq handler, in which case we just might immediately
+@@ -1037,7 +1069,7 @@ static void rcu_gp_kthread_wake(void)
+               return;
+       WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
+       WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
+-      swake_up_one(&rcu_state.gp_wq);
++      swake_up_one_online(&rcu_state.gp_wq);
+ }
+ /*
+diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
+index 8239b39d945b..6e87dc764f47 100644
+--- a/kernel/rcu/tree_exp.h
++++ b/kernel/rcu/tree_exp.h
+@@ -173,7 +173,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
+       return ret;
+ }
+-
+ /*
+  * Report the exit from RCU read-side critical section for the last task
+  * that queued itself during or before the current expedited preemptible-RCU
+@@ -201,7 +200,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
+                       raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+                       if (wake) {
+                               smp_mb(); /* EGP done before wake_up(). */
+-                              swake_up_one(&rcu_state.expedited_wq);
++                              swake_up_one_online(&rcu_state.expedited_wq);
+                       }
+                       break;
+               }
+-- 
+2.43.0
+
diff --git a/queue-6.6/selftest-don-t-reuse-port-for-so_incoming_cpu-test.patch b/queue-6.6/selftest-don-t-reuse-port-for-so_incoming_cpu-test.patch
new file mode 100644 (file)
index 0000000..6cb6a67
--- /dev/null
@@ -0,0 +1,231 @@
+From e172b3627d5871d6afe0da3566ba055ff9f14ca7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 19:16:42 -0800
+Subject: selftest: Don't reuse port for SO_INCOMING_CPU test.
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+[ Upstream commit 97de5a15edf2d22184f5ff588656030bbb7fa358 ]
+
+Jakub reported that ASSERT_EQ(cpu, i) in so_incoming_cpu.c seems to
+fire somewhat randomly.
+
+  # #  RUN           so_incoming_cpu.before_reuseport.test3 ...
+  # # so_incoming_cpu.c:191:test3:Expected cpu (32) == i (0)
+  # # test3: Test terminated by assertion
+  # #          FAIL  so_incoming_cpu.before_reuseport.test3
+  # not ok 3 so_incoming_cpu.before_reuseport.test3
+
+When the test failed, not-yet-accepted CLOSE_WAIT sockets received
+SYN with a "challenging" SEQ number, which was sent from an unexpected
+CPU that did not create the receiver.
+
+The test basically does:
+
+  1. for each cpu:
+    1-1. create a server
+    1-2. set SO_INCOMING_CPU
+
+  2. for each cpu:
+    2-1. set cpu affinity
+    2-2. create some clients
+    2-3. let clients connect() to the server on the same cpu
+    2-4. close() clients
+
+  3. for each server:
+    3-1. accept() all child sockets
+    3-2. check if all children have the same SO_INCOMING_CPU with the server
+
+The root cause was the close() in 2-4. and net.ipv4.tcp_tw_reuse.
+
+In a loop of 2., close() changed the client state to FIN_WAIT_2, and
+the peer transitioned to CLOSE_WAIT.
+
+In another loop of 2., connect() happened to select the same port of
+the FIN_WAIT_2 socket, and it was reused as the default value of
+net.ipv4.tcp_tw_reuse is 2.
+
+As a result, the new client sent SYN to the CLOSE_WAIT socket from
+a different CPU, and the receiver's sk_incoming_cpu was overwritten
+with unexpected CPU ID.
+
+Also, the SYN had a different SEQ number, so the CLOSE_WAIT socket
+responded with Challenge ACK.  The new client properly returned RST
+and effectively killed the CLOSE_WAIT socket.
+
+This way, all clients were created successfully, but the error was
+detected later by 3-2., ASSERT_EQ(cpu, i).
+
+To avoid the failure, let's make sure that (i) the number of clients
+is less than the number of available ports and (ii) such reuse never
+happens.
+
+Fixes: 6df96146b202 ("selftest: Add test for SO_INCOMING_CPU.")
+Reported-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Tested-by: Jakub Kicinski <kuba@kernel.org>
+Link: https://lore.kernel.org/r/20240120031642.67014-1-kuniyu@amazon.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/so_incoming_cpu.c | 68 ++++++++++++++-----
+ 1 file changed, 50 insertions(+), 18 deletions(-)
+
+diff --git a/tools/testing/selftests/net/so_incoming_cpu.c b/tools/testing/selftests/net/so_incoming_cpu.c
+index a14818164102..e9fa14e10732 100644
+--- a/tools/testing/selftests/net/so_incoming_cpu.c
++++ b/tools/testing/selftests/net/so_incoming_cpu.c
+@@ -3,19 +3,16 @@
+ #define _GNU_SOURCE
+ #include <sched.h>
++#include <fcntl.h>
++
+ #include <netinet/in.h>
+ #include <sys/socket.h>
+ #include <sys/sysinfo.h>
+ #include "../kselftest_harness.h"
+-#define CLIENT_PER_SERVER     32 /* More sockets, more reliable */
+-#define NR_SERVER             self->nproc
+-#define NR_CLIENT             (CLIENT_PER_SERVER * NR_SERVER)
+-
+ FIXTURE(so_incoming_cpu)
+ {
+-      int nproc;
+       int *servers;
+       union {
+               struct sockaddr addr;
+@@ -56,12 +53,47 @@ FIXTURE_VARIANT_ADD(so_incoming_cpu, after_all_listen)
+       .when_to_set = AFTER_ALL_LISTEN,
+ };
++static void write_sysctl(struct __test_metadata *_metadata,
++                       char *filename, char *string)
++{
++      int fd, len, ret;
++
++      fd = open(filename, O_WRONLY);
++      ASSERT_NE(fd, -1);
++
++      len = strlen(string);
++      ret = write(fd, string, len);
++      ASSERT_EQ(ret, len);
++}
++
++static void setup_netns(struct __test_metadata *_metadata)
++{
++      ASSERT_EQ(unshare(CLONE_NEWNET), 0);
++      ASSERT_EQ(system("ip link set lo up"), 0);
++
++      write_sysctl(_metadata, "/proc/sys/net/ipv4/ip_local_port_range", "10000 60001");
++      write_sysctl(_metadata, "/proc/sys/net/ipv4/tcp_tw_reuse", "0");
++}
++
++#define NR_PORT                               (60001 - 10000 - 1)
++#define NR_CLIENT_PER_SERVER_DEFAULT  32
++static int nr_client_per_server, nr_server, nr_client;
++
+ FIXTURE_SETUP(so_incoming_cpu)
+ {
+-      self->nproc = get_nprocs();
+-      ASSERT_LE(2, self->nproc);
++      setup_netns(_metadata);
++
++      nr_server = get_nprocs();
++      ASSERT_LE(2, nr_server);
++
++      if (NR_CLIENT_PER_SERVER_DEFAULT * nr_server < NR_PORT)
++              nr_client_per_server = NR_CLIENT_PER_SERVER_DEFAULT;
++      else
++              nr_client_per_server = NR_PORT / nr_server;
++
++      nr_client = nr_client_per_server * nr_server;
+-      self->servers = malloc(sizeof(int) * NR_SERVER);
++      self->servers = malloc(sizeof(int) * nr_server);
+       ASSERT_NE(self->servers, NULL);
+       self->in_addr.sin_family = AF_INET;
+@@ -74,7 +106,7 @@ FIXTURE_TEARDOWN(so_incoming_cpu)
+ {
+       int i;
+-      for (i = 0; i < NR_SERVER; i++)
++      for (i = 0; i < nr_server; i++)
+               close(self->servers[i]);
+       free(self->servers);
+@@ -110,10 +142,10 @@ int create_server(struct __test_metadata *_metadata,
+       if (variant->when_to_set == BEFORE_LISTEN)
+               set_so_incoming_cpu(_metadata, fd, cpu);
+-      /* We don't use CLIENT_PER_SERVER here not to block
++      /* We don't use nr_client_per_server here not to block
+        * this test at connect() if SO_INCOMING_CPU is broken.
+        */
+-      ret = listen(fd, NR_CLIENT);
++      ret = listen(fd, nr_client);
+       ASSERT_EQ(ret, 0);
+       if (variant->when_to_set == AFTER_LISTEN)
+@@ -128,7 +160,7 @@ void create_servers(struct __test_metadata *_metadata,
+ {
+       int i, ret;
+-      for (i = 0; i < NR_SERVER; i++) {
++      for (i = 0; i < nr_server; i++) {
+               self->servers[i] = create_server(_metadata, self, variant, i);
+               if (i == 0) {
+@@ -138,7 +170,7 @@ void create_servers(struct __test_metadata *_metadata,
+       }
+       if (variant->when_to_set == AFTER_ALL_LISTEN) {
+-              for (i = 0; i < NR_SERVER; i++)
++              for (i = 0; i < nr_server; i++)
+                       set_so_incoming_cpu(_metadata, self->servers[i], i);
+       }
+ }
+@@ -149,7 +181,7 @@ void create_clients(struct __test_metadata *_metadata,
+       cpu_set_t cpu_set;
+       int i, j, fd, ret;
+-      for (i = 0; i < NR_SERVER; i++) {
++      for (i = 0; i < nr_server; i++) {
+               CPU_ZERO(&cpu_set);
+               CPU_SET(i, &cpu_set);
+@@ -162,7 +194,7 @@ void create_clients(struct __test_metadata *_metadata,
+               ret = sched_setaffinity(0, sizeof(cpu_set), &cpu_set);
+               ASSERT_EQ(ret, 0);
+-              for (j = 0; j < CLIENT_PER_SERVER; j++) {
++              for (j = 0; j < nr_client_per_server; j++) {
+                       fd  = socket(AF_INET, SOCK_STREAM, 0);
+                       ASSERT_NE(fd, -1);
+@@ -180,8 +212,8 @@ void verify_incoming_cpu(struct __test_metadata *_metadata,
+       int i, j, fd, cpu, ret, total = 0;
+       socklen_t len = sizeof(int);
+-      for (i = 0; i < NR_SERVER; i++) {
+-              for (j = 0; j < CLIENT_PER_SERVER; j++) {
++      for (i = 0; i < nr_server; i++) {
++              for (j = 0; j < nr_client_per_server; j++) {
+                       /* If we see -EAGAIN here, SO_INCOMING_CPU is broken */
+                       fd = accept(self->servers[i], &self->addr, &self->addrlen);
+                       ASSERT_NE(fd, -1);
+@@ -195,7 +227,7 @@ void verify_incoming_cpu(struct __test_metadata *_metadata,
+               }
+       }
+-      ASSERT_EQ(total, NR_CLIENT);
++      ASSERT_EQ(total, nr_client);
+       TH_LOG("SO_INCOMING_CPU is very likely to be "
+              "working correctly with %d sockets.", total);
+ }
+-- 
+2.43.0
+
diff --git a/queue-6.6/selftests-bonding-do-not-test-arp-ns-target-with-mod.patch b/queue-6.6/selftests-bonding-do-not-test-arp-ns-target-with-mod.patch
new file mode 100644 (file)
index 0000000..4f873ef
--- /dev/null
@@ -0,0 +1,63 @@
+From 5e571a63fbab26f1f130953fc67d7180143df056 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 15:59:17 +0800
+Subject: selftests: bonding: do not test arp/ns target with mode
+ balance-alb/tlb
+
+From: Hangbin Liu <liuhangbin@gmail.com>
+
+[ Upstream commit a2933a8759a62269754e54733d993b19de870e84 ]
+
+The prio_arp/ns tests hard code the mode to active-backup. At the same
+time, The balance-alb/tlb modes do not support arp/ns target. So remove
+the prio_arp/ns tests from the loop and only test active-backup mode.
+
+Fixes: 481b56e0391e ("selftests: bonding: re-format bond option tests")
+Reported-by: Jay Vosburgh <jay.vosburgh@canonical.com>
+Closes: https://lore.kernel.org/netdev/17415.1705965957@famine/
+Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
+Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
+Link: https://lore.kernel.org/r/20240123075917.1576360-1-liuhangbin@gmail.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../testing/selftests/drivers/net/bonding/bond_options.sh | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh
+index c54d1697f439..d508486cc0bd 100755
+--- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh
++++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh
+@@ -162,7 +162,7 @@ prio_arp()
+       local mode=$1
+       for primary_reselect in 0 1 2; do
+-              prio_test "mode active-backup arp_interval 100 arp_ip_target ${g_ip4} primary eth1 primary_reselect $primary_reselect"
++              prio_test "mode $mode arp_interval 100 arp_ip_target ${g_ip4} primary eth1 primary_reselect $primary_reselect"
+               log_test "prio" "$mode arp_ip_target primary_reselect $primary_reselect"
+       done
+ }
+@@ -178,7 +178,7 @@ prio_ns()
+       fi
+       for primary_reselect in 0 1 2; do
+-              prio_test "mode active-backup arp_interval 100 ns_ip6_target ${g_ip6} primary eth1 primary_reselect $primary_reselect"
++              prio_test "mode $mode arp_interval 100 ns_ip6_target ${g_ip6} primary eth1 primary_reselect $primary_reselect"
+               log_test "prio" "$mode ns_ip6_target primary_reselect $primary_reselect"
+       done
+ }
+@@ -194,9 +194,9 @@ prio()
+       for mode in $modes; do
+               prio_miimon $mode
+-              prio_arp $mode
+-              prio_ns $mode
+       done
++      prio_arp "active-backup"
++      prio_ns "active-backup"
+ }
+ arp_validate_test()
+-- 
+2.43.0
+
diff --git a/queue-6.6/selftests-bonding-increase-timeout-to-1200s.patch b/queue-6.6/selftests-bonding-increase-timeout-to-1200s.patch
new file mode 100644 (file)
index 0000000..0b6844d
--- /dev/null
@@ -0,0 +1,56 @@
+From 36694814a356ab1babaef9c760f7939542ef77e3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 19:12:32 -0500
+Subject: selftests: bonding: Increase timeout to 1200s
+
+From: Benjamin Poirier <bpoirier@nvidia.com>
+
+[ Upstream commit b01f15a7571b7aa222458bc9bf26ab59bd84e384 ]
+
+When tests are run by runner.sh, bond_options.sh gets killed before
+it can complete:
+
+make -C tools/testing/selftests run_tests TARGETS="drivers/net/bonding"
+       [...]
+       # timeout set to 120
+       # selftests: drivers/net/bonding: bond_options.sh
+       # TEST: prio (active-backup miimon primary_reselect 0)                [ OK ]
+       # TEST: prio (active-backup miimon primary_reselect 1)                [ OK ]
+       # TEST: prio (active-backup miimon primary_reselect 2)                [ OK ]
+       # TEST: prio (active-backup arp_ip_target primary_reselect 0)         [ OK ]
+       # TEST: prio (active-backup arp_ip_target primary_reselect 1)         [ OK ]
+       # TEST: prio (active-backup arp_ip_target primary_reselect 2)         [ OK ]
+       #
+       not ok 7 selftests: drivers/net/bonding: bond_options.sh # TIMEOUT 120 seconds
+
+This test includes many sleep statements, at least some of which are
+related to timers in the operation of the bonding driver itself. Increase
+the test timeout to allow the test to complete.
+
+I ran the test in slightly different VMs (including one without HW
+virtualization support) and got runtimes of 13m39.760s, 13m31.238s, and
+13m2.956s. Use a ~1.5x "safety factor" and set the timeout to 1200s.
+
+Fixes: 42a8d4aaea84 ("selftests: bonding: add bonding prio option test")
+Reported-by: Jakub Kicinski <kuba@kernel.org>
+Closes: https://lore.kernel.org/netdev/20240116104402.1203850a@kernel.org/#t
+Suggested-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Benjamin Poirier <bpoirier@nvidia.com>
+Reviewed-by: Hangbin Liu <liuhangbin@gmail.com>
+Link: https://lore.kernel.org/r/20240118001233.304759-1-bpoirier@nvidia.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/drivers/net/bonding/settings | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/testing/selftests/drivers/net/bonding/settings b/tools/testing/selftests/drivers/net/bonding/settings
+index 6091b45d226b..79b65bdf05db 100644
+--- a/tools/testing/selftests/drivers/net/bonding/settings
++++ b/tools/testing/selftests/drivers/net/bonding/settings
+@@ -1 +1 @@
+-timeout=120
++timeout=1200
+-- 
+2.43.0
+
diff --git a/queue-6.6/selftests-fill-in-some-missing-configs-for-net.patch b/queue-6.6/selftests-fill-in-some-missing-configs-for-net.patch
new file mode 100644 (file)
index 0000000..6d390f9
--- /dev/null
@@ -0,0 +1,117 @@
+From 747c876e953ce89b719a939e459240ba7cb67b6b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 12:35:28 -0800
+Subject: selftests: fill in some missing configs for net
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 04fe7c5029cbdbcdb28917f09a958d939a8f19f7 ]
+
+We are missing a lot of config options from net selftests,
+it seems:
+
+tun/tap:     CONFIG_TUN, CONFIG_MACVLAN, CONFIG_MACVTAP
+fib_tests:   CONFIG_NET_SCH_FQ_CODEL
+l2tp:        CONFIG_L2TP, CONFIG_L2TP_V3, CONFIG_L2TP_IP, CONFIG_L2TP_ETH
+sctp-vrf:    CONFIG_INET_DIAG
+txtimestamp: CONFIG_NET_CLS_U32
+vxlan_mdb:   CONFIG_BRIDGE_VLAN_FILTERING
+gre_gso:     CONFIG_NET_IPGRE_DEMUX, CONFIG_IP_GRE, CONFIG_IPV6_GRE
+srv6_end_dt*_l3vpn:   CONFIG_IPV6_SEG6_LWTUNNEL
+ip_local_port_range:  CONFIG_MPTCP
+fib_test:    CONFIG_NET_CLS_BASIC
+rtnetlink:   CONFIG_MACSEC, CONFIG_NET_SCH_HTB, CONFIG_XFRM_INTERFACE
+             CONFIG_NET_IPGRE, CONFIG_BONDING
+fib_nexthops: CONFIG_MPLS, CONFIG_MPLS_ROUTING
+vxlan_mdb:   CONFIG_NET_ACT_GACT
+tls:         CONFIG_TLS, CONFIG_CRYPTO_CHACHA20POLY1305
+psample:     CONFIG_PSAMPLE
+fcnal:       CONFIG_TCP_MD5SIG
+
+Try to add them in a semi-alphabetical order.
+
+Fixes: 62199e3f1658 ("selftests: net: Add VXLAN MDB test")
+Fixes: c12e0d5f267d ("self-tests: introduce self-tests for RPS default mask")
+Fixes: 122db5e3634b ("selftests/net: add MPTCP coverage for IP_LOCAL_PORT_RANGE")
+Link: https://lore.kernel.org/r/20240122203528.672004-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/config | 28 ++++++++++++++++++++++++++++
+ 1 file changed, 28 insertions(+)
+
+diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
+index 8da562a9ae87..19ff75051660 100644
+--- a/tools/testing/selftests/net/config
++++ b/tools/testing/selftests/net/config
+@@ -1,5 +1,6 @@
+ CONFIG_USER_NS=y
+ CONFIG_NET_NS=y
++CONFIG_BONDING=m
+ CONFIG_BPF_SYSCALL=y
+ CONFIG_TEST_BPF=m
+ CONFIG_NUMA=y
+@@ -14,9 +15,13 @@ CONFIG_VETH=y
+ CONFIG_NET_IPVTI=y
+ CONFIG_IPV6_VTI=y
+ CONFIG_DUMMY=y
++CONFIG_BRIDGE_VLAN_FILTERING=y
+ CONFIG_BRIDGE=y
++CONFIG_CRYPTO_CHACHA20POLY1305=m
+ CONFIG_VLAN_8021Q=y
+ CONFIG_IFB=y
++CONFIG_INET_DIAG=y
++CONFIG_IP_GRE=m
+ CONFIG_NETFILTER=y
+ CONFIG_NETFILTER_ADVANCED=y
+ CONFIG_NF_CONNTRACK=m
+@@ -25,15 +30,36 @@ CONFIG_IP6_NF_IPTABLES=m
+ CONFIG_IP_NF_IPTABLES=m
+ CONFIG_IP6_NF_NAT=m
+ CONFIG_IP_NF_NAT=m
++CONFIG_IPV6_GRE=m
++CONFIG_IPV6_SEG6_LWTUNNEL=y
++CONFIG_L2TP_ETH=m
++CONFIG_L2TP_IP=m
++CONFIG_L2TP=m
++CONFIG_L2TP_V3=y
++CONFIG_MACSEC=m
++CONFIG_MACVLAN=y
++CONFIG_MACVTAP=y
++CONFIG_MPLS=y
++CONFIG_MPTCP=y
+ CONFIG_NF_TABLES=m
+ CONFIG_NF_TABLES_IPV6=y
+ CONFIG_NF_TABLES_IPV4=y
+ CONFIG_NFT_NAT=m
++CONFIG_NET_ACT_GACT=m
++CONFIG_NET_CLS_BASIC=m
++CONFIG_NET_CLS_U32=m
++CONFIG_NET_IPGRE_DEMUX=m
++CONFIG_NET_IPGRE=m
++CONFIG_NET_SCH_FQ_CODEL=m
++CONFIG_NET_SCH_HTB=m
+ CONFIG_NET_SCH_FQ=m
+ CONFIG_NET_SCH_ETF=m
+ CONFIG_NET_SCH_NETEM=y
++CONFIG_PSAMPLE=m
++CONFIG_TCP_MD5SIG=y
+ CONFIG_TEST_BLACKHOLE_DEV=m
+ CONFIG_KALLSYMS=y
++CONFIG_TLS=m
+ CONFIG_TRACEPOINTS=y
+ CONFIG_NET_DROP_MONITOR=m
+ CONFIG_NETDEVSIM=m
+@@ -48,7 +74,9 @@ CONFIG_BAREUDP=m
+ CONFIG_IPV6_IOAM6_LWTUNNEL=y
+ CONFIG_CRYPTO_SM4_GENERIC=y
+ CONFIG_AMT=m
++CONFIG_TUN=y
+ CONFIG_VXLAN=m
+ CONFIG_IP_SCTP=m
+ CONFIG_NETFILTER_XT_MATCH_POLICY=m
+ CONFIG_CRYPTO_ARIA=y
++CONFIG_XFRM_INTERFACE=m
+-- 
+2.43.0
+
diff --git a/queue-6.6/selftests-net-fix-rps_default_mask-with-32-cpus.patch b/queue-6.6/selftests-net-fix-rps_default_mask-with-32-cpus.patch
new file mode 100644 (file)
index 0000000..e2cdabf
--- /dev/null
@@ -0,0 +1,51 @@
+From 06ca639fba800dcbdc8d7762501231c341aaf82f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 11:58:15 -0800
+Subject: selftests: net: fix rps_default_mask with >32 CPUs
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 0719b5338a0cbe80d1637a5fb03d8141b5bfc7a1 ]
+
+If there is more than 32 cpus the bitmask will start to contain
+commas, leading to:
+
+./rps_default_mask.sh: line 36: [: 00000000,00000000: integer expression expected
+
+Remove the commas, bash doesn't interpret leading zeroes as oct
+so that should be good enough. Switch to bash, Simon reports that
+not all shells support this type of substitution.
+
+Fixes: c12e0d5f267d ("self-tests: introduce self-tests for RPS default mask")
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240122195815.638997-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/rps_default_mask.sh | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/tools/testing/selftests/net/rps_default_mask.sh b/tools/testing/selftests/net/rps_default_mask.sh
+index a26c5624429f..4287a8529890 100755
+--- a/tools/testing/selftests/net/rps_default_mask.sh
++++ b/tools/testing/selftests/net/rps_default_mask.sh
+@@ -1,4 +1,4 @@
+-#!/bin/sh
++#!/bin/bash
+ # SPDX-License-Identifier: GPL-2.0
+ readonly ksft_skip=4
+@@ -33,6 +33,10 @@ chk_rps() {
+       rps_mask=$($cmd /sys/class/net/$dev_name/queues/rx-0/rps_cpus)
+       printf "%-60s" "$msg"
++
++      # In case there is more than 32 CPUs we need to remove commas from masks
++      rps_mask=${rps_mask//,}
++      expected_rps_mask=${expected_rps_mask//,}
+       if [ $rps_mask -eq $expected_rps_mask ]; then
+               echo "[ ok ]"
+       else
+-- 
+2.43.0
+
diff --git a/queue-6.6/selftests-netdevsim-fix-the-udp_tunnel_nic-test.patch b/queue-6.6/selftests-netdevsim-fix-the-udp_tunnel_nic-test.patch
new file mode 100644 (file)
index 0000000..0efde91
--- /dev/null
@@ -0,0 +1,102 @@
+From 59cd40d48593ee5845289d25d185c890c62b8c2d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 22:05:29 -0800
+Subject: selftests: netdevsim: fix the udp_tunnel_nic test
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 0879020a7817e7ce636372c016b4528f541c9f4d ]
+
+This test is missing a whole bunch of checks for interface
+renaming and one ifup. Presumably it was only used on a system
+with renaming disabled and NetworkManager running.
+
+Fixes: 91f430b2c49d ("selftests: net: add a test for UDP tunnel info infra")
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240123060529.1033912-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../selftests/drivers/net/netdevsim/udp_tunnel_nic.sh    | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+diff --git a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh
+index 1b08e042cf94..185b02d2d4cd 100755
+--- a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh
++++ b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh
+@@ -269,6 +269,7 @@ for port in 0 1; do
+       echo 1 > $NSIM_DEV_SYS/new_port
+     fi
+     NSIM_NETDEV=`get_netdev_name old_netdevs`
++    ifconfig $NSIM_NETDEV up
+     msg="new NIC device created"
+     exp0=( 0 0 0 0 )
+@@ -430,6 +431,7 @@ for port in 0 1; do
+     fi
+     echo $port > $NSIM_DEV_SYS/new_port
++    NSIM_NETDEV=`get_netdev_name old_netdevs`
+     ifconfig $NSIM_NETDEV up
+     overflow_table0 "overflow NIC table"
+@@ -487,6 +489,7 @@ for port in 0 1; do
+     fi
+     echo $port > $NSIM_DEV_SYS/new_port
++    NSIM_NETDEV=`get_netdev_name old_netdevs`
+     ifconfig $NSIM_NETDEV up
+     overflow_table0 "overflow NIC table"
+@@ -543,6 +546,7 @@ for port in 0 1; do
+     fi
+     echo $port > $NSIM_DEV_SYS/new_port
++    NSIM_NETDEV=`get_netdev_name old_netdevs`
+     ifconfig $NSIM_NETDEV up
+     overflow_table0 "destroy NIC"
+@@ -572,6 +576,7 @@ for port in 0 1; do
+     fi
+     echo $port > $NSIM_DEV_SYS/new_port
++    NSIM_NETDEV=`get_netdev_name old_netdevs`
+     ifconfig $NSIM_NETDEV up
+     msg="create VxLANs v6"
+@@ -632,6 +637,7 @@ for port in 0 1; do
+     fi
+     echo $port > $NSIM_DEV_SYS/new_port
++    NSIM_NETDEV=`get_netdev_name old_netdevs`
+     ifconfig $NSIM_NETDEV up
+     echo 110 > $NSIM_DEV_DFS/ports/$port/udp_ports_inject_error
+@@ -687,6 +693,7 @@ for port in 0 1; do
+     fi
+     echo $port > $NSIM_DEV_SYS/new_port
++    NSIM_NETDEV=`get_netdev_name old_netdevs`
+     ifconfig $NSIM_NETDEV up
+     msg="create VxLANs v6"
+@@ -746,6 +753,7 @@ for port in 0 1; do
+     fi
+     echo $port > $NSIM_DEV_SYS/new_port
++    NSIM_NETDEV=`get_netdev_name old_netdevs`
+     ifconfig $NSIM_NETDEV up
+     msg="create VxLANs v6"
+@@ -876,6 +884,7 @@ msg="re-add a port"
+ echo 2 > $NSIM_DEV_SYS/del_port
+ echo 2 > $NSIM_DEV_SYS/new_port
++NSIM_NETDEV=`get_netdev_name old_netdevs`
+ check_tables
+ msg="replace VxLAN in overflow table"
+-- 
+2.43.0
+
index f30f2c6324a44993b679e9b1e0c712a006f036a8..44c98c2e35cf2a1c824505d14450b4f5815f0c16 100644 (file)
@@ -164,3 +164,67 @@ revert-drm-amd-enable-pcie-pme-from-d3.patch
 cifs-fix-lock-ordering-while-disabling-multichannel.patch
 cifs-fix-a-pending-undercount-of-srv_count.patch
 cifs-after-disabling-multichannel-mark-tcon-for-reconnect.patch
+sunrpc-use-request-size-to-initialize-bio_vec-in-svc.patch
+wifi-mac80211-fix-potential-sta-link-leak.patch
+btrfs-scrub-avoid-use-after-free-when-chunk-length-i.patch
+net-smc-fix-illegal-rmb_desc-access-in-smc-d-connect.patch
+selftests-bonding-increase-timeout-to-1200s.patch
+tcp-make-sure-init-the-accept_queue-s-spinlocks-once.patch
+bnxt_en-wait-for-flr-to-complete-during-probe.patch
+bnxt_en-prevent-kernel-warning-when-running-offline-.patch
+vlan-skip-nested-type-that-is-not-ifla_vlan_qos_mapp.patch
+llc-make-llc_ui_sendmsg-more-robust-against-bonding-.patch
+llc-drop-support-for-eth_p_tr_802_2.patch
+udp-fix-busy-polling.patch
+net-fix-removing-a-namespace-with-conflicting-altnam.patch
+tun-fix-missing-dropped-counter-in-tun_xdp_act.patch
+tun-add-missing-rx-stats-accounting-in-tun_xdp_act.patch
+net-micrel-fix-ptp-frame-parsing-for-lan8814.patch
+net-rds-fix-ubsan-array-index-out-of-bounds-in-rds_c.patch
+netfs-fscache-prevent-oops-in-fscache_put_cache.patch
+tracing-ensure-visibility-when-inserting-an-element-.patch
+afs-hide-silly-rename-files-from-userspace.patch
+tcp-add-memory-barrier-to-tcp_push.patch
+selftest-don-t-reuse-port-for-so_incoming_cpu-test.patch
+netlink-fix-potential-sleeping-issue-in-mqueue_flush.patch
+ipv6-init-the-accept_queue-s-spinlocks-in-inet6_crea.patch
+selftests-fill-in-some-missing-configs-for-net.patch
+net-sched-flower-fix-chain-template-offload.patch
+net-mlx5e-fix-operation-precedence-bug-in-port-times.patch
+net-mlx5e-fix-peer-flow-lists-handling.patch
+net-mlx5-fix-a-warn-upon-a-callback-command-failure.patch
+net-mlx5-bridge-enable-mcast-in-smfs-steering-mode.patch
+net-mlx5-bridge-fix-multicast-packets-sent-to-uplink.patch
+net-mlx5-dr-use-the-right-gvmi-number-for-drop-actio.patch
+net-mlx5-dr-can-t-go-to-uplink-vport-on-rx-rule.patch
+net-mlx5-use-mlx5-device-constant-for-selecting-cq-p.patch
+net-mlx5e-allow-software-parsing-when-ipsec-crypto-i.patch
+net-mlx5e-ignore-ipsec-replay-window-values-on-sende.patch
+net-mlx5e-fix-a-double-free-in-arfs_create_groups.patch
+net-mlx5e-fix-a-potential-double-free-in-fs_any_crea.patch
+rcu-defer-rcu-kthreads-wakeup-when-cpu-is-dying.patch
+netfilter-nft_limit-reject-configurations-that-cause.patch
+netfilter-nf_tables-restrict-anonymous-set-and-map-n.patch
+netfilter-nf_tables-validate-nfproto_-family.patch
+net-stmmac-wait-a-bit-for-the-reset-to-take-effect.patch
+net-mvpp2-clear-bm-pool-before-initialization.patch
+selftests-net-fix-rps_default_mask-with-32-cpus.patch
+selftests-netdevsim-fix-the-udp_tunnel_nic-test.patch
+xsk-recycle-buffer-in-case-rx-queue-was-full.patch
+xsk-make-xsk_buff_pool-responsible-for-clearing-xdp_.patch
+bpf-propagate-modified-uaddrlen-from-cgroup-sockaddr.patch
+bpf-add-bpf_sock_addr_set_sun_path-to-allow-writing-.patch
+xsk-fix-usage-of-multi-buffer-bpf-helpers-for-zc-xdp.patch
+ice-work-on-pre-xdp-prog-frag-count.patch
+i40e-handle-multi-buffer-packets-that-are-shrunk-by-.patch
+ice-remove-redundant-xdp_rxq_info-registration.patch
+intel-xsk-initialize-skb_frag_t-bv_offset-in-zc-driv.patch
+ice-update-xdp_rxq_info-frag_size-for-zc-enabled-rx-.patch
+xdp-reflect-tail-increase-for-mem_type_xsk_buff_pool.patch
+i40e-set-xdp_rxq_info-frag_size.patch
+i40e-update-xdp_rxq_info-frag_size-for-zc-enabled-rx.patch
+fjes-fix-memleaks-in-fjes_hw_setup.patch
+selftests-bonding-do-not-test-arp-ns-target-with-mod.patch
+net-fec-fix-the-unhandled-context-fault-from-smmu.patch
+tsnep-remove-fcs-for-xdp-data-path.patch
+tsnep-fix-xdp_ring_need_wakeup-for-empty-fill-ring.patch
diff --git a/queue-6.6/sunrpc-use-request-size-to-initialize-bio_vec-in-svc.patch b/queue-6.6/sunrpc-use-request-size-to-initialize-bio_vec-in-svc.patch
new file mode 100644 (file)
index 0000000..6cd6c40
--- /dev/null
@@ -0,0 +1,42 @@
+From 5136798a60d688d5a0b2d542aeb595c24725482d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 22:06:28 +0100
+Subject: SUNRPC: use request size to initialize bio_vec in svc_udp_sendto()
+
+From: Lucas Stach <l.stach@pengutronix.de>
+
+[ Upstream commit 1d9cabe2817edd215779dc9c2fe5e7ab9aac0704 ]
+
+Use the proper size when setting up the bio_vec, as otherwise only
+zero-length UDP packets will be sent.
+
+Fixes: baabf59c2414 ("SUNRPC: Convert svc_udp_sendto() to use the per-socket bio_vec array")
+Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sunrpc/svcsock.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
+index 998687421fa6..e0ce4276274b 100644
+--- a/net/sunrpc/svcsock.c
++++ b/net/sunrpc/svcsock.c
+@@ -717,12 +717,12 @@ static int svc_udp_sendto(struct svc_rqst *rqstp)
+                               ARRAY_SIZE(rqstp->rq_bvec), xdr);
+       iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
+-                    count, 0);
++                    count, rqstp->rq_res.len);
+       err = sock_sendmsg(svsk->sk_sock, &msg);
+       if (err == -ECONNREFUSED) {
+               /* ICMP error on earlier request. */
+               iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
+-                            count, 0);
++                            count, rqstp->rq_res.len);
+               err = sock_sendmsg(svsk->sk_sock, &msg);
+       }
+-- 
+2.43.0
+
diff --git a/queue-6.6/tcp-add-memory-barrier-to-tcp_push.patch b/queue-6.6/tcp-add-memory-barrier-to-tcp_push.patch
new file mode 100644 (file)
index 0000000..9daaced
--- /dev/null
@@ -0,0 +1,101 @@
+From 7c6a8c1eb5c73240018b9b8701947e3e098d71d1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 11:01:33 -0800
+Subject: tcp: Add memory barrier to tcp_push()
+
+From: Salvatore Dipietro <dipiets@amazon.com>
+
+[ Upstream commit 7267e8dcad6b2f9fce05a6a06335d7040acbc2b6 ]
+
+On CPUs with weak memory models, reads and updates performed by tcp_push
+to the sk variables can get reordered leaving the socket throttled when
+it should not. The tasklet running tcp_wfree() may also not observe the
+memory updates in time and will skip flushing any packets throttled by
+tcp_push(), delaying the sending. This can pathologically cause 40ms
+extra latency due to bad interactions with delayed acks.
+
+Adding a memory barrier in tcp_push removes the bug, similarly to the
+previous commit bf06200e732d ("tcp: tsq: fix nonagle handling").
+smp_mb__after_atomic() is used to not incur in unnecessary overhead
+on x86 since not affected.
+
+Patch has been tested using an AWS c7g.2xlarge instance with Ubuntu
+22.04 and Apache Tomcat 9.0.83 running the basic servlet below:
+
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+public class HelloWorldServlet extends HttpServlet {
+    @Override
+    protected void doGet(HttpServletRequest request, HttpServletResponse response)
+      throws ServletException, IOException {
+        response.setContentType("text/html;charset=utf-8");
+        OutputStreamWriter osw = new OutputStreamWriter(response.getOutputStream(),"UTF-8");
+        String s = "a".repeat(3096);
+        osw.write(s,0,s.length());
+        osw.flush();
+    }
+}
+
+Load was applied using wrk2 (https://github.com/kinvolk/wrk2) from an AWS
+c6i.8xlarge instance. Before the patch an additional 40ms latency from P99.99+
+values is observed while, with the patch, the extra latency disappears.
+
+No patch and tcp_autocorking=1
+./wrk -t32 -c128 -d40s --latency -R10000  http://172.31.60.173:8080/hello/hello
+  ...
+ 50.000%    0.91ms
+ 75.000%    1.13ms
+ 90.000%    1.46ms
+ 99.000%    1.74ms
+ 99.900%    1.89ms
+ 99.990%   41.95ms  <<< 40+ ms extra latency
+ 99.999%   48.32ms
+100.000%   48.96ms
+
+With patch and tcp_autocorking=1
+./wrk -t32 -c128 -d40s --latency -R10000  http://172.31.60.173:8080/hello/hello
+  ...
+ 50.000%    0.90ms
+ 75.000%    1.13ms
+ 90.000%    1.45ms
+ 99.000%    1.72ms
+ 99.900%    1.83ms
+ 99.990%    2.11ms  <<< no 40+ ms extra latency
+ 99.999%    2.53ms
+100.000%    2.62ms
+
+Patch has been also tested on x86 (m7i.2xlarge instance) which it is not
+affected by this issue and the patch doesn't introduce any additional
+delay.
+
+Fixes: 7aa5470c2c09 ("tcp: tsq: move tsq_flags close to sk_wmem_alloc")
+Signed-off-by: Salvatore Dipietro <dipiets@amazon.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20240119190133.43698-1-dipiets@amazon.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/tcp.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
+index f124f6c63915..fb417aee86e6 100644
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -722,6 +722,7 @@ void tcp_push(struct sock *sk, int flags, int mss_now,
+               if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
+                       NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
+                       set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
++                      smp_mb__after_atomic();
+               }
+               /* It is possible TX completion already happened
+                * before we set TSQ_THROTTLED.
+-- 
+2.43.0
+
diff --git a/queue-6.6/tcp-make-sure-init-the-accept_queue-s-spinlocks-once.patch b/queue-6.6/tcp-make-sure-init-the-accept_queue-s-spinlocks-once.patch
new file mode 100644 (file)
index 0000000..f8ad494
--- /dev/null
@@ -0,0 +1,170 @@
+From b9cc1f57bc1b392c35cd428d5ca29d92a9e28fd8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 09:20:19 +0800
+Subject: tcp: make sure init the accept_queue's spinlocks once
+
+From: Zhengchao Shao <shaozhengchao@huawei.com>
+
+[ Upstream commit 198bc90e0e734e5f98c3d2833e8390cac3df61b2 ]
+
+When I run syz's reproduction C program locally, it causes the following
+issue:
+pvqspinlock: lock 0xffff9d181cd5c660 has corrupted value 0x0!
+WARNING: CPU: 19 PID: 21160 at __pv_queued_spin_unlock_slowpath (kernel/locking/qspinlock_paravirt.h:508)
+Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
+RIP: 0010:__pv_queued_spin_unlock_slowpath (kernel/locking/qspinlock_paravirt.h:508)
+Code: 73 56 3a ff 90 c3 cc cc cc cc 8b 05 bb 1f 48 01 85 c0 74 05 c3 cc cc cc cc 8b 17 48 89 fe 48 c7 c7
+30 20 ce 8f e8 ad 56 42 ff <0f> 0b c3 cc cc cc cc 0f 0b 0f 1f 40 00 90 90 90 90 90 90 90 90 90
+RSP: 0018:ffffa8d200604cb8 EFLAGS: 00010282
+RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff9d1ef60e0908
+RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff9d1ef60e0900
+RBP: ffff9d181cd5c280 R08: 0000000000000000 R09: 00000000ffff7fff
+R10: ffffa8d200604b68 R11: ffffffff907dcdc8 R12: 0000000000000000
+R13: ffff9d181cd5c660 R14: ffff9d1813a3f330 R15: 0000000000001000
+FS:  00007fa110184640(0000) GS:ffff9d1ef60c0000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 0000000020000000 CR3: 000000011f65e000 CR4: 00000000000006f0
+Call Trace:
+<IRQ>
+  _raw_spin_unlock (kernel/locking/spinlock.c:186)
+  inet_csk_reqsk_queue_add (net/ipv4/inet_connection_sock.c:1321)
+  inet_csk_complete_hashdance (net/ipv4/inet_connection_sock.c:1358)
+  tcp_check_req (net/ipv4/tcp_minisocks.c:868)
+  tcp_v4_rcv (net/ipv4/tcp_ipv4.c:2260)
+  ip_protocol_deliver_rcu (net/ipv4/ip_input.c:205)
+  ip_local_deliver_finish (net/ipv4/ip_input.c:234)
+  __netif_receive_skb_one_core (net/core/dev.c:5529)
+  process_backlog (./include/linux/rcupdate.h:779)
+  __napi_poll (net/core/dev.c:6533)
+  net_rx_action (net/core/dev.c:6604)
+  __do_softirq (./arch/x86/include/asm/jump_label.h:27)
+  do_softirq (kernel/softirq.c:454 kernel/softirq.c:441)
+</IRQ>
+<TASK>
+  __local_bh_enable_ip (kernel/softirq.c:381)
+  __dev_queue_xmit (net/core/dev.c:4374)
+  ip_finish_output2 (./include/net/neighbour.h:540 net/ipv4/ip_output.c:235)
+  __ip_queue_xmit (net/ipv4/ip_output.c:535)
+  __tcp_transmit_skb (net/ipv4/tcp_output.c:1462)
+  tcp_rcv_synsent_state_process (net/ipv4/tcp_input.c:6469)
+  tcp_rcv_state_process (net/ipv4/tcp_input.c:6657)
+  tcp_v4_do_rcv (net/ipv4/tcp_ipv4.c:1929)
+  __release_sock (./include/net/sock.h:1121 net/core/sock.c:2968)
+  release_sock (net/core/sock.c:3536)
+  inet_wait_for_connect (net/ipv4/af_inet.c:609)
+  __inet_stream_connect (net/ipv4/af_inet.c:702)
+  inet_stream_connect (net/ipv4/af_inet.c:748)
+  __sys_connect (./include/linux/file.h:45 net/socket.c:2064)
+  __x64_sys_connect (net/socket.c:2073 net/socket.c:2070 net/socket.c:2070)
+  do_syscall_64 (arch/x86/entry/common.c:51 arch/x86/entry/common.c:82)
+  entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129)
+  RIP: 0033:0x7fa10ff05a3d
+  Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89
+  c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ab a3 0e 00 f7 d8 64 89 01 48
+  RSP: 002b:00007fa110183de8 EFLAGS: 00000202 ORIG_RAX: 000000000000002a
+  RAX: ffffffffffffffda RBX: 0000000020000054 RCX: 00007fa10ff05a3d
+  RDX: 000000000000001c RSI: 0000000020000040 RDI: 0000000000000003
+  RBP: 00007fa110183e20 R08: 0000000000000000 R09: 0000000000000000
+  R10: 0000000000000000 R11: 0000000000000202 R12: 00007fa110184640
+  R13: 0000000000000000 R14: 00007fa10fe8b060 R15: 00007fff73e23b20
+</TASK>
+
+The issue triggering process is analyzed as follows:
+Thread A                                       Thread B
+tcp_v4_rcv     //receive ack TCP packet       inet_shutdown
+  tcp_check_req                                  tcp_disconnect //disconnect sock
+  ...                                              tcp_set_state(sk, TCP_CLOSE)
+    inet_csk_complete_hashdance                ...
+      inet_csk_reqsk_queue_add                 inet_listen  //start listen
+        spin_lock(&queue->rskq_lock)             inet_csk_listen_start
+        ...                                        reqsk_queue_alloc
+        ...                                          spin_lock_init
+        spin_unlock(&queue->rskq_lock) //warning
+
+When the socket receives the ACK packet during the three-way handshake,
+it will hold spinlock. And then the user actively shutdowns the socket
+and listens to the socket immediately, the spinlock will be initialized.
+When the socket is going to release the spinlock, a warning is generated.
+Also the same issue to fastopenq.lock.
+
+Move init spinlock to inet_create and inet_accept to make sure init the
+accept_queue's spinlocks once.
+
+Fixes: fff1f3001cc5 ("tcp: add a spinlock to protect struct request_sock_queue")
+Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path")
+Reported-by: Ming Shu <sming56@aliyun.com>
+Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20240118012019.1751966-1-shaozhengchao@huawei.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/inet_connection_sock.h | 8 ++++++++
+ net/core/request_sock.c            | 3 ---
+ net/ipv4/af_inet.c                 | 3 +++
+ net/ipv4/inet_connection_sock.c    | 4 ++++
+ 4 files changed, 15 insertions(+), 3 deletions(-)
+
+diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
+index 5d2fcc137b88..01a73bf74fa1 100644
+--- a/include/net/inet_connection_sock.h
++++ b/include/net/inet_connection_sock.h
+@@ -347,4 +347,12 @@ static inline bool inet_csk_has_ulp(const struct sock *sk)
+       return inet_test_bit(IS_ICSK, sk) && !!inet_csk(sk)->icsk_ulp_ops;
+ }
++static inline void inet_init_csk_locks(struct sock *sk)
++{
++      struct inet_connection_sock *icsk = inet_csk(sk);
++
++      spin_lock_init(&icsk->icsk_accept_queue.rskq_lock);
++      spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock);
++}
++
+ #endif /* _INET_CONNECTION_SOCK_H */
+diff --git a/net/core/request_sock.c b/net/core/request_sock.c
+index f35c2e998406..63de5c635842 100644
+--- a/net/core/request_sock.c
++++ b/net/core/request_sock.c
+@@ -33,9 +33,6 @@
+ void reqsk_queue_alloc(struct request_sock_queue *queue)
+ {
+-      spin_lock_init(&queue->rskq_lock);
+-
+-      spin_lock_init(&queue->fastopenq.lock);
+       queue->fastopenq.rskq_rst_head = NULL;
+       queue->fastopenq.rskq_rst_tail = NULL;
+       queue->fastopenq.qlen = 0;
+diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
+index b0a5de1303b5..b739ddbef0f0 100644
+--- a/net/ipv4/af_inet.c
++++ b/net/ipv4/af_inet.c
+@@ -330,6 +330,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
+       if (INET_PROTOSW_REUSE & answer_flags)
+               sk->sk_reuse = SK_CAN_REUSE;
++      if (INET_PROTOSW_ICSK & answer_flags)
++              inet_init_csk_locks(sk);
++
+       inet = inet_sk(sk);
+       inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags);
+diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
+index 394a498c2823..762817d6c8d7 100644
+--- a/net/ipv4/inet_connection_sock.c
++++ b/net/ipv4/inet_connection_sock.c
+@@ -730,6 +730,10 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
+       }
+       if (req)
+               reqsk_put(req);
++
++      if (newsk)
++              inet_init_csk_locks(newsk);
++
+       return newsk;
+ out_err:
+       newsk = NULL;
+-- 
+2.43.0
+
diff --git a/queue-6.6/tracing-ensure-visibility-when-inserting-an-element-.patch b/queue-6.6/tracing-ensure-visibility-when-inserting-an-element-.patch
new file mode 100644 (file)
index 0000000..d833d02
--- /dev/null
@@ -0,0 +1,129 @@
+From 72a7126c4a10804959674d8718962280a7d696ad Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 16:09:28 +0100
+Subject: tracing: Ensure visibility when inserting an element into tracing_map
+
+From: Petr Pavlu <petr.pavlu@suse.com>
+
+[ Upstream commit 2b44760609e9eaafc9d234a6883d042fc21132a7 ]
+
+Running the following two commands in parallel on a multi-processor
+AArch64 machine can sporadically produce an unexpected warning about
+duplicate histogram entries:
+
+ $ while true; do
+     echo hist:key=id.syscall:val=hitcount > \
+       /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
+     cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
+     sleep 0.001
+   done
+ $ stress-ng --sysbadaddr $(nproc)
+
+The warning looks as follows:
+
+[ 2911.172474] ------------[ cut here ]------------
+[ 2911.173111] Duplicates detected: 1
+[ 2911.173574] WARNING: CPU: 2 PID: 12247 at kernel/trace/tracing_map.c:983 tracing_map_sort_entries+0x3e0/0x408
+[ 2911.174702] Modules linked in: iscsi_ibft(E) iscsi_boot_sysfs(E) rfkill(E) af_packet(E) nls_iso8859_1(E) nls_cp437(E) vfat(E) fat(E) ena(E) tiny_power_button(E) qemu_fw_cfg(E) button(E) fuse(E) efi_pstore(E) ip_tables(E) x_tables(E) xfs(E) libcrc32c(E) aes_ce_blk(E) aes_ce_cipher(E) crct10dif_ce(E) polyval_ce(E) polyval_generic(E) ghash_ce(E) gf128mul(E) sm4_ce_gcm(E) sm4_ce_ccm(E) sm4_ce(E) sm4_ce_cipher(E) sm4(E) sm3_ce(E) sm3(E) sha3_ce(E) sha512_ce(E) sha512_arm64(E) sha2_ce(E) sha256_arm64(E) nvme(E) sha1_ce(E) nvme_core(E) nvme_auth(E) t10_pi(E) sg(E) scsi_mod(E) scsi_common(E) efivarfs(E)
+[ 2911.174738] Unloaded tainted modules: cppc_cpufreq(E):1
+[ 2911.180985] CPU: 2 PID: 12247 Comm: cat Kdump: loaded Tainted: G            E      6.7.0-default #2 1b58bbb22c97e4399dc09f92d309344f69c44a01
+[ 2911.182398] Hardware name: Amazon EC2 c7g.8xlarge/, BIOS 1.0 11/1/2018
+[ 2911.183208] pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
+[ 2911.184038] pc : tracing_map_sort_entries+0x3e0/0x408
+[ 2911.184667] lr : tracing_map_sort_entries+0x3e0/0x408
+[ 2911.185310] sp : ffff8000a1513900
+[ 2911.185750] x29: ffff8000a1513900 x28: ffff0003f272fe80 x27: 0000000000000001
+[ 2911.186600] x26: ffff0003f272fe80 x25: 0000000000000030 x24: 0000000000000008
+[ 2911.187458] x23: ffff0003c5788000 x22: ffff0003c16710c8 x21: ffff80008017f180
+[ 2911.188310] x20: ffff80008017f000 x19: ffff80008017f180 x18: ffffffffffffffff
+[ 2911.189160] x17: 0000000000000000 x16: 0000000000000000 x15: ffff8000a15134b8
+[ 2911.190015] x14: 0000000000000000 x13: 205d373432323154 x12: 5b5d313131333731
+[ 2911.190844] x11: 00000000fffeffff x10: 00000000fffeffff x9 : ffffd1b78274a13c
+[ 2911.191716] x8 : 000000000017ffe8 x7 : c0000000fffeffff x6 : 000000000057ffa8
+[ 2911.192554] x5 : ffff0012f6c24ec0 x4 : 0000000000000000 x3 : ffff2e5b72b5d000
+[ 2911.193404] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0003ff254480
+[ 2911.194259] Call trace:
+[ 2911.194626]  tracing_map_sort_entries+0x3e0/0x408
+[ 2911.195220]  hist_show+0x124/0x800
+[ 2911.195692]  seq_read_iter+0x1d4/0x4e8
+[ 2911.196193]  seq_read+0xe8/0x138
+[ 2911.196638]  vfs_read+0xc8/0x300
+[ 2911.197078]  ksys_read+0x70/0x108
+[ 2911.197534]  __arm64_sys_read+0x24/0x38
+[ 2911.198046]  invoke_syscall+0x78/0x108
+[ 2911.198553]  el0_svc_common.constprop.0+0xd0/0xf8
+[ 2911.199157]  do_el0_svc+0x28/0x40
+[ 2911.199613]  el0_svc+0x40/0x178
+[ 2911.200048]  el0t_64_sync_handler+0x13c/0x158
+[ 2911.200621]  el0t_64_sync+0x1a8/0x1b0
+[ 2911.201115] ---[ end trace 0000000000000000 ]---
+
+The problem appears to be caused by CPU reordering of writes issued from
+__tracing_map_insert().
+
+The check for the presence of an element with a given key in this
+function is:
+
+ val = READ_ONCE(entry->val);
+ if (val && keys_match(key, val->key, map->key_size)) ...
+
+The write of a new entry is:
+
+ elt = get_free_elt(map);
+ memcpy(elt->key, key, map->key_size);
+ entry->val = elt;
+
+The "memcpy(elt->key, key, map->key_size);" and "entry->val = elt;"
+stores may become visible in the reversed order on another CPU. This
+second CPU might then incorrectly determine that a new key doesn't match
+an already present val->key and subsequently insert a new element,
+resulting in a duplicate.
+
+Fix the problem by adding a write barrier between
+"memcpy(elt->key, key, map->key_size);" and "entry->val = elt;", and for
+good measure, also use WRITE_ONCE(entry->val, elt) for publishing the
+element. The sequence pairs with the mentioned "READ_ONCE(entry->val);"
+and the "val->key" check which has an address dependency.
+
+The barrier is placed on a path executed when adding an element for
+a new key. Subsequent updates targeting the same key remain unaffected.
+
+From the user's perspective, the issue was introduced by commit
+c193707dde77 ("tracing: Remove code which merges duplicates"), which
+followed commit cbf4100efb8f ("tracing: Add support to detect and avoid
+duplicates"). The previous code operated differently; it inherently
+expected potential races which result in duplicates but merged them
+later when they occurred.
+
+Link: https://lore.kernel.org/linux-trace-kernel/20240122150928.27725-1-petr.pavlu@suse.com
+
+Fixes: c193707dde77 ("tracing: Remove code which merges duplicates")
+Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
+Acked-by: Tom Zanussi <tom.zanussi@linux.intel.com>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/trace/tracing_map.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
+index c774e560f2f9..a4dcf0f24352 100644
+--- a/kernel/trace/tracing_map.c
++++ b/kernel/trace/tracing_map.c
+@@ -574,7 +574,12 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
+                               }
+                               memcpy(elt->key, key, map->key_size);
+-                              entry->val = elt;
++                              /*
++                               * Ensure the initialization is visible and
++                               * publish the elt.
++                               */
++                              smp_wmb();
++                              WRITE_ONCE(entry->val, elt);
+                               atomic64_inc(&map->hits);
+                               return entry->val;
+-- 
+2.43.0
+
diff --git a/queue-6.6/tsnep-fix-xdp_ring_need_wakeup-for-empty-fill-ring.patch b/queue-6.6/tsnep-fix-xdp_ring_need_wakeup-for-empty-fill-ring.patch
new file mode 100644 (file)
index 0000000..ada3524
--- /dev/null
@@ -0,0 +1,52 @@
+From 43d7a55fe424e183b51503d870bf52d919e36ee3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 21:09:18 +0100
+Subject: tsnep: Fix XDP_RING_NEED_WAKEUP for empty fill ring
+
+From: Gerhard Engleder <gerhard@engleder-embedded.com>
+
+[ Upstream commit 9a91c05f4bd6f6bdd6b8f90445e0da92e3ac956c ]
+
+The fill ring of the XDP socket may contain not enough buffers to
+completey fill the RX queue during socket creation. In this case the
+flag XDP_RING_NEED_WAKEUP is not set as this flag is only set if the RX
+queue is not completely filled during polling.
+
+Set XDP_RING_NEED_WAKEUP flag also if RX queue is not completely filled
+during XDP socket creation.
+
+Fixes: 3fc2333933fd ("tsnep: Add XDP socket zero-copy RX support")
+Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/engleder/tsnep_main.c | 13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
+index 9fea97671f4b..08e113e785a7 100644
+--- a/drivers/net/ethernet/engleder/tsnep_main.c
++++ b/drivers/net/ethernet/engleder/tsnep_main.c
+@@ -1711,6 +1711,19 @@ static void tsnep_rx_reopen_xsk(struct tsnep_rx *rx)
+                       allocated--;
+               }
+       }
++
++      /* set need wakeup flag immediately if ring is not filled completely,
++       * first polling would be too late as need wakeup signalisation would
++       * be delayed for an indefinite time
++       */
++      if (xsk_uses_need_wakeup(rx->xsk_pool)) {
++              int desc_available = tsnep_rx_desc_available(rx);
++
++              if (desc_available)
++                      xsk_set_rx_need_wakeup(rx->xsk_pool);
++              else
++                      xsk_clear_rx_need_wakeup(rx->xsk_pool);
++      }
+ }
+ static bool tsnep_pending(struct tsnep_queue *queue)
+-- 
+2.43.0
+
diff --git a/queue-6.6/tsnep-remove-fcs-for-xdp-data-path.patch b/queue-6.6/tsnep-remove-fcs-for-xdp-data-path.patch
new file mode 100644 (file)
index 0000000..13b52f5
--- /dev/null
@@ -0,0 +1,49 @@
+From bb256d5a97bae3dcb46d7b662391f874fe788a98 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 21:09:17 +0100
+Subject: tsnep: Remove FCS for XDP data path
+
+From: Gerhard Engleder <gerhard@engleder-embedded.com>
+
+[ Upstream commit 50bad6f797d4d501c5ef416a6f92e1912ab5aa8b ]
+
+The RX data buffer includes the FCS. The FCS is already stripped for the
+normal data path. But for the XDP data path the FCS is included and
+acts like additional/useless data.
+
+Remove the FCS from the RX data buffer also for XDP.
+
+Fixes: 65b28c810035 ("tsnep: Add XDP RX support")
+Fixes: 3fc2333933fd ("tsnep: Add XDP socket zero-copy RX support")
+Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/engleder/tsnep_main.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
+index 38da2d6c250e..9fea97671f4b 100644
+--- a/drivers/net/ethernet/engleder/tsnep_main.c
++++ b/drivers/net/ethernet/engleder/tsnep_main.c
+@@ -1434,7 +1434,7 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
+                       xdp_prepare_buff(&xdp, page_address(entry->page),
+                                        XDP_PACKET_HEADROOM + TSNEP_RX_INLINE_METADATA_SIZE,
+-                                       length, false);
++                                       length - ETH_FCS_LEN, false);
+                       consume = tsnep_xdp_run_prog(rx, prog, &xdp,
+                                                    &xdp_status, tx_nq, tx);
+@@ -1517,7 +1517,7 @@ static int tsnep_rx_poll_zc(struct tsnep_rx *rx, struct napi_struct *napi,
+               prefetch(entry->xdp->data);
+               length = __le32_to_cpu(entry->desc_wb->properties) &
+                        TSNEP_DESC_LENGTH_MASK;
+-              xsk_buff_set_size(entry->xdp, length);
++              xsk_buff_set_size(entry->xdp, length - ETH_FCS_LEN);
+               xsk_buff_dma_sync_for_cpu(entry->xdp, rx->xsk_pool);
+               /* RX metadata with timestamps is in front of actual data,
+-- 
+2.43.0
+
diff --git a/queue-6.6/tun-add-missing-rx-stats-accounting-in-tun_xdp_act.patch b/queue-6.6/tun-add-missing-rx-stats-accounting-in-tun_xdp_act.patch
new file mode 100644 (file)
index 0000000..a57aa53
--- /dev/null
@@ -0,0 +1,49 @@
+From f5d4f981502d41e1176c57c6fd78b6125f849ff7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 18:22:56 +0800
+Subject: tun: add missing rx stats accounting in tun_xdp_act
+
+From: Yunjian Wang <wangyunjian@huawei.com>
+
+[ Upstream commit f1084c427f55d573fcd5688d9ba7b31b78019716 ]
+
+The TUN can be used as vhost-net backend, and it is necessary to
+count the packets transmitted from TUN to vhost-net/virtio-net.
+However, there are some places in the receive path that were not
+taken into account when using XDP. It would be beneficial to also
+include new accounting for successfully received bytes using
+dev_sw_netstats_rx_add.
+
+Fixes: 761876c857cb ("tap: XDP support")
+Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Acked-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/tun.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/net/tun.c b/drivers/net/tun.c
+index 237fef557ba5..4a4f8c8e79fa 100644
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -1634,6 +1634,7 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
+                       dev_core_stats_rx_dropped_inc(tun->dev);
+                       return err;
+               }
++              dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data);
+               break;
+       case XDP_TX:
+               err = tun_xdp_tx(tun->dev, xdp);
+@@ -1641,6 +1642,7 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
+                       dev_core_stats_rx_dropped_inc(tun->dev);
+                       return err;
+               }
++              dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data);
+               break;
+       case XDP_PASS:
+               break;
+-- 
+2.43.0
+
diff --git a/queue-6.6/tun-fix-missing-dropped-counter-in-tun_xdp_act.patch b/queue-6.6/tun-fix-missing-dropped-counter-in-tun_xdp_act.patch
new file mode 100644 (file)
index 0000000..7201026
--- /dev/null
@@ -0,0 +1,52 @@
+From ba746249f1664840569f703c743616bbc33f83ed Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 18:22:35 +0800
+Subject: tun: fix missing dropped counter in tun_xdp_act
+
+From: Yunjian Wang <wangyunjian@huawei.com>
+
+[ Upstream commit 5744ba05e7c4bff8fec133dd0f9e51ddffba92f5 ]
+
+The commit 8ae1aff0b331 ("tuntap: split out XDP logic") includes
+dropped counter for XDP_DROP, XDP_ABORTED, and invalid XDP actions.
+Unfortunately, that commit missed the dropped counter when error
+occurs during XDP_TX and XDP_REDIRECT actions. This patch fixes
+this issue.
+
+Fixes: 8ae1aff0b331 ("tuntap: split out XDP logic")
+Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Acked-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/tun.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/tun.c b/drivers/net/tun.c
+index afa5497f7c35..237fef557ba5 100644
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -1630,13 +1630,17 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
+       switch (act) {
+       case XDP_REDIRECT:
+               err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
+-              if (err)
++              if (err) {
++                      dev_core_stats_rx_dropped_inc(tun->dev);
+                       return err;
++              }
+               break;
+       case XDP_TX:
+               err = tun_xdp_tx(tun->dev, xdp);
+-              if (err < 0)
++              if (err < 0) {
++                      dev_core_stats_rx_dropped_inc(tun->dev);
+                       return err;
++              }
+               break;
+       case XDP_PASS:
+               break;
+-- 
+2.43.0
+
diff --git a/queue-6.6/udp-fix-busy-polling.patch b/queue-6.6/udp-fix-busy-polling.patch
new file mode 100644 (file)
index 0000000..51ad36b
--- /dev/null
@@ -0,0 +1,134 @@
+From ace3a097e86597bd5f6569778b648ae2dc67ca42 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 20:17:49 +0000
+Subject: udp: fix busy polling
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit a54d51fb2dfb846aedf3751af501e9688db447f5 ]
+
+Generic sk_busy_loop_end() only looks at sk->sk_receive_queue
+for presence of packets.
+
+Problem is that for UDP sockets after blamed commit, some packets
+could be present in another queue: udp_sk(sk)->reader_queue
+
+In some cases, a busy poller could spin until timeout expiration,
+even if some packets are available in udp_sk(sk)->reader_queue.
+
+v3: - make sk_busy_loop_end() nicer (Willem)
+
+v2: - add a READ_ONCE(sk->sk_family) in sk_is_inet() to avoid KCSAN splats.
+    - add a sk_is_inet() check in sk_is_udp() (Willem feedback)
+    - add a sk_is_inet() check in sk_is_tcp().
+
+Fixes: 2276f58ac589 ("udp: use a separate rx queue for packet reception")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/skmsg.h   |  6 ------
+ include/net/inet_sock.h |  5 -----
+ include/net/sock.h      | 18 +++++++++++++++++-
+ net/core/sock.c         | 11 +++++++++--
+ 4 files changed, 26 insertions(+), 14 deletions(-)
+
+diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
+index c953b8c0d2f4..bd4418377bac 100644
+--- a/include/linux/skmsg.h
++++ b/include/linux/skmsg.h
+@@ -500,12 +500,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
+       return !!psock->saved_data_ready;
+ }
+-static inline bool sk_is_udp(const struct sock *sk)
+-{
+-      return sk->sk_type == SOCK_DGRAM &&
+-             sk->sk_protocol == IPPROTO_UDP;
+-}
+-
+ #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
+ #define BPF_F_STRPARSER       (1UL << 1)
+diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
+index 2de0e4d4a027..2790ba58ffe5 100644
+--- a/include/net/inet_sock.h
++++ b/include/net/inet_sock.h
+@@ -301,11 +301,6 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
+ #define inet_assign_bit(nr, sk, val)          \
+       assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val)
+-static inline bool sk_is_inet(struct sock *sk)
+-{
+-      return sk->sk_family == AF_INET || sk->sk_family == AF_INET6;
+-}
+-
+ /**
+  * sk_to_full_sk - Access to a full socket
+  * @sk: pointer to a socket
+diff --git a/include/net/sock.h b/include/net/sock.h
+index 70a771d96467..e70c903b04f3 100644
+--- a/include/net/sock.h
++++ b/include/net/sock.h
+@@ -2793,9 +2793,25 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
+                          &skb_shinfo(skb)->tskey);
+ }
++static inline bool sk_is_inet(const struct sock *sk)
++{
++      int family = READ_ONCE(sk->sk_family);
++
++      return family == AF_INET || family == AF_INET6;
++}
++
+ static inline bool sk_is_tcp(const struct sock *sk)
+ {
+-      return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP;
++      return sk_is_inet(sk) &&
++             sk->sk_type == SOCK_STREAM &&
++             sk->sk_protocol == IPPROTO_TCP;
++}
++
++static inline bool sk_is_udp(const struct sock *sk)
++{
++      return sk_is_inet(sk) &&
++             sk->sk_type == SOCK_DGRAM &&
++             sk->sk_protocol == IPPROTO_UDP;
+ }
+ static inline bool sk_is_stream_unix(const struct sock *sk)
+diff --git a/net/core/sock.c b/net/core/sock.c
+index 5cd21e699f2d..383e30fe79f4 100644
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -107,6 +107,7 @@
+ #include <linux/interrupt.h>
+ #include <linux/poll.h>
+ #include <linux/tcp.h>
++#include <linux/udp.h>
+ #include <linux/init.h>
+ #include <linux/highmem.h>
+ #include <linux/user_namespace.h>
+@@ -4136,8 +4137,14 @@ bool sk_busy_loop_end(void *p, unsigned long start_time)
+ {
+       struct sock *sk = p;
+-      return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
+-             sk_busy_loop_timeout(sk, start_time);
++      if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
++              return true;
++
++      if (sk_is_udp(sk) &&
++          !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
++              return true;
++
++      return sk_busy_loop_timeout(sk, start_time);
+ }
+ EXPORT_SYMBOL(sk_busy_loop_end);
+ #endif /* CONFIG_NET_RX_BUSY_POLL */
+-- 
+2.43.0
+
diff --git a/queue-6.6/vlan-skip-nested-type-that-is-not-ifla_vlan_qos_mapp.patch b/queue-6.6/vlan-skip-nested-type-that-is-not-ifla_vlan_qos_mapp.patch
new file mode 100644 (file)
index 0000000..ee7b2bf
--- /dev/null
@@ -0,0 +1,58 @@
+From e7753228b98d84fa2b26786a7330c8ebca54b5e5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 21:03:06 +0800
+Subject: vlan: skip nested type that is not IFLA_VLAN_QOS_MAPPING
+
+From: Lin Ma <linma@zju.edu.cn>
+
+[ Upstream commit 6c21660fe221a15c789dee2bc2fd95516bc5aeaf ]
+
+In the vlan_changelink function, a loop is used to parse the nested
+attributes IFLA_VLAN_EGRESS_QOS and IFLA_VLAN_INGRESS_QOS in order to
+obtain the struct ifla_vlan_qos_mapping. These two nested attributes are
+checked in the vlan_validate_qos_map function, which calls
+nla_validate_nested_deprecated with the vlan_map_policy.
+
+However, this deprecated validator applies a LIBERAL strictness, allowing
+the presence of an attribute with the type IFLA_VLAN_QOS_UNSPEC.
+Consequently, the loop in vlan_changelink may parse an attribute of type
+IFLA_VLAN_QOS_UNSPEC and believe it carries a payload of
+struct ifla_vlan_qos_mapping, which is not necessarily true.
+
+To address this issue and ensure compatibility, this patch introduces two
+type checks that skip attributes whose type is not IFLA_VLAN_QOS_MAPPING.
+
+Fixes: 07b5b17e157b ("[VLAN]: Use rtnl_link API")
+Signed-off-by: Lin Ma <linma@zju.edu.cn>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240118130306.1644001-1-linma@zju.edu.cn
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/8021q/vlan_netlink.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
+index 214532173536..a3b68243fd4b 100644
+--- a/net/8021q/vlan_netlink.c
++++ b/net/8021q/vlan_netlink.c
+@@ -118,12 +118,16 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[],
+       }
+       if (data[IFLA_VLAN_INGRESS_QOS]) {
+               nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) {
++                      if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING)
++                              continue;
+                       m = nla_data(attr);
+                       vlan_dev_set_ingress_priority(dev, m->to, m->from);
+               }
+       }
+       if (data[IFLA_VLAN_EGRESS_QOS]) {
+               nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) {
++                      if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING)
++                              continue;
+                       m = nla_data(attr);
+                       err = vlan_dev_set_egress_priority(dev, m->from, m->to);
+                       if (err)
+-- 
+2.43.0
+
diff --git a/queue-6.6/wifi-mac80211-fix-potential-sta-link-leak.patch b/queue-6.6/wifi-mac80211-fix-potential-sta-link-leak.patch
new file mode 100644 (file)
index 0000000..6ed5cfc
--- /dev/null
@@ -0,0 +1,44 @@
+From 65daa8e237218112e916d1176649db6695ae27b0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 11 Jan 2024 18:17:44 +0200
+Subject: wifi: mac80211: fix potential sta-link leak
+
+From: Johannes Berg <johannes.berg@intel.com>
+
+[ Upstream commit b01a74b3ca6fd51b62c67733ba7c3280fa6c5d26 ]
+
+When a station is allocated, links are added but not
+set to valid yet (e.g. during connection to an AP MLD),
+we might remove the station without ever marking links
+valid, and leak them. Fix that.
+
+Fixes: cb71f1d136a6 ("wifi: mac80211: add sta link addition/removal")
+Signed-off-by: Johannes Berg <johannes.berg@intel.com>
+Reviewed-by: Ilan Peer <ilan.peer@intel.com>
+Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
+Link: https://msgid.link/20240111181514.6573998beaf8.I09ac2e1d41c80f82a5a616b8bd1d9d8dd709a6a6@changeid
+Signed-off-by: Johannes Berg <johannes.berg@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/mac80211/sta_info.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
+index 0c5cc75857e4..e112300caaf7 100644
+--- a/net/mac80211/sta_info.c
++++ b/net/mac80211/sta_info.c
+@@ -398,7 +398,10 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
+       int i;
+       for (i = 0; i < ARRAY_SIZE(sta->link); i++) {
+-              if (!(sta->sta.valid_links & BIT(i)))
++              struct link_sta_info *link_sta;
++
++              link_sta = rcu_access_pointer(sta->link[i]);
++              if (!link_sta)
+                       continue;
+               sta_remove_link(sta, i, false);
+-- 
+2.43.0
+
diff --git a/queue-6.6/xdp-reflect-tail-increase-for-mem_type_xsk_buff_pool.patch b/queue-6.6/xdp-reflect-tail-increase-for-mem_type_xsk_buff_pool.patch
new file mode 100644 (file)
index 0000000..5f2ed2c
--- /dev/null
@@ -0,0 +1,42 @@
+From 87e3f813265f9528c9b11271cb2531f4952a8a26 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:16:00 +0100
+Subject: xdp: reflect tail increase for MEM_TYPE_XSK_BUFF_POOL
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit fbadd83a612c3b7aad2987893faca6bd24aaebb3 ]
+
+XSK ZC Rx path calculates the size of data that will be posted to XSK Rx
+queue via subtracting xdp_buff::data_end from xdp_buff::data.
+
+In bpf_xdp_frags_increase_tail(), when underlying memory type of
+xdp_rxq_info is MEM_TYPE_XSK_BUFF_POOL, add offset to data_end in tail
+fragment, so that later on user space will be able to take into account
+the amount of bytes added by XDP program.
+
+Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-10-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/core/filter.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/net/core/filter.c b/net/core/filter.c
+index 46ee0f5433e3..01f2417deef2 100644
+--- a/net/core/filter.c
++++ b/net/core/filter.c
+@@ -4081,6 +4081,8 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
+       memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset);
+       skb_frag_size_add(frag, offset);
+       sinfo->xdp_frags_size += offset;
++      if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
++              xsk_buff_get_tail(xdp)->data_end += offset;
+       return 0;
+ }
+-- 
+2.43.0
+
diff --git a/queue-6.6/xsk-fix-usage-of-multi-buffer-bpf-helpers-for-zc-xdp.patch b/queue-6.6/xsk-fix-usage-of-multi-buffer-bpf-helpers-for-zc-xdp.patch
new file mode 100644 (file)
index 0000000..dd775d6
--- /dev/null
@@ -0,0 +1,195 @@
+From 991e25fbea1c00cd8a3ebf0504a17c7f19093ee0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:54 +0100
+Subject: xsk: fix usage of multi-buffer BPF helpers for ZC XDP
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit c5114710c8ce86b8317e9b448f4fd15c711c2a82 ]
+
+Currently when packet is shrunk via bpf_xdp_adjust_tail() and memory
+type is set to MEM_TYPE_XSK_BUFF_POOL, null ptr dereference happens:
+
+[1136314.192256] BUG: kernel NULL pointer dereference, address:
+0000000000000034
+[1136314.203943] #PF: supervisor read access in kernel mode
+[1136314.213768] #PF: error_code(0x0000) - not-present page
+[1136314.223550] PGD 0 P4D 0
+[1136314.230684] Oops: 0000 [#1] PREEMPT SMP NOPTI
+[1136314.239621] CPU: 8 PID: 54203 Comm: xdpsock Not tainted 6.6.0+ #257
+[1136314.250469] Hardware name: Intel Corporation S2600WFT/S2600WFT,
+BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019
+[1136314.265615] RIP: 0010:__xdp_return+0x6c/0x210
+[1136314.274653] Code: ad 00 48 8b 47 08 49 89 f8 a8 01 0f 85 9b 01 00 00 0f 1f 44 00 00 f0 41 ff 48 34 75 32 4c 89 c7 e9 79 cd 80 ff 83 fe 03 75 17 <f6> 41 34 01 0f 85 02 01 00 00 48 89 cf e9 22 cc 1e 00 e9 3d d2 86
+[1136314.302907] RSP: 0018:ffffc900089f8db0 EFLAGS: 00010246
+[1136314.312967] RAX: ffffc9003168aed0 RBX: ffff8881c3300000 RCX:
+0000000000000000
+[1136314.324953] RDX: 0000000000000000 RSI: 0000000000000003 RDI:
+ffffc9003168c000
+[1136314.336929] RBP: 0000000000000ae0 R08: 0000000000000002 R09:
+0000000000010000
+[1136314.348844] R10: ffffc9000e495000 R11: 0000000000000040 R12:
+0000000000000001
+[1136314.360706] R13: 0000000000000524 R14: ffffc9003168aec0 R15:
+0000000000000001
+[1136314.373298] FS:  00007f8df8bbcb80(0000) GS:ffff8897e0e00000(0000)
+knlGS:0000000000000000
+[1136314.386105] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[1136314.396532] CR2: 0000000000000034 CR3: 00000001aa912002 CR4:
+00000000007706f0
+[1136314.408377] DR0: 0000000000000000 DR1: 0000000000000000 DR2:
+0000000000000000
+[1136314.420173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7:
+0000000000000400
+[1136314.431890] PKRU: 55555554
+[1136314.439143] Call Trace:
+[1136314.446058]  <IRQ>
+[1136314.452465]  ? __die+0x20/0x70
+[1136314.459881]  ? page_fault_oops+0x15b/0x440
+[1136314.468305]  ? exc_page_fault+0x6a/0x150
+[1136314.476491]  ? asm_exc_page_fault+0x22/0x30
+[1136314.484927]  ? __xdp_return+0x6c/0x210
+[1136314.492863]  bpf_xdp_adjust_tail+0x155/0x1d0
+[1136314.501269]  bpf_prog_ccc47ae29d3b6570_xdp_sock_prog+0x15/0x60
+[1136314.511263]  ice_clean_rx_irq_zc+0x206/0xc60 [ice]
+[1136314.520222]  ? ice_xmit_zc+0x6e/0x150 [ice]
+[1136314.528506]  ice_napi_poll+0x467/0x670 [ice]
+[1136314.536858]  ? ttwu_do_activate.constprop.0+0x8f/0x1a0
+[1136314.546010]  __napi_poll+0x29/0x1b0
+[1136314.553462]  net_rx_action+0x133/0x270
+[1136314.561619]  __do_softirq+0xbe/0x28e
+[1136314.569303]  do_softirq+0x3f/0x60
+
+This comes from __xdp_return() call with xdp_buff argument passed as
+NULL which is supposed to be consumed by xsk_buff_free() call.
+
+To address this properly, in ZC case, a node that represents the frag
+being removed has to be pulled out of xskb_list. Introduce
+appropriate xsk helpers to do such node operation and use them
+accordingly within bpf_xdp_adjust_tail().
+
+Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com> # For the xsk header part
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-4-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/xdp_sock_drv.h | 26 +++++++++++++++++++++++
+ net/core/filter.c          | 42 ++++++++++++++++++++++++++++++++------
+ 2 files changed, 62 insertions(+), 6 deletions(-)
+
+diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
+index 7290eb721c07..5425f7ad5ebd 100644
+--- a/include/net/xdp_sock_drv.h
++++ b/include/net/xdp_sock_drv.h
+@@ -147,6 +147,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
+       return ret;
+ }
++static inline void xsk_buff_del_tail(struct xdp_buff *tail)
++{
++      struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp);
++
++      list_del(&xskb->xskb_list_node);
++}
++
++static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
++{
++      struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
++      struct xdp_buff_xsk *frag;
++
++      frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk,
++                             xskb_list_node);
++      return &frag->xdp;
++}
++
+ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
+ {
+       xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
+@@ -310,6 +327,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
+       return NULL;
+ }
++static inline void xsk_buff_del_tail(struct xdp_buff *tail)
++{
++}
++
++static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
++{
++      return NULL;
++}
++
+ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
+ {
+ }
+diff --git a/net/core/filter.c b/net/core/filter.c
+index cbc395d96479..46ee0f5433e3 100644
+--- a/net/core/filter.c
++++ b/net/core/filter.c
+@@ -82,6 +82,7 @@
+ #include <net/mptcp.h>
+ #include <net/netfilter/nf_conntrack_bpf.h>
+ #include <linux/un.h>
++#include <net/xdp_sock_drv.h>
+ static const struct bpf_func_proto *
+ bpf_sk_base_func_proto(enum bpf_func_id func_id);
+@@ -4084,6 +4085,40 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
+       return 0;
+ }
++static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
++                                 struct xdp_mem_info *mem_info, bool release)
++{
++      struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp);
++
++      if (release) {
++              xsk_buff_del_tail(zc_frag);
++              __xdp_return(NULL, mem_info, false, zc_frag);
++      } else {
++              zc_frag->data_end -= shrink;
++      }
++}
++
++static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
++                              int shrink)
++{
++      struct xdp_mem_info *mem_info = &xdp->rxq->mem;
++      bool release = skb_frag_size(frag) == shrink;
++
++      if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
++              bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release);
++              goto out;
++      }
++
++      if (release) {
++              struct page *page = skb_frag_page(frag);
++
++              __xdp_return(page_address(page), mem_info, false, NULL);
++      }
++
++out:
++      return release;
++}
++
+ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
+ {
+       struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+@@ -4098,12 +4133,7 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
+               len_free += shrink;
+               offset -= shrink;
+-
+-              if (skb_frag_size(frag) == shrink) {
+-                      struct page *page = skb_frag_page(frag);
+-
+-                      __xdp_return(page_address(page), &xdp->rxq->mem,
+-                                   false, NULL);
++              if (bpf_xdp_shrink_data(xdp, frag, shrink)) {
+                       n_frags_free++;
+               } else {
+                       skb_frag_size_sub(frag, shrink);
+-- 
+2.43.0
+
diff --git a/queue-6.6/xsk-make-xsk_buff_pool-responsible-for-clearing-xdp_.patch b/queue-6.6/xsk-make-xsk_buff_pool-responsible-for-clearing-xdp_.patch
new file mode 100644 (file)
index 0000000..ad30a04
--- /dev/null
@@ -0,0 +1,107 @@
+From b86bbc55700fc6b7540c6611c8605721a4fac36a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:53 +0100
+Subject: xsk: make xsk_buff_pool responsible for clearing xdp_buff::flags
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit f7f6aa8e24383fbb11ac55942e66da9660110f80 ]
+
+XDP multi-buffer support introduced XDP_FLAGS_HAS_FRAGS flag that is
+used by drivers to notify data path whether xdp_buff contains fragments
+or not. Data path looks up mentioned flag on first buffer that occupies
+the linear part of xdp_buff, so drivers only modify it there. This is
+sufficient for SKB and XDP_DRV modes as usually xdp_buff is allocated on
+stack or it resides within struct representing driver's queue and
+fragments are carried via skb_frag_t structs. IOW, we are dealing with
+only one xdp_buff.
+
+ZC mode though relies on list of xdp_buff structs that is carried via
+xsk_buff_pool::xskb_list, so ZC data path has to make sure that
+fragments do *not* have XDP_FLAGS_HAS_FRAGS set. Otherwise,
+xsk_buff_free() could misbehave if it would be executed against xdp_buff
+that carries a frag with XDP_FLAGS_HAS_FRAGS flag set. Such scenario can
+take place when within supplied XDP program bpf_xdp_adjust_tail() is
+used with negative offset that would in turn release the tail fragment
+from multi-buffer frame.
+
+Calling xsk_buff_free() on tail fragment with XDP_FLAGS_HAS_FRAGS would
+result in releasing all the nodes from xskb_list that were produced by
+driver before XDP program execution, which is not what is intended -
+only tail fragment should be deleted from xskb_list and then it should
+be put onto xsk_buff_pool::free_list. Such multi-buffer frame will never
+make it up to user space, so from AF_XDP application POV there would be
+no traffic running, however due to free_list getting constantly new
+nodes, driver will be able to feed HW Rx queue with recycled buffers.
+Bottom line is that instead of traffic being redirected to user space,
+it would be continuously dropped.
+
+To fix this, let us clear the mentioned flag on xsk_buff_pool side
+during xdp_buff initialization, which is what should have been done
+right from the start of XSK multi-buffer support.
+
+Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support")
+Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support")
+Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-3-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_xsk.c | 1 -
+ drivers/net/ethernet/intel/ice/ice_xsk.c   | 1 -
+ include/net/xdp_sock_drv.h                 | 1 +
+ net/xdp/xsk_buff_pool.c                    | 1 +
+ 4 files changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+index 7d991e4d9b89..b75e6b6d317c 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+@@ -503,7 +503,6 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
+               xdp_res = i40e_run_xdp_zc(rx_ring, first, xdp_prog);
+               i40e_handle_xdp_result_zc(rx_ring, first, rx_desc, &rx_packets,
+                                         &rx_bytes, xdp_res, &failure);
+-              first->flags = 0;
+               next_to_clean = next_to_process;
+               if (failure)
+                       break;
+diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
+index 2a3f0834e139..33f194c870bb 100644
+--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
++++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
+@@ -897,7 +897,6 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
+               if (!first) {
+                       first = xdp;
+-                      xdp_buff_clear_frags_flag(first);
+               } else if (ice_add_xsk_frag(rx_ring, first, xdp, size)) {
+                       break;
+               }
+diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
+index 1f6fc8c7a84c..7290eb721c07 100644
+--- a/include/net/xdp_sock_drv.h
++++ b/include/net/xdp_sock_drv.h
+@@ -152,6 +152,7 @@ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
+       xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
+       xdp->data_meta = xdp->data;
+       xdp->data_end = xdp->data + size;
++      xdp->flags = 0;
+ }
+ static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool,
+diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
+index 49cb9f9a09be..b0a611677865 100644
+--- a/net/xdp/xsk_buff_pool.c
++++ b/net/xdp/xsk_buff_pool.c
+@@ -541,6 +541,7 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
+       xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM;
+       xskb->xdp.data_meta = xskb->xdp.data;
++      xskb->xdp.flags = 0;
+       if (pool->dma_need_sync) {
+               dma_sync_single_range_for_device(pool->dev, xskb->dma, 0,
+-- 
+2.43.0
+
diff --git a/queue-6.6/xsk-recycle-buffer-in-case-rx-queue-was-full.patch b/queue-6.6/xsk-recycle-buffer-in-case-rx-queue-was-full.patch
new file mode 100644 (file)
index 0000000..ccedf35
--- /dev/null
@@ -0,0 +1,58 @@
+From 3a23678456ba4de3d40da0dbc139590cb5a1c647 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:52 +0100
+Subject: xsk: recycle buffer in case Rx queue was full
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 269009893146c495f41e9572dd9319e787c2eba9 ]
+
+Add missing xsk_buff_free() call when __xsk_rcv_zc() failed to produce
+descriptor to XSK Rx queue.
+
+Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-2-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/xdp/xsk.c | 12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
+index 774a6d1916e4..d849dc04a334 100644
+--- a/net/xdp/xsk.c
++++ b/net/xdp/xsk.c
+@@ -166,8 +166,10 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+               contd = XDP_PKT_CONTD;
+       err = __xsk_rcv_zc(xs, xskb, len, contd);
+-      if (err || likely(!frags))
+-              goto out;
++      if (err)
++              goto err;
++      if (likely(!frags))
++              return 0;
+       xskb_list = &xskb->pool->xskb_list;
+       list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) {
+@@ -176,11 +178,13 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+               len = pos->xdp.data_end - pos->xdp.data;
+               err = __xsk_rcv_zc(xs, pos, len, contd);
+               if (err)
+-                      return err;
++                      goto err;
+               list_del(&pos->xskb_list_node);
+       }
+-out:
++      return 0;
++err:
++      xsk_buff_free(xdp);
+       return err;
+ }
+-- 
+2.43.0
+