--- /dev/null
+From 38166116c770d034a08477438037a48df4b8bf94 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 23 Feb 2024 13:15:02 +0000
+Subject: afs: Fix endless loop in directory parsing
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 5f7a07646655fb4108da527565dcdc80124b14c4 ]
+
+If a directory has a block with only ".__afsXXXX" files in it (from
+uncompleted silly-rename), these .__afsXXXX files are skipped but without
+advancing the file position in the dir_context. This leads to
+afs_dir_iterate() repeating the block again and again.
+
+Fix this by making the code that skips the .__afsXXXX file also manually
+advance the file position.
+
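+To make the failure mode concrete, here is a minimal userspace model of
+the loop (hypothetical names, not the actual afs code): if the skip path
+does not advance the position, the caller re-reads the same block forever.
+
+	#include <stdio.h>
+	#include <string.h>
+
+	#define ENT_SIZE 32	/* stand-in for sizeof(union afs_xdr_dirent) */
+	#define NR_ENTS   4
+
+	static const char *block[NR_ENTS] = {
+		".__afs1234", ".__afs5678", "foo", "bar",
+	};
+
+	/* Model of one afs_dir_iterate_block() pass over a block. */
+	static void iterate_block(unsigned long long *pos,
+				  unsigned long long blkoff)
+	{
+		for (int slot = 0; slot < NR_ENTS; slot++) {
+			unsigned long long off = blkoff + slot * ENT_SIZE;
+
+			if (off < *pos)
+				continue;
+			if (memcmp(block[slot], ".__afs", 6) == 0) {
+				/* The fix: without this assignment *pos
+				 * never moves past the block, so the
+				 * caller retries it endlessly.
+				 */
+				*pos = off + ENT_SIZE;
+				continue;
+			}
+			printf("emit %s\n", block[slot]);
+			*pos = off + ENT_SIZE;
+		}
+	}
+
+	int main(void)
+	{
+		unsigned long long pos = 0;
+
+		iterate_block(&pos, 0);
+		printf("pos=%llu (block consumed)\n", pos);
+		return 0;
+	}
+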
+The symptoms are a soft lockup:
+
+ watchdog: BUG: soft lockup - CPU#3 stuck for 52s! [check:5737]
+ ...
+ RIP: 0010:afs_dir_iterate_block+0x39/0x1fd
+ ...
+ ? watchdog_timer_fn+0x1a6/0x213
+ ...
+ ? asm_sysvec_apic_timer_interrupt+0x16/0x20
+ ? afs_dir_iterate_block+0x39/0x1fd
+ afs_dir_iterate+0x10a/0x148
+ afs_readdir+0x30/0x4a
+ iterate_dir+0x93/0xd3
+ __do_sys_getdents64+0x6b/0xd4
+
+This is almost certainly the actual fix for:
+
+ https://bugzilla.kernel.org/show_bug.cgi?id=218496
+
+Fixes: 57e9d49c5452 ("afs: Hide silly-rename files from userspace")
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/786185.1708694102@warthog.procyon.org.uk
+Reviewed-by: Marc Dionne <marc.dionne@auristor.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: Markus Suvanto <markus.suvanto@gmail.com>
+cc: linux-afs@lists.infradead.org
+Signed-off-by: Christian Brauner <brauner@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/dir.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/fs/afs/dir.c b/fs/afs/dir.c
+index 106426de50279..c4e22e9f7a666 100644
+--- a/fs/afs/dir.c
++++ b/fs/afs/dir.c
+@@ -497,8 +497,10 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
+ dire->u.name[0] == '.' &&
+ ctx->actor != afs_lookup_filldir &&
+ ctx->actor != afs_lookup_one_filldir &&
+- memcmp(dire->u.name, ".__afs", 6) == 0)
++ memcmp(dire->u.name, ".__afs", 6) == 0) {
++ ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent);
+ continue;
++ }
+
+ /* found the next entry */
+ if (!dir_emit(ctx, dire->u.name, nlen,
+--
+2.43.0
+
--- /dev/null
+From 6cacd78d3d0d7f306351ea57e1693c9ebcf06c13 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 21 Feb 2024 10:21:56 +0100
+Subject: ALSA: Drop leftover snd-rtctimer stuff from Makefile
+
+From: Takashi Iwai <tiwai@suse.de>
+
+[ Upstream commit 4df49712eb54141be00a9312547436d55677f092 ]
+
+We forgot to remove the line for snd-rtctimer from the Makefile while
+dropping the functionality. Get rid of the stale line.
+
+Fixes: 34ce71a96dcb ("ALSA: timer: remove legacy rtctimer")
+Link: https://lore.kernel.org/r/20240221092156.28695-1-tiwai@suse.de
+Signed-off-by: Takashi Iwai <tiwai@suse.de>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ sound/core/Makefile | 1 -
+ 1 file changed, 1 deletion(-)
+
+diff --git a/sound/core/Makefile b/sound/core/Makefile
+index 79e1407cd0de7..7da92e0383e1c 100644
+--- a/sound/core/Makefile
++++ b/sound/core/Makefile
+@@ -33,7 +33,6 @@ snd-ctl-led-objs := control_led.o
+ snd-rawmidi-objs := rawmidi.o
+ snd-timer-objs := timer.o
+ snd-hrtimer-objs := hrtimer.o
+-snd-rtctimer-objs := rtctimer.o
+ snd-hwdep-objs := hwdep.o
+ snd-seq-device-objs := seq_device.o
+
+--
+2.43.0
+
--- /dev/null
+From 3cd502c7c9bd2a1f6501604c1bf1203a1befa44e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 4 Jan 2024 11:56:32 +0000
+Subject: Bluetooth: Avoid potential use-after-free in hci_error_reset
+
+From: Ying Hsu <yinghsu@chromium.org>
+
+[ Upstream commit 2449007d3f73b2842c9734f45f0aadb522daf592 ]
+
+While handling the HCI_EV_HARDWARE_ERROR event, if the underlying
+BT controller is not responding, the GPIO reset mechanism would
+free the hci_dev and lead to a use-after-free in hci_error_reset.
+
+Here's the call trace observed on a ChromeOS device with Intel AX201:
+ queue_work_on+0x3e/0x6c
+ __hci_cmd_sync_sk+0x2ee/0x4c0 [bluetooth <HASH:3b4a6>]
+ ? init_wait_entry+0x31/0x31
+ __hci_cmd_sync+0x16/0x20 [bluetooth <HASH:3b4a6>]
+ hci_error_reset+0x4f/0xa4 [bluetooth <HASH:3b4a6>]
+ process_one_work+0x1d8/0x33f
+ worker_thread+0x21b/0x373
+ kthread+0x13a/0x152
+ ? pr_cont_work+0x54/0x54
+ ? kthread_blkcg+0x31/0x31
+ ret_from_fork+0x1f/0x30
+
+This patch holds a reference on the hci_dev while processing
+an HCI_EV_HARDWARE_ERROR event to avoid a potential crash.
+
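+The pattern, reduced to a userspace sketch (hypothetical refcount API,
+not the real hci_dev internals): the worker pins the object for its whole
+runtime, so the final reference drop, and thus the free, can only happen
+after the handler is done with the device.
+
+	#include <stdio.h>
+	#include <stdlib.h>
+
+	struct dev { int refcnt; };
+
+	static void dev_hold(struct dev *d) { d->refcnt++; }
+
+	static void dev_put(struct dev *d)
+	{
+		if (--d->refcnt == 0) {
+			printf("dev freed\n");
+			free(d);
+		}
+	}
+
+	/* Model of hci_error_reset(): hold a reference across the whole
+	 * handler so a concurrent close/reset path dropping its reference
+	 * cannot free the device while we still touch it.
+	 */
+	static void error_reset(struct dev *d)
+	{
+		dev_hold(d);
+		/* ... hw_error callback, do_close, conditional do_open ... */
+		dev_put(d);
+	}
+
+	int main(void)
+	{
+		struct dev *d = calloc(1, sizeof(*d));
+
+		d->refcnt = 1;
+		error_reset(d);
+		dev_put(d);	/* final put frees here, not mid-handler */
+		return 0;
+	}
+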
+Fixes: c7741d16a57c ("Bluetooth: Perform a power cycle when receiving hardware error event")
+Signed-off-by: Ying Hsu <yinghsu@chromium.org>
+Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/bluetooth/hci_core.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
+index b3b597960c562..a8854b24f4cfb 100644
+--- a/net/bluetooth/hci_core.c
++++ b/net/bluetooth/hci_core.c
+@@ -2330,6 +2330,7 @@ static void hci_error_reset(struct work_struct *work)
+ {
+ struct hci_dev *hdev = container_of(work, struct hci_dev, error_reset);
+
++ hci_dev_hold(hdev);
+ BT_DBG("%s", hdev->name);
+
+ if (hdev->hw_error)
+@@ -2337,10 +2338,10 @@ static void hci_error_reset(struct work_struct *work)
+ else
+ bt_dev_err(hdev, "hardware error 0x%2.2x", hdev->hw_error_code);
+
+- if (hci_dev_do_close(hdev))
+- return;
++ if (!hci_dev_do_close(hdev))
++ hci_dev_do_open(hdev);
+
+- hci_dev_do_open(hdev);
++ hci_dev_put(hdev);
+ }
+
+ void hci_uuids_clear(struct hci_dev *hdev)
+--
+2.43.0
+
--- /dev/null
+From d5a12daef1accfe1f0be72b6de584739d96c11c3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 25 Jan 2024 14:50:28 +0800
+Subject: Bluetooth: Enforce validation on max value of connection interval
+
+From: Kai-Heng Feng <kai.heng.feng@canonical.com>
+
+[ Upstream commit e4b019515f950b4e6e5b74b2e1bb03a90cb33039 ]
+
+Right now the Linux BT stack cannot pass test case "GAP/CONN/CPUP/BV-05-C
+'Connection Parameter Update Procedure Invalid Parameters Central
+Responder'" in Bluetooth Test Suite revision GAP.TS.p44. [0]
+
+That was resolved by commit c49a8682fc5d ("Bluetooth: validate BLE
+connection interval updates"), but it was later reverted because devices
+like keyboards and mice may require a low connection interval.
+
+So only validate the max value of the connection interval to pass the Test
+Suite, and let devices request a low connection interval if needed.
+
+[0] https://www.bluetooth.org/docman/handlers/DownloadDoc.ashx?doc_id=229869
+
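+As a hedged sketch of the accepted window (le_conn_max_interval standing
+in for the per-connection bound the patch checks against, the rest of the
+validation left to a stub):
+
+	#include <stdio.h>
+
+	/* Reject a request whose max interval exceeds the connection's
+	 * negotiated bound; everything else still goes through the
+	 * generic range validation (hci_check_conn_params() upstream).
+	 */
+	static int conn_param_ok(unsigned int min, unsigned int max,
+				 unsigned int le_conn_max_interval)
+	{
+		if (max > le_conn_max_interval)
+			return 0;	/* send negative reply */
+		return min <= max;	/* stub for the generic checks */
+	}
+
+	int main(void)
+	{
+		/* bound 24: a request for max=3200 must be rejected */
+		printf("%d\n", conn_param_ok(6, 3200, 24));	/* 0 */
+		printf("%d\n", conn_param_ok(6, 24, 24));	/* 1 */
+		return 0;
+	}
+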
+Fixes: 68d19d7d9957 ("Revert "Bluetooth: validate BLE connection interval updates"")
+Signed-off-by: Kai-Heng Feng <kai.heng.feng@canonical.com>
+Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/bluetooth/hci_event.c | 4 ++++
+ net/bluetooth/l2cap_core.c | 8 +++++++-
+ 2 files changed, 11 insertions(+), 1 deletion(-)
+
+diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
+index 0bfd856d079d5..ba7242729a8fb 100644
+--- a/net/bluetooth/hci_event.c
++++ b/net/bluetooth/hci_event.c
+@@ -6058,6 +6058,10 @@ static void hci_le_remote_conn_param_req_evt(struct hci_dev *hdev,
+ return send_conn_param_neg_reply(hdev, handle,
+ HCI_ERROR_UNKNOWN_CONN_ID);
+
++ if (max > hcon->le_conn_max_interval)
++ return send_conn_param_neg_reply(hdev, handle,
++ HCI_ERROR_INVALID_LL_PARAMS);
++
+ if (hci_check_conn_params(min, max, latency, timeout))
+ return send_conn_param_neg_reply(hdev, handle,
+ HCI_ERROR_INVALID_LL_PARAMS);
+diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
+index 850b6aab73779..11bfc8737e6ce 100644
+--- a/net/bluetooth/l2cap_core.c
++++ b/net/bluetooth/l2cap_core.c
+@@ -5614,7 +5614,13 @@ static inline int l2cap_conn_param_update_req(struct l2cap_conn *conn,
+
+ memset(&rsp, 0, sizeof(rsp));
+
+- err = hci_check_conn_params(min, max, latency, to_multiplier);
++ if (max > hcon->le_conn_max_interval) {
++ BT_DBG("requested connection interval exceeds current bounds.");
++ err = -EINVAL;
++ } else {
++ err = hci_check_conn_params(min, max, latency, to_multiplier);
++ }
++
+ if (err)
+ rsp.result = cpu_to_le16(L2CAP_CONN_PARAM_REJECTED);
+ else
+--
+2.43.0
+
--- /dev/null
+From 9bb5f007e7a4af2a8e7284a87875684be129592f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 09:02:47 -0500
+Subject: Bluetooth: hci_event: Fix handling of HCI_EV_IO_CAPA_REQUEST
+
+From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
+
+[ Upstream commit 7e74aa53a68bf60f6019bd5d9a9a1406ec4d4865 ]
+
+If we receive HCI_EV_IO_CAPA_REQUEST while HCI_OP_READ_REMOTE_EXT_FEATURES
+has not yet been responded to, assume the remote does support SSP, since
+otherwise this event shouldn't be generated.
+
+Link: https://lore.kernel.org/linux-bluetooth/CABBYNZ+9UdG1cMZVmdtN3U2aS16AKMCyTARZZyFX7xTEDWcMOw@mail.gmail.com/T/#t
+Fixes: c7f59461f5a7 ("Bluetooth: Fix a refcnt underflow problem for hci_conn")
+Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/bluetooth/hci_event.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
+index c4a35d4612b05..0bfd856d079d5 100644
+--- a/net/bluetooth/hci_event.c
++++ b/net/bluetooth/hci_event.c
+@@ -4720,9 +4720,12 @@ static void hci_io_capa_request_evt(struct hci_dev *hdev, struct sk_buff *skb)
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &ev->bdaddr);
+- if (!conn || !hci_conn_ssp_enabled(conn))
++ if (!conn || !hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
+ goto unlock;
+
++ /* Assume remote supports SSP since it has triggered this event */
++ set_bit(HCI_CONN_SSP_ENABLED, &conn->flags);
++
+ hci_conn_hold(conn);
+
+ if (!hci_dev_test_flag(hdev, HCI_MGMT))
+--
+2.43.0
+
--- /dev/null
+From bbf6131045fd6659821e6ea7c4f14301cc924232 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 9 Jan 2024 19:03:23 +0800
+Subject: Bluetooth: hci_event: Fix wrongly recorded wakeup BD_ADDR
+
+From: Zijun Hu <quic_zijuhu@quicinc.com>
+
+[ Upstream commit 61a5ab72edea7ebc3ad2c6beea29d966f528ebfb ]
+
+hci_store_wake_reason() wrongly parses the event HCI_Connection_Request
+as HCI_Connection_Complete and HCI_Connection_Complete as
+HCI_Connection_Request, causing the wakeup BD_ADDR to be recorded
+incorrectly and a potential stability issue. Fix it by using the correct
+field.
+
+Fixes: 2f20216c1d6f ("Bluetooth: Emit controller suspend and resume events")
+Signed-off-by: Zijun Hu <quic_zijuhu@quicinc.com>
+Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/bluetooth/hci_event.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
+index 2ad2f4647847c..c4a35d4612b05 100644
+--- a/net/bluetooth/hci_event.c
++++ b/net/bluetooth/hci_event.c
+@@ -6272,10 +6272,10 @@ static void hci_store_wake_reason(struct hci_dev *hdev, u8 event,
+ * keep track of the bdaddr of the connection event that woke us up.
+ */
+ if (event == HCI_EV_CONN_REQUEST) {
+- bacpy(&hdev->wake_addr, &conn_complete->bdaddr);
++ bacpy(&hdev->wake_addr, &conn_request->bdaddr);
+ hdev->wake_addr_type = BDADDR_BREDR;
+ } else if (event == HCI_EV_CONN_COMPLETE) {
+- bacpy(&hdev->wake_addr, &conn_request->bdaddr);
++ bacpy(&hdev->wake_addr, &conn_complete->bdaddr);
+ hdev->wake_addr_type = BDADDR_BREDR;
+ } else if (event == HCI_EV_LE_META) {
+ struct hci_ev_le_meta *le_ev = (void *)skb->data;
+--
+2.43.0
+
--- /dev/null
+From 4c864a7c9d27772a6b65133248c2d758de049e75 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 17 Feb 2024 13:30:10 -0800
+Subject: cpufreq: intel_pstate: fix pstate limits enforcement for adjust_perf
+ call back
+
+From: Doug Smythies <dsmythies@telus.net>
+
+[ Upstream commit f0a0fc10abb062d122db5ac4ed42f6d1ca342649 ]
+
+There is a loophole in pstate limit clamping for the intel_cpufreq CPU
+frequency scaling driver (intel_pstate in passive mode) with the schedutil
+CPU frequency scaling governor and HWP (HardWare Pstate) control enabled,
+when the adjust_perf callback path is used.
+
+Fix it.
+
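+What the fix boils down to, as a standalone sketch (names mirror the
+driver's fields but the function is illustrative only):
+
+	#include <stdio.h>
+
+	static unsigned int clamp_min_pstate(unsigned int min_pstate,
+					     unsigned int min_perf_ratio,
+					     unsigned int max_perf_ratio)
+	{
+		if (min_pstate < min_perf_ratio)
+			min_pstate = min_perf_ratio;
+		/* the added clamp: without it, a schedutil request above
+		 * the user's max limit escaped enforcement on this path
+		 */
+		if (min_pstate > max_perf_ratio)
+			min_pstate = max_perf_ratio;
+		return min_pstate;
+	}
+
+	int main(void)
+	{
+		/* request 40 against limits [10, 30] -> clamped to 30 */
+		printf("%u\n", clamp_min_pstate(40, 10, 30));
+		return 0;
+	}
+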
+Fixes: a365ab6b9dfb ("cpufreq: intel_pstate: Implement the ->adjust_perf() callback")
+Signed-off-by: Doug Smythies <dsmythies@telus.net>
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/cpufreq/intel_pstate.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c
+index dd5f4eee9ffb6..4de71e772f514 100644
+--- a/drivers/cpufreq/intel_pstate.c
++++ b/drivers/cpufreq/intel_pstate.c
+@@ -2787,6 +2787,9 @@ static void intel_cpufreq_adjust_perf(unsigned int cpunum,
+ if (min_pstate < cpu->min_perf_ratio)
+ min_pstate = cpu->min_perf_ratio;
+
++ if (min_pstate > cpu->max_perf_ratio)
++ min_pstate = cpu->max_perf_ratio;
++
+ max_pstate = min(cap_pstate, cpu->max_perf_ratio);
+ if (max_pstate < min_pstate)
+ max_pstate = min_pstate;
+--
+2.43.0
+
--- /dev/null
+From d2836ae85f8c0e574fbd4404d0ca6cf69a6cc476 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 12 Feb 2024 12:24:40 +0100
+Subject: efi/capsule-loader: fix incorrect allocation size
+
+From: Arnd Bergmann <arnd@arndb.de>
+
+[ Upstream commit fccfa646ef3628097d59f7d9c1a3e84d4b6bb45e ]
+
+gcc-14 notices that the allocation with sizeof(void *) on 32-bit
+architectures is not enough for a 64-bit phys_addr_t:
+
+drivers/firmware/efi/capsule-loader.c: In function 'efi_capsule_open':
+drivers/firmware/efi/capsule-loader.c:295:24: error: allocation of insufficient size '4' for type 'phys_addr_t' {aka 'long long unsigned int'} with size '8' [-Werror=alloc-size]
+ 295 | cap_info->phys = kzalloc(sizeof(void *), GFP_KERNEL);
+ | ^
+
+Use the correct type instead here.
+
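+The size mismatch is easy to demonstrate with a 32-bit build, where a
+pointer is 4 bytes while phys_addr_t is a 64-bit quantity (the typedef
+below is a stand-in for the kernel's):
+
+	#include <stdint.h>
+	#include <stdio.h>
+
+	typedef uint64_t phys_addr_t;	/* 64-bit even on 32-bit with PAE */
+
+	int main(void)
+	{
+		/* kzalloc(sizeof(void *)) under-sizes the buffer by half
+		 * whenever sizeof(void *) < sizeof(phys_addr_t)
+		 */
+		printf("sizeof(void *)      = %zu\n", sizeof(void *));
+		printf("sizeof(phys_addr_t) = %zu\n", sizeof(phys_addr_t));
+		return 0;
+	}
+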
+Fixes: f24c4d478013 ("efi/capsule-loader: Reinstate virtual capsule mapping")
+Signed-off-by: Arnd Bergmann <arnd@arndb.de>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/firmware/efi/capsule-loader.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/firmware/efi/capsule-loader.c b/drivers/firmware/efi/capsule-loader.c
+index 3e8d4b51a8140..97bafb5f70389 100644
+--- a/drivers/firmware/efi/capsule-loader.c
++++ b/drivers/firmware/efi/capsule-loader.c
+@@ -292,7 +292,7 @@ static int efi_capsule_open(struct inode *inode, struct file *file)
+ return -ENOMEM;
+ }
+
+- cap_info->phys = kzalloc(sizeof(void *), GFP_KERNEL);
++ cap_info->phys = kzalloc(sizeof(phys_addr_t), GFP_KERNEL);
+ if (!cap_info->phys) {
+ kfree(cap_info->pages);
+ kfree(cap_info);
+--
+2.43.0
+
--- /dev/null
+From ea962eef9ca6461736867cb86a60da28aaa572cb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 8 Feb 2024 12:44:11 +0100
+Subject: fbcon: always restore the old font data in fbcon_do_set_font()
+
+From: Jiri Slaby (SUSE) <jirislaby@kernel.org>
+
+[ Upstream commit 00d6a284fcf3fad1b7e1b5bc3cd87cbfb60ce03f ]
+
+Commit a5a923038d70 ("fbdev: fbcon: Properly revert changes when
+vc_resize() failed") started restoring the old font data upon failure (of
+vc_resize()). But it does so only for user fonts, which means the
+"system"/internal fonts are not restored at all. As a result, the very
+first call to fbcon_do_set_font() performs no restore at all upon a
+failing vc_resize().
+
+This can be reproduced by Syzkaller to crash the system on the next
+invocation of font_get(). It's rather hard to hit the allocation failure
+in vc_resize() on the first font_set(), but not impossible, especially if
+fault injection is used to aid the execution/failure. It was
+demonstrated by Sirius:
+ BUG: unable to handle page fault for address: fffffffffffffff8
+ #PF: supervisor read access in kernel mode
+ #PF: error_code(0x0000) - not-present page
+ PGD cb7b067 P4D cb7b067 PUD cb7d067 PMD 0
+ Oops: 0000 [#1] PREEMPT SMP KASAN
+ CPU: 1 PID: 8007 Comm: poc Not tainted 6.7.0-g9d1694dc91ce #20
+ Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014
+ RIP: 0010:fbcon_get_font+0x229/0x800 drivers/video/fbdev/core/fbcon.c:2286
+ Call Trace:
+ <TASK>
+ con_font_get drivers/tty/vt/vt.c:4558 [inline]
+ con_font_op+0x1fc/0xf20 drivers/tty/vt/vt.c:4673
+ vt_k_ioctl drivers/tty/vt/vt_ioctl.c:474 [inline]
+ vt_ioctl+0x632/0x2ec0 drivers/tty/vt/vt_ioctl.c:752
+ tty_ioctl+0x6f8/0x1570 drivers/tty/tty_io.c:2803
+ vfs_ioctl fs/ioctl.c:51 [inline]
+ ...
+
+So restore the font data in any case, not only for user fonts. Note the
+later 'if' is now protected by 'old_userfont' and not 'old_data' as the
+latter is always set now. (And it is supposed to be non-NULL. Otherwise
+we would see the bug above again.)
+
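+The shape of the fix, as a hedged sketch (struct vc and resize_fails()
+are stand-ins, not the fbcon types): capture the old pointer for every
+font, so the error path always has something valid to restore.
+
+	struct vc { unsigned char *font_data; int userfont; };
+
+	static int resize_fails(struct vc *vc) { (void)vc; return 1; }
+
+	static int set_font(struct vc *vc, unsigned char *data, int userfont)
+	{
+		/* saved unconditionally now, not only for user fonts */
+		unsigned char *old_data = vc->font_data;
+		int old_userfont = vc->userfont;
+
+		vc->font_data = data;
+		vc->userfont = userfont;
+
+		if (resize_fails(vc)) {
+			/* restore works for system fonts too */
+			vc->font_data = old_data;
+			vc->userfont = old_userfont;
+			return -1;
+		}
+		return 0;
+	}
+
+	int main(void)
+	{
+		unsigned char sysfont[1], newfont[1];
+		struct vc vc = { .font_data = sysfont, .userfont = 0 };
+
+		set_font(&vc, newfont, 1);
+		/* vc.font_data points at sysfont again: no stale pointer */
+		return vc.font_data == sysfont ? 0 : 1;
+	}
+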
+Signed-off-by: Jiri Slaby (SUSE) <jirislaby@kernel.org>
+Fixes: a5a923038d70 ("fbdev: fbcon: Properly revert changes when vc_resize() failed")
+Reported-and-tested-by: Ubisectech Sirius <bugreport@ubisectech.com>
+Cc: Ubisectech Sirius <bugreport@ubisectech.com>
+Cc: Daniel Vetter <daniel@ffwll.ch>
+Cc: Helge Deller <deller@gmx.de>
+Cc: linux-fbdev@vger.kernel.org
+Cc: dri-devel@lists.freedesktop.org
+Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
+Link: https://patchwork.freedesktop.org/patch/msgid/20240208114411.14604-1-jirislaby@kernel.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/video/fbdev/core/fbcon.c | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c
+index b6712655ec1f0..b163b54b868e6 100644
+--- a/drivers/video/fbdev/core/fbcon.c
++++ b/drivers/video/fbdev/core/fbcon.c
+@@ -2409,11 +2409,9 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount,
+ struct fbcon_ops *ops = info->fbcon_par;
+ struct fbcon_display *p = &fb_display[vc->vc_num];
+ int resize, ret, old_userfont, old_width, old_height, old_charcount;
+- char *old_data = NULL;
++ u8 *old_data = vc->vc_font.data;
+
+ resize = (w != vc->vc_font.width) || (h != vc->vc_font.height);
+- if (p->userfont)
+- old_data = vc->vc_font.data;
+ vc->vc_font.data = (void *)(p->fontdata = data);
+ old_userfont = p->userfont;
+ if ((p->userfont = userfont))
+@@ -2447,13 +2445,13 @@ static int fbcon_do_set_font(struct vc_data *vc, int w, int h, int charcount,
+ update_screen(vc);
+ }
+
+- if (old_data && (--REFCOUNT(old_data) == 0))
++ if (old_userfont && (--REFCOUNT(old_data) == 0))
+ kfree(old_data - FONT_EXTRA_WORDS * sizeof(int));
+ return 0;
+
+ err_out:
+ p->fontdata = old_data;
+- vc->vc_font.data = (void *)old_data;
++ vc->vc_font.data = old_data;
+
+ if (userfont) {
+ p->userfont = old_userfont;
+--
+2.43.0
+
--- /dev/null
+From 8c38f5c11f86a0c30a105012d3c28ba9cc7d94f9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 27 Feb 2024 10:49:41 -0800
+Subject: igb: extend PTP timestamp adjustments to i211
+
+From: Oleksij Rempel <o.rempel@pengutronix.de>
+
+[ Upstream commit 0bb7b09392eb74b152719ae87b1ba5e4bf910ef0 ]
+
+The i211 requires the same PTP timestamp adjustments as the i210,
+according to its datasheet. To ensure consistent timestamping across
+different platforms, this change extends the existing adjustments to
+include the i211.
+
+The adjustment results were tested and are comparable on i210- and
+i211-based systems.
+
+Fixes: 3f544d2a4d5c ("igb: adjust PTP timestamps for Tx/Rx latency")
+Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
+Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
+Tested-by: Pucha Himasekhar Reddy <himasekharx.reddy.pucha@intel.com> (A Contingent worker at Intel)
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Link: https://lore.kernel.org/r/20240227184942.362710-1-anthony.l.nguyen@intel.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/igb/igb_ptp.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c
+index 9cdb7a856ab6c..1a1575e8577af 100644
+--- a/drivers/net/ethernet/intel/igb/igb_ptp.c
++++ b/drivers/net/ethernet/intel/igb/igb_ptp.c
+@@ -826,7 +826,7 @@ static void igb_ptp_tx_hwtstamp(struct igb_adapter *adapter)
+
+ igb_ptp_systim_to_hwtstamp(adapter, &shhwtstamps, regval);
+ /* adjust timestamp for the TX latency based on link speed */
+- if (adapter->hw.mac.type == e1000_i210) {
++ if (hw->mac.type == e1000_i210 || hw->mac.type == e1000_i211) {
+ switch (adapter->link_speed) {
+ case SPEED_10:
+ adjust = IGB_I210_TX_LATENCY_10;
+@@ -872,6 +872,7 @@ int igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va,
+ ktime_t *timestamp)
+ {
+ struct igb_adapter *adapter = q_vector->adapter;
++ struct e1000_hw *hw = &adapter->hw;
+ struct skb_shared_hwtstamps ts;
+ __le64 *regval = (__le64 *)va;
+ int adjust = 0;
+@@ -891,7 +892,7 @@ int igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector, void *va,
+ igb_ptp_systim_to_hwtstamp(adapter, &ts, le64_to_cpu(regval[1]));
+
+ /* adjust timestamp for the RX latency based on link speed */
+- if (adapter->hw.mac.type == e1000_i210) {
++ if (hw->mac.type == e1000_i210 || hw->mac.type == e1000_i211) {
+ switch (adapter->link_speed) {
+ case SPEED_10:
+ adjust = IGB_I210_RX_LATENCY_10;
+--
+2.43.0
+
--- /dev/null
+From 7b448de1f86cc50dbade510a3cafd85b321d4869 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 22 Feb 2024 12:17:47 +0000
+Subject: ipv6: fix potential "struct net" leak in inet6_rtm_getaddr()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 10bfd453da64a057bcfd1a49fb6b271c48653cdb ]
+
+It seems that if userspace provides a correct IFA_TARGET_NETNSID value
+but no IFA_ADDRESS and IFA_LOCAL attributes, inet6_rtm_getaddr()
+returns -EINVAL with an elevated "struct net" refcount.
+
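+The leak pattern in miniature (hypothetical names; the real function
+releases the target netns via its errout label): once a reference has
+been taken, every later failure must unwind through the release label
+instead of returning directly.
+
+	#include <stdio.h>
+
+	struct net { int refcnt; };
+
+	static struct net *get_target_net(struct net *n)
+	{
+		n->refcnt++;
+		return n;
+	}
+
+	static void put_net(struct net *n) { n->refcnt--; }
+
+	static int rtm_getaddr(struct net *net, int have_addr)
+	{
+		struct net *tgt = get_target_net(net);
+		int err = 0;
+
+		if (!have_addr) {
+			err = -22;	/* -EINVAL */
+			goto errout;	/* a bare "return -EINVAL" here is
+					 * exactly the reported refcount leak
+					 */
+		}
+		/* ... normal address lookup ... */
+	errout:
+		put_net(tgt);
+		return err;
+	}
+
+	int main(void)
+	{
+		struct net n = { .refcnt = 1 };
+
+		rtm_getaddr(&n, 0);
+		printf("refcnt=%d (expect 1)\n", n.refcnt);
+		return 0;
+	}
+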
+Fixes: 6ecf4c37eb3e ("ipv6: enable IFA_TARGET_NETNSID for RTM_GETADDR")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Christian Brauner <brauner@kernel.org>
+Cc: David Ahern <dsahern@kernel.org>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv6/addrconf.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
+index c52317184e3e2..968ca078191cd 100644
+--- a/net/ipv6/addrconf.c
++++ b/net/ipv6/addrconf.c
+@@ -5463,9 +5463,10 @@ static int inet6_rtm_getaddr(struct sk_buff *in_skb, struct nlmsghdr *nlh,
+ }
+
+ addr = extract_addr(tb[IFA_ADDRESS], tb[IFA_LOCAL], &peer);
+- if (!addr)
+- return -EINVAL;
+-
++ if (!addr) {
++ err = -EINVAL;
++ goto errout;
++ }
+ ifm = nlmsg_data(nlh);
+ if (ifm->ifa_index)
+ dev = dev_get_by_index(tgt_net, ifm->ifa_index);
+--
+2.43.0
+
--- /dev/null
+From 1aaed7c27b7dc092c5dcbda64bc0bd79e8703ecd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 22 Feb 2024 13:38:38 +0100
+Subject: lan78xx: enable auto speed configuration for LAN7850 if no EEPROM is
+ detected
+
+From: Oleksij Rempel <o.rempel@pengutronix.de>
+
+[ Upstream commit 0e67899abfbfdea0c3c0ed3fd263ffc601c5c157 ]
+
+Same as the LAN7800, the LAN7850 can be used without an EEPROM. If the
+EEPROM is not present or not flashed, the LAN7850 will fail to sync the
+speed detected by the PHY with the MAC. If the link speed is 100Mbit it
+will accidentally work; otherwise no data can be transferred.
+
+A better way would be to implement a link_up callback, or to set the auto
+speed configuration unconditionally. But these changes would be more
+intrusive. So, for now, set it only if no EEPROM is found.
+
+Fixes: e69647a19c87 ("lan78xx: Set ASD in MAC_CR when EEE is enabled.")
+Signed-off-by: Oleksij Rempel <o.rempel@pengutronix.de>
+Link: https://lore.kernel.org/r/20240222123839.2816561-1-o.rempel@pengutronix.de
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/usb/lan78xx.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
+index 5700c9d20a3e2..c8b42892655a1 100644
+--- a/drivers/net/usb/lan78xx.c
++++ b/drivers/net/usb/lan78xx.c
+@@ -2862,7 +2862,8 @@ static int lan78xx_reset(struct lan78xx_net *dev)
+ if (dev->chipid == ID_REV_CHIP_ID_7801_)
+ buf &= ~MAC_CR_GMII_EN_;
+
+- if (dev->chipid == ID_REV_CHIP_ID_7800_) {
++ if (dev->chipid == ID_REV_CHIP_ID_7800_ ||
++ dev->chipid == ID_REV_CHIP_ID_7850_) {
+ ret = lan78xx_read_raw_eeprom(dev, 0, 1, &sig);
+ if (!ret && sig != EEPROM_INDICATOR) {
+ /* Implies there is no external eeprom. Set mac speed */
+--
+2.43.0
+
--- /dev/null
+From 4a0367bb181569b5cc86ae4e1018b971b3e769ca Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 8 Nov 2023 09:07:01 -0600
+Subject: mtd: spinand: gigadevice: Fix the get ecc status issue
+
+From: Han Xu <han.xu@nxp.com>
+
+[ Upstream commit 59950610c0c00c7a06d8a75d2ee5d73dba4274cf ]
+
+Some GigaDevice ecc_get_status functions use an on-stack buffer for the
+spi_mem_op, which causes spi_mem_check_op() to fail. Fix the issue by
+using the spinand scratchbuf instead.
+
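+The underlying rule, sketched in userspace form (spi_mem_exec() below is
+a stand-in for issuing the op): buffers handed to spi-mem may be
+DMA-mapped, so they must be driver-owned heap memory, never stack
+variables, with the value copied out after the transfer.
+
+	#include <stdio.h>
+	#include <stdlib.h>
+
+	struct spinand { unsigned char *scratchbuf; };
+
+	/* models the controller writing the register value into buf */
+	static void spi_mem_exec(unsigned char *buf) { *buf = 0x30; }
+
+	static int ecc_get_status(struct spinand *s)
+	{
+		unsigned char status2;
+
+		spi_mem_exec(s->scratchbuf);	/* DMA-safe destination */
+		status2 = *s->scratchbuf;	/* copy out afterwards */
+		return status2 >> 4;
+	}
+
+	int main(void)
+	{
+		struct spinand s = { .scratchbuf = malloc(16) };
+
+		printf("ecc status bits: %d\n", ecc_get_status(&s));
+		free(s.scratchbuf);
+		return 0;
+	}
+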
+Fixes: c40c7a990a46 ("mtd: spinand: Add support for GigaDevice GD5F1GQ4UExxG")
+Signed-off-by: Han Xu <han.xu@nxp.com>
+Signed-off-by: Miquel Raynal <miquel.raynal@bootlin.com>
+Link: https://lore.kernel.org/linux-mtd/20231108150701.593912-1-han.xu@nxp.com
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/mtd/nand/spi/gigadevice.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/mtd/nand/spi/gigadevice.c b/drivers/mtd/nand/spi/gigadevice.c
+index da77ab20296ea..56d1b56615f97 100644
+--- a/drivers/mtd/nand/spi/gigadevice.c
++++ b/drivers/mtd/nand/spi/gigadevice.c
+@@ -178,7 +178,7 @@ static int gd5fxgq4uexxg_ecc_get_status(struct spinand_device *spinand,
+ {
+ u8 status2;
+ struct spi_mem_op op = SPINAND_GET_FEATURE_OP(GD5FXGQXXEXXG_REG_STATUS2,
+- &status2);
++ spinand->scratchbuf);
+ int ret;
+
+ switch (status & STATUS_ECC_MASK) {
+@@ -199,6 +199,7 @@ static int gd5fxgq4uexxg_ecc_get_status(struct spinand_device *spinand,
+ * report the maximum of 4 in this case
+ */
+ /* bits sorted this way (3...0): ECCS1,ECCS0,ECCSE1,ECCSE0 */
++ status2 = *(spinand->scratchbuf);
+ return ((status & STATUS_ECC_MASK) >> 2) |
+ ((status2 & STATUS_ECC_MASK) >> 4);
+
+@@ -220,7 +221,7 @@ static int gd5fxgq5xexxg_ecc_get_status(struct spinand_device *spinand,
+ {
+ u8 status2;
+ struct spi_mem_op op = SPINAND_GET_FEATURE_OP(GD5FXGQXXEXXG_REG_STATUS2,
+- &status2);
++ spinand->scratchbuf);
+ int ret;
+
+ switch (status & STATUS_ECC_MASK) {
+@@ -240,6 +241,7 @@ static int gd5fxgq5xexxg_ecc_get_status(struct spinand_device *spinand,
+ * 1 ... 4 bits are flipped (and corrected)
+ */
+ /* bits sorted this way (1...0): ECCSE1, ECCSE0 */
++ status2 = *(spinand->scratchbuf);
+ return ((status2 & STATUS_ECC_MASK) >> 4) + 1;
+
+ case STATUS_ECC_UNCOR_ERROR:
+--
+2.43.0
+
--- /dev/null
+From c9e4dec79cee5595362c0b72b84e612297ad973b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 29 Apr 2022 08:17:35 +0300
+Subject: net: enable memcg accounting for veth queues
+
+From: Vasily Averin <vvs@openvz.org>
+
+[ Upstream commit 961c6136359eef38a8c023d02028fdcd123f02a6 ]
+
+The veth netdevice defines its own rx queues and allocates an array
+containing up to 4095 ~750-byte-long 'struct veth_rq' elements. Such an
+allocation is quite large and should be accounted to memcg.
+
+Signed-off-by: Vasily Averin <vvs@openvz.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 1ce7d306ea63 ("veth: try harder when allocating queue memory")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/veth.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/veth.c b/drivers/net/veth.c
+index 85c3e12f83627..87cee614618ca 100644
+--- a/drivers/net/veth.c
++++ b/drivers/net/veth.c
+@@ -1303,7 +1303,7 @@ static int veth_alloc_queues(struct net_device *dev)
+ struct veth_priv *priv = netdev_priv(dev);
+ int i;
+
+- priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL);
++ priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL_ACCOUNT);
+ if (!priv->rq)
+ return -ENOMEM;
+
+--
+2.43.0
+
--- /dev/null
+From bada6523094d175dcf46b7499d3a14131760a80a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 20 Feb 2024 14:56:02 +0100
+Subject: net: ip_tunnel: prevent perpetual headroom growth
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit 5ae1e9922bbdbaeb9cfbe91085ab75927488ac0f ]
+
+syzkaller triggered following kasan splat:
+BUG: KASAN: use-after-free in __skb_flow_dissect+0x19d1/0x7a50 net/core/flow_dissector.c:1170
+Read of size 1 at addr ffff88812fb4000e by task syz-executor183/5191
+[..]
+ kasan_report+0xda/0x110 mm/kasan/report.c:588
+ __skb_flow_dissect+0x19d1/0x7a50 net/core/flow_dissector.c:1170
+ skb_flow_dissect_flow_keys include/linux/skbuff.h:1514 [inline]
+ ___skb_get_hash net/core/flow_dissector.c:1791 [inline]
+ __skb_get_hash+0xc7/0x540 net/core/flow_dissector.c:1856
+ skb_get_hash include/linux/skbuff.h:1556 [inline]
+ ip_tunnel_xmit+0x1855/0x33c0 net/ipv4/ip_tunnel.c:748
+ ipip_tunnel_xmit+0x3cc/0x4e0 net/ipv4/ipip.c:308
+ __netdev_start_xmit include/linux/netdevice.h:4940 [inline]
+ netdev_start_xmit include/linux/netdevice.h:4954 [inline]
+ xmit_one net/core/dev.c:3548 [inline]
+ dev_hard_start_xmit+0x13d/0x6d0 net/core/dev.c:3564
+ __dev_queue_xmit+0x7c1/0x3d60 net/core/dev.c:4349
+ dev_queue_xmit include/linux/netdevice.h:3134 [inline]
+ neigh_connected_output+0x42c/0x5d0 net/core/neighbour.c:1592
+ ...
+ ip_finish_output2+0x833/0x2550 net/ipv4/ip_output.c:235
+ ip_finish_output+0x31/0x310 net/ipv4/ip_output.c:323
+ ..
+ iptunnel_xmit+0x5b4/0x9b0 net/ipv4/ip_tunnel_core.c:82
+ ip_tunnel_xmit+0x1dbc/0x33c0 net/ipv4/ip_tunnel.c:831
+ ipgre_xmit+0x4a1/0x980 net/ipv4/ip_gre.c:665
+ __netdev_start_xmit include/linux/netdevice.h:4940 [inline]
+ netdev_start_xmit include/linux/netdevice.h:4954 [inline]
+ xmit_one net/core/dev.c:3548 [inline]
+ dev_hard_start_xmit+0x13d/0x6d0 net/core/dev.c:3564
+ ...
+
+The splat occurs because skb->data points past skb->head allocated area.
+This is because the neigh layer does:
+ __skb_pull(skb, skb_network_offset(skb));
+
+... but skb_network_offset() returns a negative offset and __skb_pull()
+arg is unsigned. IOW, skb->data gets "adjusted" by a huge value.
+
+The negative value is returned because the distance between skb->head and
+skb->data is more than 64k and skb->network_header (u16) has wrapped around.
+
+The bug is in the ip_tunnel infrastructure, which can cause
+dev->needed_headroom to increment ad infinitum.
+
+The syzkaller reproducer consists of packets getting routed via a gre
+tunnel, and route of gre encapsulated packets pointing at another (ipip)
+tunnel. The ipip encapsulation finds gre0 as next output device.
+
+This results in the following pattern:
+
+1). First packet is to be sent out via gre0.
+Route lookup found an output device, ipip0.
+
+2).
+ip_tunnel_xmit for gre0 bumps gre0->needed_headroom based on the future
+output device, rt.dev->needed_headroom (ipip0).
+
+3).
+ip output / start_xmit moves skb on to ipip0. which runs the same
+code path again (xmit recursion).
+
+4).
+Routing step for the post-gre0-encap packet finds gre0 as output device
+to use for ipip0 encapsulated packet.
+
+tunl0->needed_headroom is then incremented based on the (already bumped)
+gre0 device headroom.
+
+This repeats for every future packet:
+
+gre0->needed_headroom gets inflated because previous packets' ipip0 step
+incremented rt->dev (gre0) headroom, and ipip0 incremented because gre0
+needed_headroom was increased.
+
+For each subsequent packet, gre/ipip0->needed_headroom grows until
+post-expand-head reallocations result in a skb->head/data distance of
+more than 64k.
+
+Once that happens, skb->network_header (u16) wraps around when
+pskb_expand_head tries to make sure that skb_network_offset() is unchanged
+after the headroom expansion/reallocation.
+
+After this skb_network_offset(skb) returns a different (and negative)
+result post headroom expansion.
+
+The next trip to neigh layer (or anything else that would __skb_pull the
+network header) makes skb->data point to a memory location outside
+skb->head area.
+
+v2: Cap the needed_headroom update to an arbitrarily chosen upper limit to
+prevent perpetual increase, instead of dropping the headroom increment
+completely.
+
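+The feedback loop can be reproduced in a few lines (a model, with the
+512-byte cap mirroring the patch's arbitrarily chosen limit): two devices
+each derive their headroom from the other's, so every packet ratchets
+both values up unless a cap breaks the cycle.
+
+	#include <stdio.h>
+
+	#define MAX_ALLOWED 512
+
+	static void adj_headroom(unsigned int *dev_hr, unsigned int needed)
+	{
+		if (needed > MAX_ALLOWED)
+			needed = MAX_ALLOWED;
+		if (needed > *dev_hr)
+			*dev_hr = needed;
+	}
+
+	int main(void)
+	{
+		unsigned int gre0 = 16, ipip0 = 16;
+
+		/* each packet bumps one device from the other's headroom */
+		for (int pkt = 0; pkt < 2000; pkt++) {
+			adj_headroom(&gre0, ipip0 + 24);
+			adj_headroom(&ipip0, gre0 + 20);
+		}
+		/* uncapped, these would already exceed 64k here and wrap
+		 * the u16 skb offsets
+		 */
+		printf("gre0=%u ipip0=%u\n", gre0, ipip0);
+		return 0;
+	}
+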
+Reported-and-tested-by: syzbot+bfde3bef047a81b8fde6@syzkaller.appspotmail.com
+Closes: https://groups.google.com/g/syzkaller-bugs/c/fL9G6GtWskY/m/VKk_PR5FBAAJ
+Fixes: 243aad830e8a ("ip_gre: include route header_len in max_headroom calculation")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240220135606.4939-1-fw@strlen.de
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/ip_tunnel.c | 28 +++++++++++++++++++++-------
+ 1 file changed, 21 insertions(+), 7 deletions(-)
+
+diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
+index 426dc910aaf87..96b7cd3049a33 100644
+--- a/net/ipv4/ip_tunnel.c
++++ b/net/ipv4/ip_tunnel.c
+@@ -540,6 +540,20 @@ static int tnl_update_pmtu(struct net_device *dev, struct sk_buff *skb,
+ return 0;
+ }
+
++static void ip_tunnel_adj_headroom(struct net_device *dev, unsigned int headroom)
++{
++ /* we must cap headroom to some upperlimit, else pskb_expand_head
++ * will overflow header offsets in skb_headers_offset_update().
++ */
++ static const unsigned int max_allowed = 512;
++
++ if (headroom > max_allowed)
++ headroom = max_allowed;
++
++ if (headroom > READ_ONCE(dev->needed_headroom))
++ WRITE_ONCE(dev->needed_headroom, headroom);
++}
++
+ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+ u8 proto, int tunnel_hlen)
+ {
+@@ -613,13 +627,13 @@ void ip_md_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+ }
+
+ headroom += LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len;
+- if (headroom > READ_ONCE(dev->needed_headroom))
+- WRITE_ONCE(dev->needed_headroom, headroom);
+-
+- if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
++ if (skb_cow_head(skb, headroom)) {
+ ip_rt_put(rt);
+ goto tx_dropped;
+ }
++
++ ip_tunnel_adj_headroom(dev, headroom);
++
+ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, proto, tos, ttl,
+ df, !net_eq(tunnel->net, dev_net(dev)));
+ return;
+@@ -797,16 +811,16 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
+
+ max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ + rt->dst.header_len + ip_encap_hlen(&tunnel->encap);
+- if (max_headroom > READ_ONCE(dev->needed_headroom))
+- WRITE_ONCE(dev->needed_headroom, max_headroom);
+
+- if (skb_cow_head(skb, READ_ONCE(dev->needed_headroom))) {
++ if (skb_cow_head(skb, max_headroom)) {
+ ip_rt_put(rt);
+ dev->stats.tx_dropped++;
+ kfree_skb(skb);
+ return;
+ }
+
++ ip_tunnel_adj_headroom(dev, max_headroom);
++
+ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
+ df, !net_eq(tunnel->net, dev_net(dev)));
+ return;
+--
+2.43.0
+
--- /dev/null
+From 07a2d624ff2c54511c92e6d2a8440348a4b97921 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 25 Feb 2024 00:20:06 +0100
+Subject: net: usb: dm9601: fix wrong return value in dm9601_mdio_read
+
+From: Javier Carrasco <javier.carrasco.cruz@gmail.com>
+
+[ Upstream commit c68b2c9eba38ec3f60f4894b189090febf4d8d22 ]
+
+The MII code does not check the return value of mdio_read (among
+others), and therefore no error code should be sent. A previous fix to
+the use of an uninitialized variable propagates negative error codes,
+which might lead to wrong operations by the MII library.
+
+An example of such an issue is the use of mii_nway_restart by the dm9601
+driver. The mii_nway_restart function does not check the value returned
+by mdio_read, which in this case might be a negative number that could
+contain the exact bit the function checks (BMCR_ANENABLE = 0x1000).
+
+Return zero in case of error, as it is common practice in users of
+mdio_read to avoid wrong uses of the return value.
+
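+A small model of why a leaked errno is harmful here (the BMCR value and
+error code are illustrative): MII consumers treat the return value as raw
+register bits, and a negative errno can carry the very bit being tested.
+
+	#include <stdio.h>
+
+	#define BMCR_ANENABLE 0x1000
+
+	static int mdio_read(int fail)
+	{
+		if (fail)
+			return 0;	/* the fix: never leak an errno */
+		return 0x3100;		/* plausible BMCR contents */
+	}
+
+	int main(void)
+	{
+		/* Had the error path returned, say, -6 (0xff...fa), the
+		 * bit test below would still see BMCR_ANENABLE set and
+		 * steer mii_nway_restart() down the wrong path.
+		 */
+		int bmcr = mdio_read(1);
+
+		if (bmcr & BMCR_ANENABLE)
+			printf("restart autoneg\n");
+		else
+			printf("autoneg off, nothing to restart\n");
+		return 0;
+	}
+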
+Fixes: 8f8abb863fa5 ("net: usb: dm9601: fix uninitialized variable use in dm9601_mdio_read")
+Signed-off-by: Javier Carrasco <javier.carrasco.cruz@gmail.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Reviewed-by: Peter Korsgaard <peter@korsgaard.com>
+Link: https://lore.kernel.org/r/20240225-dm9601_ret_err-v1-1-02c1d959ea59@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/usb/dm9601.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/usb/dm9601.c b/drivers/net/usb/dm9601.c
+index 1959e12a3ff8a..f7357d884d6aa 100644
+--- a/drivers/net/usb/dm9601.c
++++ b/drivers/net/usb/dm9601.c
+@@ -232,7 +232,7 @@ static int dm9601_mdio_read(struct net_device *netdev, int phy_id, int loc)
+ err = dm_read_shared_word(dev, 1, loc, &res);
+ if (err < 0) {
+ netdev_err(dev->net, "MDIO read error: %d\n", err);
+- return err;
++ return 0;
+ }
+
+ netdev_dbg(dev->net,
+--
+2.43.0
+
--- /dev/null
+From 8efa248bc406923275a53b52bf40ba3122ff59d8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 21 Feb 2024 15:12:10 -0800
+Subject: net: veth: clear GRO when clearing XDP even when down
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit fe9f801355f0b47668419f30f1fac1cf4539e736 ]
+
+veth sets NETIF_F_GRO automatically when XDP is enabled,
+because both features use the same NAPI machinery.
+
+The logic to clear NETIF_F_GRO sits in veth_disable_xdp(), which
+is called both on ndo_stop and when XDP is turned off.
+To avoid the flag being cleared when the device is brought
+down, the clearing is skipped when IFF_UP is not set.
+Bringing the device down should indeed not modify its features.
+
+Unfortunately, this means that clearing is also skipped when
+XDP is disabled _while_ the device is down. And there's nothing
+on the open path to bring the device features back into sync.
+IOW, if the user enables XDP, disables it, and then brings the device up,
+we'll end up with a stray GRO flag set but no NAPI instances.
+
+We don't depend on the GRO flag on the datapath, so the datapath
+won't crash. We will crash (or hang), however, next time features
+are sync'ed (either by the user via ethtool or by the peer changing its
+config). The GRO flag will go away, and veth will try to disable the
+NAPIs. But the open path never created them since XDP was off; the GRO
+flag was a stray. If NAPI was initialized before, we'll hang in
+napi_disable(). If it never was, we'll crash trying to stop an
+uninitialized hrtimer.
+
+Move the GRO flag updates to the XDP enable / disable paths,
+instead of mixing them with the ndo_open / ndo_close paths.
+
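+The invariant, as a toy state model (field names are illustrative): the
+side-effect flag must be toggled where the state it shadows changes,
+i.e. on XDP attach/detach, not on open/close.
+
+	#include <stdbool.h>
+	#include <stdio.h>
+
+	struct veth { bool gro_requested, gro, xdp; };
+
+	static void xdp_set(struct veth *d, bool prog)
+	{
+		if (prog && !d->xdp && !d->gro_requested)
+			d->gro = true;	/* side effect of enabling XDP */
+		if (!prog && d->xdp && !d->gro_requested)
+			d->gro = false;	/* cleared even if device is down */
+		d->xdp = prog;
+	}
+
+	int main(void)
+	{
+		struct veth d = { 0 };
+
+		xdp_set(&d, true);	/* enable XDP while down */
+		xdp_set(&d, false);	/* disable while still down */
+		printf("gro=%d (expect 0: no stray flag)\n", d.gro);
+		return 0;
+	}
+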
+Fixes: d3256efd8e8b ("veth: allow enabling NAPI even without XDP")
+Reported-by: Thomas Gleixner <tglx@linutronix.de>
+Reported-by: syzbot+039399a9b96297ddedca@syzkaller.appspotmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Reviewed-by: Toke Høiland-Jørgensen <toke@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/veth.c | 35 +++++++++++++++++------------------
+ 1 file changed, 17 insertions(+), 18 deletions(-)
+
+diff --git a/drivers/net/veth.c b/drivers/net/veth.c
+index 984a153804096..85c3e12f83627 100644
+--- a/drivers/net/veth.c
++++ b/drivers/net/veth.c
+@@ -1079,14 +1079,6 @@ static int veth_enable_xdp(struct net_device *dev)
+ veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, true);
+ return err;
+ }
+-
+- if (!veth_gro_requested(dev)) {
+- /* user-space did not require GRO, but adding XDP
+- * is supposed to get GRO working
+- */
+- dev->features |= NETIF_F_GRO;
+- netdev_features_change(dev);
+- }
+ }
+ }
+
+@@ -1106,18 +1098,9 @@ static void veth_disable_xdp(struct net_device *dev)
+ for (i = 0; i < dev->real_num_rx_queues; i++)
+ rcu_assign_pointer(priv->rq[i].xdp_prog, NULL);
+
+- if (!netif_running(dev) || !veth_gro_requested(dev)) {
++ if (!netif_running(dev) || !veth_gro_requested(dev))
+ veth_napi_del(dev);
+
+- /* if user-space did not require GRO, since adding XDP
+- * enabled it, clear it now
+- */
+- if (!veth_gro_requested(dev) && netif_running(dev)) {
+- dev->features &= ~NETIF_F_GRO;
+- netdev_features_change(dev);
+- }
+- }
+-
+ veth_disable_xdp_range(dev, 0, dev->real_num_rx_queues, false);
+ }
+
+@@ -1497,6 +1480,14 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+ }
+
+ if (!old_prog) {
++ if (!veth_gro_requested(dev)) {
++ /* user-space did not require GRO, but adding
++ * XDP is supposed to get GRO working
++ */
++ dev->features |= NETIF_F_GRO;
++ netdev_features_change(dev);
++ }
++
+ peer->hw_features &= ~NETIF_F_GSO_SOFTWARE;
+ peer->max_mtu = max_mtu;
+ }
+@@ -1507,6 +1498,14 @@ static int veth_xdp_set(struct net_device *dev, struct bpf_prog *prog,
+ if (dev->flags & IFF_UP)
+ veth_disable_xdp(dev);
+
++ /* if user-space did not require GRO, since adding XDP
++ * enabled it, clear it now
++ */
++ if (!veth_gro_requested(dev)) {
++ dev->features &= ~NETIF_F_GRO;
++ netdev_features_change(dev);
++ }
++
+ if (peer) {
+ peer->hw_features |= NETIF_F_GSO_SOFTWARE;
+ peer->max_mtu = ETH_MAX_MTU;
+--
+2.43.0
+
--- /dev/null
+From d193e211fa7b4fbc137c4378057862b37c14f657 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 27 Feb 2024 16:17:51 +0100
+Subject: netfilter: bridge: confirm multicast packets before passing them up
+ the stack
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit 62e7151ae3eb465e0ab52a20c941ff33bb6332e9 ]
+
+conntrack nf_confirm logic cannot handle cloned skbs referencing
+the same nf_conn entry, which will happen for multicast (broadcast)
+frames on bridges.
+
+ Example:
+ macvlan0
+ |
+ br0
+ / \
+ ethX ethY
+
+ ethX (or Y) receives an L2 multicast or broadcast packet containing
+ an IP packet; the flow is not yet in the conntrack table.
+
+ 1. skb passes through bridge and fake-ip (br_netfilter) Prerouting.
+    -> skb->_nfct now references an unconfirmed entry
+ 2. skb is broad/mcast packet. bridge now passes clones out on each bridge
+ interface.
+ 3. skb gets passed up the stack.
+ 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb
+ and schedules a work queue to send them out on the lower devices.
+
+ The clone skb->_nfct is not a copy, it is the same entry as the
+ original skb. The macvlan rx handler then returns RX_HANDLER_PASS.
+ 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb.
+
+The Macvlan broadcast worker and normal confirm path will race.
+
+This race will not happen if step 2 already confirmed a clone. In that
+case later steps perform skb_clone() with skb->_nfct already confirmed (in
+hash table). This works fine.
+
+But such confirmation won't happen when eb/ip/nftables rules dropped the
+packets before they reached the nf_confirm step in postrouting.
+
+Pablo points out that nf_conntrack_bridge doesn't allow use of stateful
+nat, so we can safely discard the nf_conn entry and let inet call
+conntrack again.
+
+This doesn't work for bridge netfilter: skb could have a nat
+transformation. Also bridge nf prevents re-invocation of inet prerouting
+via 'sabotage_in' hook.
+
+Work around this problem by explicit confirmation of the entry at LOCAL_IN
+time, before upper layer has a chance to clone the unconfirmed entry.
+
+The downside is that this disables NAT and conntrack helpers.
+
+An alternative fix would be to add locking to all code parts that deal
+with unconfirmed packets, but even if that could be done in a sane way,
+this opens up other problems, for example:
+
+-m physdev --physdev-out eth0 -j SNAT --snat-to 1.2.3.4
+-m physdev --physdev-out eth1 -j SNAT --snat-to 1.2.3.5
+
+For multicast case, only one of such conflicting mappings will be
+created, conntrack only handles 1:1 NAT mappings.
+
+Users should create a setup that explicitly marks such traffic NOTRACK
+(conntrack bypass) to avoid this, but we cannot auto-bypass it: the
+ruleset might already have accept rules for untracked traffic, so the
+user-visible behaviour would change.
+
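+The ordering requirement can be modelled in a few lines (a sketch, not
+the conntrack data structures): confirmation must happen while exactly
+one skb references the entry; clones made afterwards all see a confirmed
+entry and never race to confirm it.
+
+	#include <stdbool.h>
+	#include <stdio.h>
+
+	struct entry { bool confirmed; int use; };
+
+	static int confirm(struct entry *e)
+	{
+		if (e->confirmed)
+			return 0;
+		if (e->use != 1)
+			return -1;	/* clones already share it: racy */
+		e->confirmed = true;	/* commit to the hash table */
+		return 0;
+	}
+
+	int main(void)
+	{
+		struct entry e = { .use = 1 };
+
+		confirm(&e);	/* bridge LOCAL_IN: before macvlan clones */
+		e.use++;	/* skb_clone(): safe, entry is confirmed */
+		printf("confirmed=%d use=%d\n", e.confirmed, e.use);
+		return 0;
+	}
+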
+Suggested-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217777
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/netfilter.h | 1 +
+ net/bridge/br_netfilter_hooks.c | 96 ++++++++++++++++++++++
+ net/bridge/netfilter/nf_conntrack_bridge.c | 30 +++++++
+ net/netfilter/nf_conntrack_core.c | 1 +
+ 4 files changed, 128 insertions(+)
+
+diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
+index c92bb1580f419..c69cbd64b5b46 100644
+--- a/include/linux/netfilter.h
++++ b/include/linux/netfilter.h
+@@ -461,6 +461,7 @@ struct nf_ct_hook {
+ const struct sk_buff *);
+ void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb);
+ void (*set_closing)(struct nf_conntrack *nfct);
++ int (*confirm)(struct sk_buff *skb);
+ };
+ extern const struct nf_ct_hook __rcu *nf_ct_hook;
+
+diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
+index f14beb9a62edb..8a114a5000466 100644
+--- a/net/bridge/br_netfilter_hooks.c
++++ b/net/bridge/br_netfilter_hooks.c
+@@ -43,6 +43,10 @@
+ #include <linux/sysctl.h>
+ #endif
+
++#if IS_ENABLED(CONFIG_NF_CONNTRACK)
++#include <net/netfilter/nf_conntrack_core.h>
++#endif
++
+ static unsigned int brnf_net_id __read_mostly;
+
+ struct brnf_net {
+@@ -537,6 +541,90 @@ static unsigned int br_nf_pre_routing(void *priv,
+ return NF_STOLEN;
+ }
+
++#if IS_ENABLED(CONFIG_NF_CONNTRACK)
++/* conntracks' nf_confirm logic cannot handle cloned skbs referencing
++ * the same nf_conn entry, which will happen for multicast (broadcast)
++ * Frames on bridges.
++ *
++ * Example:
++ * macvlan0
++ * br0
++ * ethX ethY
++ *
++ * ethX (or Y) receives multicast or broadcast packet containing
++ * an IP packet, not yet in conntrack table.
++ *
++ * 1. skb passes through bridge and fake-ip (br_netfilter)Prerouting.
++ * -> skb->_nfct now references a unconfirmed entry
++ * 2. skb is broad/mcast packet. bridge now passes clones out on each bridge
++ * interface.
++ * 3. skb gets passed up the stack.
++ * 4. In macvlan case, macvlan driver retains clone(s) of the mcast skb
++ * and schedules a work queue to send them out on the lower devices.
++ *
++ * The clone skb->_nfct is not a copy, it is the same entry as the
++ * original skb. The macvlan rx handler then returns RX_HANDLER_PASS.
++ * 5. Normal conntrack hooks (in NF_INET_LOCAL_IN) confirm the orig skb.
++ *
++ * The Macvlan broadcast worker and normal confirm path will race.
++ *
++ * This race will not happen if step 2 already confirmed a clone. In that
++ * case later steps perform skb_clone() with skb->_nfct already confirmed (in
++ * hash table). This works fine.
++ *
++ * But such confirmation won't happen when eb/ip/nftables rules dropped the
++ * packets before they reached the nf_confirm step in postrouting.
++ *
++ * Work around this problem by explicit confirmation of the entry at
++ * LOCAL_IN time, before upper layer has a chance to clone the unconfirmed
++ * entry.
++ *
++ */
++static unsigned int br_nf_local_in(void *priv,
++ struct sk_buff *skb,
++ const struct nf_hook_state *state)
++{
++ struct nf_conntrack *nfct = skb_nfct(skb);
++ const struct nf_ct_hook *ct_hook;
++ struct nf_conn *ct;
++ int ret;
++
++ if (!nfct || skb->pkt_type == PACKET_HOST)
++ return NF_ACCEPT;
++
++ ct = container_of(nfct, struct nf_conn, ct_general);
++ if (likely(nf_ct_is_confirmed(ct)))
++ return NF_ACCEPT;
++
++ WARN_ON_ONCE(skb_shared(skb));
++ WARN_ON_ONCE(refcount_read(&nfct->use) != 1);
++
++ /* We can't call nf_confirm here, it would create a dependency
++ * on nf_conntrack module.
++ */
++ ct_hook = rcu_dereference(nf_ct_hook);
++ if (!ct_hook) {
++ skb->_nfct = 0ul;
++ nf_conntrack_put(nfct);
++ return NF_ACCEPT;
++ }
++
++ nf_bridge_pull_encap_header(skb);
++ ret = ct_hook->confirm(skb);
++ switch (ret & NF_VERDICT_MASK) {
++ case NF_STOLEN:
++ return NF_STOLEN;
++ default:
++ nf_bridge_push_encap_header(skb);
++ break;
++ }
++
++ ct = container_of(nfct, struct nf_conn, ct_general);
++ WARN_ON_ONCE(!nf_ct_is_confirmed(ct));
++
++ return ret;
++}
++#endif
+
+ /* PF_BRIDGE/FORWARD *************************************************/
+ static int br_nf_forward_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
+@@ -935,6 +1023,14 @@ static const struct nf_hook_ops br_nf_ops[] = {
+ .hooknum = NF_BR_PRE_ROUTING,
+ .priority = NF_BR_PRI_BRNF,
+ },
++#if IS_ENABLED(CONFIG_NF_CONNTRACK)
++ {
++ .hook = br_nf_local_in,
++ .pf = NFPROTO_BRIDGE,
++ .hooknum = NF_BR_LOCAL_IN,
++ .priority = NF_BR_PRI_LAST,
++ },
++#endif
+ {
+ .hook = br_nf_forward_ip,
+ .pf = NFPROTO_BRIDGE,
+diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c
+index d14b2dbbd1dfb..83743e95939b1 100644
+--- a/net/bridge/netfilter/nf_conntrack_bridge.c
++++ b/net/bridge/netfilter/nf_conntrack_bridge.c
+@@ -290,6 +290,30 @@ static unsigned int nf_ct_bridge_pre(void *priv, struct sk_buff *skb,
+ return nf_conntrack_in(skb, &bridge_state);
+ }
+
++static unsigned int nf_ct_bridge_in(void *priv, struct sk_buff *skb,
++ const struct nf_hook_state *state)
++{
++ enum ip_conntrack_info ctinfo;
++ struct nf_conn *ct;
++
++ if (skb->pkt_type == PACKET_HOST)
++ return NF_ACCEPT;
++
++ /* nf_conntrack_confirm() cannot handle concurrent clones,
++ * this happens for broad/multicast frames with e.g. macvlan on top
++ * of the bridge device.
++ */
++ ct = nf_ct_get(skb, &ctinfo);
++ if (!ct || nf_ct_is_confirmed(ct) || nf_ct_is_template(ct))
++ return NF_ACCEPT;
++
++ /* let inet prerouting call conntrack again */
++ skb->_nfct = 0;
++ nf_ct_put(ct);
++
++ return NF_ACCEPT;
++}
++
+ static void nf_ct_bridge_frag_save(struct sk_buff *skb,
+ struct nf_bridge_frag_data *data)
+ {
+@@ -414,6 +438,12 @@ static struct nf_hook_ops nf_ct_bridge_hook_ops[] __read_mostly = {
+ .hooknum = NF_BR_PRE_ROUTING,
+ .priority = NF_IP_PRI_CONNTRACK,
+ },
++ {
++ .hook = nf_ct_bridge_in,
++ .pf = NFPROTO_BRIDGE,
++ .hooknum = NF_BR_LOCAL_IN,
++ .priority = NF_IP_PRI_CONNTRACK_CONFIRM,
++ },
+ {
+ .hook = nf_ct_bridge_post,
+ .pf = NFPROTO_BRIDGE,
+diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
+index e0f4f76439d3d..be6031886f942 100644
+--- a/net/netfilter/nf_conntrack_core.c
++++ b/net/netfilter/nf_conntrack_core.c
+@@ -2850,6 +2850,7 @@ static const struct nf_ct_hook nf_conntrack_hook = {
+ .get_tuple_skb = nf_conntrack_get_tuple_skb,
+ .attach = nf_conntrack_attach,
+ .set_closing = nf_conntrack_set_closing,
++ .confirm = __nf_conntrack_confirm,
+ };
+
+ void nf_conntrack_init_end(void)
+--
+2.43.0
+
--- /dev/null
+From 2bf15b346e8c5bbdbf2bf246a13d8a784adc60c5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 7 Jan 2022 05:03:23 +0100
+Subject: netfilter: core: move ip_ct_attach indirection to struct nf_ct_hook
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit 3fce16493dc1aa2c9af3d7e7bd360dfe203a3e6a ]
+
+ip_ct_attach predates struct nf_ct_hook; we can place it there and
+remove the exported symbol.
+
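+The refactor pattern, sketched standalone (names are illustrative):
+collapse a bare global function pointer into a member of the existing
+ops struct, leaving a single registration point.
+
+	#include <stdio.h>
+
+	struct sk_buff;
+
+	struct ct_hook {
+		/* ... other conntrack entry points ... */
+		void (*attach)(struct sk_buff *nskb,
+			       const struct sk_buff *skb);
+	};
+
+	static void my_attach(struct sk_buff *nskb, const struct sk_buff *skb)
+	{
+		(void)nskb; (void)skb;
+		printf("attach via ops struct\n");
+	}
+
+	static const struct ct_hook hook = { .attach = my_attach };
+	static const struct ct_hook *registered = &hook;
+
+	/* caller side: one pointer to dereference (RCU in the kernel) */
+	static void ct_attach(struct sk_buff *new, const struct sk_buff *skb)
+	{
+		const struct ct_hook *h = registered;
+
+		if (h && h->attach)
+			h->attach(new, skb);
+	}
+
+	int main(void)
+	{
+		ct_attach(NULL, NULL);
+		return 0;
+	}
+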
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Stable-dep-of: 62e7151ae3eb ("netfilter: bridge: confirm multicast packets before passing them up the stack")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/netfilter.h | 2 +-
+ net/netfilter/core.c | 19 ++++++++-----------
+ net/netfilter/nf_conntrack_core.c | 4 +---
+ 3 files changed, 10 insertions(+), 15 deletions(-)
+
+diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
+index e20c2db0f2c16..64acdf22eb4fa 100644
+--- a/include/linux/netfilter.h
++++ b/include/linux/netfilter.h
+@@ -435,7 +435,6 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
+ #if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ #include <linux/netfilter/nf_conntrack_zones_common.h>
+
+-extern void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *) __rcu;
+ void nf_ct_attach(struct sk_buff *, const struct sk_buff *);
+ struct nf_conntrack_tuple;
+ bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+@@ -458,6 +457,7 @@ struct nf_ct_hook {
+ void (*destroy)(struct nf_conntrack *);
+ bool (*get_tuple_skb)(struct nf_conntrack_tuple *,
+ const struct sk_buff *);
++ void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb);
+ };
+ extern struct nf_ct_hook __rcu *nf_ct_hook;
+
+diff --git a/net/netfilter/core.c b/net/netfilter/core.c
+index ffa84cafb746b..5396d27ba6a71 100644
+--- a/net/netfilter/core.c
++++ b/net/netfilter/core.c
+@@ -639,25 +639,22 @@ struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
+ EXPORT_SYMBOL_GPL(nf_ct_hook);
+
+ #if IS_ENABLED(CONFIG_NF_CONNTRACK)
+-/* This does not belong here, but locally generated errors need it if connection
+- tracking in use: without this, connection may not be in hash table, and hence
+- manufactured ICMP or RST packets will not be associated with it. */
+-void (*ip_ct_attach)(struct sk_buff *, const struct sk_buff *)
+- __rcu __read_mostly;
+-EXPORT_SYMBOL(ip_ct_attach);
+-
+ struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
+ EXPORT_SYMBOL_GPL(nf_nat_hook);
+
++/* This does not belong here, but locally generated errors need it if connection
++ * tracking in use: without this, connection may not be in hash table, and hence
++ * manufactured ICMP or RST packets will not be associated with it.
++ */
+ void nf_ct_attach(struct sk_buff *new, const struct sk_buff *skb)
+ {
+- void (*attach)(struct sk_buff *, const struct sk_buff *);
++ const struct nf_ct_hook *ct_hook;
+
+ if (skb->_nfct) {
+ rcu_read_lock();
+- attach = rcu_dereference(ip_ct_attach);
+- if (attach)
+- attach(new, skb);
++ ct_hook = rcu_dereference(nf_ct_hook);
++ if (ct_hook)
++ ct_hook->attach(new, skb);
+ rcu_read_unlock();
+ }
+ }
+diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
+index 10622760f894a..779e41d1afdce 100644
+--- a/net/netfilter/nf_conntrack_core.c
++++ b/net/netfilter/nf_conntrack_core.c
+@@ -2518,7 +2518,6 @@ static int kill_all(struct nf_conn *i, void *data)
+ void nf_conntrack_cleanup_start(void)
+ {
+ conntrack_gc_work.exiting = true;
+- RCU_INIT_POINTER(ip_ct_attach, NULL);
+ }
+
+ void nf_conntrack_cleanup_end(void)
+@@ -2838,12 +2837,11 @@ static struct nf_ct_hook nf_conntrack_hook = {
+ .update = nf_conntrack_update,
+ .destroy = nf_ct_destroy,
+ .get_tuple_skb = nf_conntrack_get_tuple_skb,
++ .attach = nf_conntrack_attach,
+ };
+
+ void nf_conntrack_init_end(void)
+ {
+- /* For use by REJECT target */
+- RCU_INIT_POINTER(ip_ct_attach, nf_conntrack_attach);
+ RCU_INIT_POINTER(nf_ct_hook, &nf_conntrack_hook);
+ }
+
+--
+2.43.0
+
--- /dev/null
+From 66e009926a2ed09a54351ac4e2b0c6ccb7423bdf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 1 Feb 2023 14:45:22 +0100
+Subject: netfilter: let reset rules clean out conntrack entries
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit 2954fe60e33da0f4de4d81a4c95c7dddb517d00c ]
+
+iptables/nftables support responding to tcp packets with tcp resets.
+
+The generated tcp reset packet passes through both output and postrouting
+netfilter hooks, but conntrack will never see them because the generated
+skb has its ->nfct pointer copied over from the packet that triggered the
+reset rule.
+
+If the reset rule is used for established connections, this
+may result in the conntrack entry staying around for a very long
+time (the default timeout is 5 days).
+
+One way to avoid this would be to not copy the nf_conn pointer
+so that the reset packet passes through conntrack too.
+
+The problem is that output rules might not have the same conntrack
+zone setup as the prerouting ones, so it's possible that the
+reset skb won't find the correct entry. Generating a template
+entry for the skb seems error prone as well.
+
+Add an explicit "closing" function that switches a confirmed
+conntrack entry to closed state and wire this up for tcp.
+
+If the entry isn't confirmed, no action is needed because
+the conntrack entry will never be committed to the table.
+
+Reported-by: Russell King <linux@armlinux.org.uk>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Stable-dep-of: 62e7151ae3eb ("netfilter: bridge: confirm multicast packets before passing them up the stack")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/netfilter.h | 3 +++
+ include/net/netfilter/nf_conntrack.h | 8 ++++++
+ net/ipv4/netfilter/nf_reject_ipv4.c | 1 +
+ net/ipv6/netfilter/nf_reject_ipv6.c | 1 +
+ net/netfilter/core.c | 16 ++++++++++++
+ net/netfilter/nf_conntrack_core.c | 12 +++++++++
+ net/netfilter/nf_conntrack_proto_tcp.c | 35 ++++++++++++++++++++++++++
+ 7 files changed, 76 insertions(+)
+
+diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
+index 5a665034c30be..c92bb1580f419 100644
+--- a/include/linux/netfilter.h
++++ b/include/linux/netfilter.h
+@@ -436,11 +436,13 @@ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
+ #include <linux/netfilter/nf_conntrack_zones_common.h>
+
+ void nf_ct_attach(struct sk_buff *, const struct sk_buff *);
++void nf_ct_set_closing(struct nf_conntrack *nfct);
+ struct nf_conntrack_tuple;
+ bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+ const struct sk_buff *skb);
+ #else
+ static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
++static inline void nf_ct_set_closing(struct nf_conntrack *nfct) {}
+ struct nf_conntrack_tuple;
+ static inline bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+ const struct sk_buff *skb)
+@@ -458,6 +460,7 @@ struct nf_ct_hook {
+ bool (*get_tuple_skb)(struct nf_conntrack_tuple *,
+ const struct sk_buff *);
+ void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb);
++ void (*set_closing)(struct nf_conntrack *nfct);
+ };
+ extern const struct nf_ct_hook __rcu *nf_ct_hook;
+
+diff --git a/include/net/netfilter/nf_conntrack.h b/include/net/netfilter/nf_conntrack.h
+index 34c266502a50e..39541ab912a16 100644
+--- a/include/net/netfilter/nf_conntrack.h
++++ b/include/net/netfilter/nf_conntrack.h
+@@ -123,6 +123,12 @@ struct nf_conn {
+ union nf_conntrack_proto proto;
+ };
+
++static inline struct nf_conn *
++nf_ct_to_nf_conn(const struct nf_conntrack *nfct)
++{
++ return container_of(nfct, struct nf_conn, ct_general);
++}
++
+ static inline struct nf_conn *
+ nf_ct_tuplehash_to_ctrack(const struct nf_conntrack_tuple_hash *hash)
+ {
+@@ -173,6 +179,8 @@ nf_ct_get(const struct sk_buff *skb, enum ip_conntrack_info *ctinfo)
+
+ void nf_ct_destroy(struct nf_conntrack *nfct);
+
++void nf_conntrack_tcp_set_closing(struct nf_conn *ct);
++
+ /* decrement reference count on a conntrack */
+ static inline void nf_ct_put(struct nf_conn *ct)
+ {
+diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
+index f2edb40c0db00..350aaca126181 100644
+--- a/net/ipv4/netfilter/nf_reject_ipv4.c
++++ b/net/ipv4/netfilter/nf_reject_ipv4.c
+@@ -278,6 +278,7 @@ void nf_send_reset(struct net *net, struct sock *sk, struct sk_buff *oldskb,
+ goto free_nskb;
+
+ nf_ct_attach(nskb, oldskb);
++ nf_ct_set_closing(skb_nfct(oldskb));
+
+ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ /* If we use ip_local_out for bridged traffic, the MAC source on
+diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
+index dffeaaaadcded..c0057edd84cfc 100644
+--- a/net/ipv6/netfilter/nf_reject_ipv6.c
++++ b/net/ipv6/netfilter/nf_reject_ipv6.c
+@@ -345,6 +345,7 @@ void nf_send_reset6(struct net *net, struct sock *sk, struct sk_buff *oldskb,
+ nf_reject_ip6_tcphdr_put(nskb, oldskb, otcph, otcplen);
+
+ nf_ct_attach(nskb, oldskb);
++ nf_ct_set_closing(skb_nfct(oldskb));
+
+ #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
+ /* If we use ip6_local_out for bridged traffic, the MAC source on
+diff --git a/net/netfilter/core.c b/net/netfilter/core.c
+index aa3f7d3228fda..fe81824799d95 100644
+--- a/net/netfilter/core.c
++++ b/net/netfilter/core.c
+@@ -674,6 +674,22 @@ void nf_conntrack_destroy(struct nf_conntrack *nfct)
+ }
+ EXPORT_SYMBOL(nf_conntrack_destroy);
+
++void nf_ct_set_closing(struct nf_conntrack *nfct)
++{
++ const struct nf_ct_hook *ct_hook;
++
++ if (!nfct)
++ return;
++
++ rcu_read_lock();
++ ct_hook = rcu_dereference(nf_ct_hook);
++ if (ct_hook)
++ ct_hook->set_closing(nfct);
++
++ rcu_read_unlock();
++}
++EXPORT_SYMBOL_GPL(nf_ct_set_closing);
++
+ bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+ const struct sk_buff *skb)
+ {
+diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
+index 2a4222eefc894..e0f4f76439d3d 100644
+--- a/net/netfilter/nf_conntrack_core.c
++++ b/net/netfilter/nf_conntrack_core.c
+@@ -2833,11 +2833,23 @@ int nf_conntrack_init_start(void)
+ return ret;
+ }
+
++static void nf_conntrack_set_closing(struct nf_conntrack *nfct)
++{
++ struct nf_conn *ct = nf_ct_to_nf_conn(nfct);
++
++ switch (nf_ct_protonum(ct)) {
++ case IPPROTO_TCP:
++ nf_conntrack_tcp_set_closing(ct);
++ break;
++ }
++}
++
+ static const struct nf_ct_hook nf_conntrack_hook = {
+ .update = nf_conntrack_update,
+ .destroy = nf_ct_destroy,
+ .get_tuple_skb = nf_conntrack_get_tuple_skb,
+ .attach = nf_conntrack_attach,
++ .set_closing = nf_conntrack_set_closing,
+ };
+
+ void nf_conntrack_init_end(void)
+diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c
+index 1ecfdc4f23be8..f33e6aea7f4da 100644
+--- a/net/netfilter/nf_conntrack_proto_tcp.c
++++ b/net/netfilter/nf_conntrack_proto_tcp.c
+@@ -870,6 +870,41 @@ static bool tcp_can_early_drop(const struct nf_conn *ct)
+ return false;
+ }
+
++void nf_conntrack_tcp_set_closing(struct nf_conn *ct)
++{
++ enum tcp_conntrack old_state;
++ const unsigned int *timeouts;
++ u32 timeout;
++
++ if (!nf_ct_is_confirmed(ct))
++ return;
++
++ spin_lock_bh(&ct->lock);
++ old_state = ct->proto.tcp.state;
++ ct->proto.tcp.state = TCP_CONNTRACK_CLOSE;
++
++ if (old_state == TCP_CONNTRACK_CLOSE ||
++ test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) {
++ spin_unlock_bh(&ct->lock);
++ return;
++ }
++
++ timeouts = nf_ct_timeout_lookup(ct);
++ if (!timeouts) {
++ const struct nf_tcp_net *tn;
++
++ tn = nf_tcp_pernet(nf_ct_net(ct));
++ timeouts = tn->timeouts;
++ }
++
++ timeout = timeouts[TCP_CONNTRACK_CLOSE];
++ WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp);
++
++ spin_unlock_bh(&ct->lock);
++
++ nf_conntrack_event_cache(IPCT_PROTOINFO, ct);
++}
++
+ static void nf_ct_tcp_state_reset(struct ip_ct_tcp_state *state)
+ {
+ state->td_end = 0;
+--
+2.43.0
+
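+For context, these are the kinds of reset rules whose generated RSTs
+now also move the originating conntrack entry to the closed state
+(the addresses and ports below are examples only):
+
+	# nftables: answer matching packets with a TCP RST
+	nft add rule inet filter input tcp dport 8080 reject with tcp reset
+
+	# legacy iptables equivalent
+	iptables -A INPUT -p tcp --dport 8080 -j REJECT --reject-with tcp-reset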
--- /dev/null
+From 3ce961662e11bb6747c87f0831db00013039dbc6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 7 Jan 2022 05:03:24 +0100
+Subject: netfilter: make function op structures const
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit 285c8a7a58158cb1805c97ff03875df2ba2ea1fe ]
+
+No functional changes, these structures should be const.
+
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Stable-dep-of: 62e7151ae3eb ("netfilter: bridge: confirm multicast packets before passing them up the stack")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/netfilter.h | 8 ++++----
+ net/netfilter/core.c | 10 +++++-----
+ net/netfilter/nf_conntrack_core.c | 4 ++--
+ net/netfilter/nf_conntrack_netlink.c | 4 ++--
+ net/netfilter/nf_nat_core.c | 2 +-
+ net/netfilter/nfnetlink_queue.c | 8 ++++----
+ 6 files changed, 18 insertions(+), 18 deletions(-)
+
+diff --git a/include/linux/netfilter.h b/include/linux/netfilter.h
+index 64acdf22eb4fa..5a665034c30be 100644
+--- a/include/linux/netfilter.h
++++ b/include/linux/netfilter.h
+@@ -376,13 +376,13 @@ struct nf_nat_hook {
+ enum ip_conntrack_dir dir);
+ };
+
+-extern struct nf_nat_hook __rcu *nf_nat_hook;
++extern const struct nf_nat_hook __rcu *nf_nat_hook;
+
+ static inline void
+ nf_nat_decode_session(struct sk_buff *skb, struct flowi *fl, u_int8_t family)
+ {
+ #if IS_ENABLED(CONFIG_NF_NAT)
+- struct nf_nat_hook *nat_hook;
++ const struct nf_nat_hook *nat_hook;
+
+ rcu_read_lock();
+ nat_hook = rcu_dereference(nf_nat_hook);
+@@ -459,7 +459,7 @@ struct nf_ct_hook {
+ const struct sk_buff *);
+ void (*attach)(struct sk_buff *nskb, const struct sk_buff *skb);
+ };
+-extern struct nf_ct_hook __rcu *nf_ct_hook;
++extern const struct nf_ct_hook __rcu *nf_ct_hook;
+
+ struct nlattr;
+
+@@ -474,7 +474,7 @@ struct nfnl_ct_hook {
+ void (*seq_adjust)(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo, s32 off);
+ };
+-extern struct nfnl_ct_hook __rcu *nfnl_ct_hook;
++extern const struct nfnl_ct_hook __rcu *nfnl_ct_hook;
+
+ /**
+ * nf_skb_duplicated - TEE target has sent a packet
+diff --git a/net/netfilter/core.c b/net/netfilter/core.c
+index 5396d27ba6a71..aa3f7d3228fda 100644
+--- a/net/netfilter/core.c
++++ b/net/netfilter/core.c
+@@ -632,14 +632,14 @@ EXPORT_SYMBOL(nf_hook_slow_list);
+ /* This needs to be compiled in any case to avoid dependencies between the
+ * nfnetlink_queue code and nf_conntrack.
+ */
+-struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
++const struct nfnl_ct_hook __rcu *nfnl_ct_hook __read_mostly;
+ EXPORT_SYMBOL_GPL(nfnl_ct_hook);
+
+-struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
++const struct nf_ct_hook __rcu *nf_ct_hook __read_mostly;
+ EXPORT_SYMBOL_GPL(nf_ct_hook);
+
+ #if IS_ENABLED(CONFIG_NF_CONNTRACK)
+-struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
++const struct nf_nat_hook __rcu *nf_nat_hook __read_mostly;
+ EXPORT_SYMBOL_GPL(nf_nat_hook);
+
+ /* This does not belong here, but locally generated errors need it if connection
+@@ -662,7 +662,7 @@ EXPORT_SYMBOL(nf_ct_attach);
+
+ void nf_conntrack_destroy(struct nf_conntrack *nfct)
+ {
+- struct nf_ct_hook *ct_hook;
++ const struct nf_ct_hook *ct_hook;
+
+ rcu_read_lock();
+ ct_hook = rcu_dereference(nf_ct_hook);
+@@ -677,7 +677,7 @@ EXPORT_SYMBOL(nf_conntrack_destroy);
+ bool nf_ct_get_tuple_skb(struct nf_conntrack_tuple *dst_tuple,
+ const struct sk_buff *skb)
+ {
+- struct nf_ct_hook *ct_hook;
++ const struct nf_ct_hook *ct_hook;
+ bool ret = false;
+
+ rcu_read_lock();
+diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
+index 779e41d1afdce..2a4222eefc894 100644
+--- a/net/netfilter/nf_conntrack_core.c
++++ b/net/netfilter/nf_conntrack_core.c
+@@ -2145,9 +2145,9 @@ static int __nf_conntrack_update(struct net *net, struct sk_buff *skb,
+ struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+ {
++ const struct nf_nat_hook *nat_hook;
+ struct nf_conntrack_tuple_hash *h;
+ struct nf_conntrack_tuple tuple;
+- struct nf_nat_hook *nat_hook;
+ unsigned int status;
+ int dataoff;
+ u16 l3num;
+@@ -2833,7 +2833,7 @@ int nf_conntrack_init_start(void)
+ return ret;
+ }
+
+-static struct nf_ct_hook nf_conntrack_hook = {
++static const struct nf_ct_hook nf_conntrack_hook = {
+ .update = nf_conntrack_update,
+ .destroy = nf_ct_destroy,
+ .get_tuple_skb = nf_conntrack_get_tuple_skb,
+diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
+index c427f7625a3b5..1466015bc56dc 100644
+--- a/net/netfilter/nf_conntrack_netlink.c
++++ b/net/netfilter/nf_conntrack_netlink.c
+@@ -1816,7 +1816,7 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct,
+ const struct nlattr *attr)
+ __must_hold(RCU)
+ {
+- struct nf_nat_hook *nat_hook;
++ const struct nf_nat_hook *nat_hook;
+ int err;
+
+ nat_hook = rcu_dereference(nf_nat_hook);
+@@ -2922,7 +2922,7 @@ static void ctnetlink_glue_seqadj(struct sk_buff *skb, struct nf_conn *ct,
+ nf_ct_tcp_seqadj_set(skb, ct, ctinfo, diff);
+ }
+
+-static struct nfnl_ct_hook ctnetlink_glue_hook = {
++static const struct nfnl_ct_hook ctnetlink_glue_hook = {
+ .build_size = ctnetlink_glue_build_size,
+ .build = ctnetlink_glue_build,
+ .parse = ctnetlink_glue_parse,
+diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
+index 2731176839228..b776b3af78ca2 100644
+--- a/net/netfilter/nf_nat_core.c
++++ b/net/netfilter/nf_nat_core.c
+@@ -1120,7 +1120,7 @@ static struct pernet_operations nat_net_ops = {
+ .size = sizeof(struct nat_net),
+ };
+
+-static struct nf_nat_hook nat_hook = {
++static const struct nf_nat_hook nat_hook = {
+ .parse_nat_setup = nfnetlink_parse_nat_setup,
+ #ifdef CONFIG_XFRM
+ .decode_session = __nf_nat_decode_session,
+diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
+index f4468ef3d0a94..8c96e01f6a023 100644
+--- a/net/netfilter/nfnetlink_queue.c
++++ b/net/netfilter/nfnetlink_queue.c
+@@ -225,7 +225,7 @@ find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
+
+ static void nfqnl_reinject(struct nf_queue_entry *entry, unsigned int verdict)
+ {
+- struct nf_ct_hook *ct_hook;
++ const struct nf_ct_hook *ct_hook;
+ int err;
+
+ if (verdict == NF_ACCEPT ||
+@@ -388,7 +388,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
+ struct net_device *outdev;
+ struct nf_conn *ct = NULL;
+ enum ip_conntrack_info ctinfo = 0;
+- struct nfnl_ct_hook *nfnl_ct;
++ const struct nfnl_ct_hook *nfnl_ct;
+ bool csum_verify;
+ char *secdata = NULL;
+ u32 seclen = 0;
+@@ -1115,7 +1115,7 @@ static int nfqnl_recv_verdict_batch(struct sk_buff *skb,
+ return 0;
+ }
+
+-static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct,
++static struct nf_conn *nfqnl_ct_parse(const struct nfnl_ct_hook *nfnl_ct,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[],
+ struct nf_queue_entry *entry,
+@@ -1182,11 +1182,11 @@ static int nfqnl_recv_verdict(struct sk_buff *skb, const struct nfnl_info *info,
+ {
+ struct nfnl_queue_net *q = nfnl_queue_pernet(info->net);
+ u_int16_t queue_num = ntohs(info->nfmsg->res_id);
++ const struct nfnl_ct_hook *nfnl_ct;
+ struct nfqnl_msg_verdict_hdr *vhdr;
+ enum ip_conntrack_info ctinfo;
+ struct nfqnl_instance *queue;
+ struct nf_queue_entry *entry;
+- struct nfnl_ct_hook *nfnl_ct;
+ struct nf_conn *ct = NULL;
+ unsigned int verdict;
+ int err;
+--
+2.43.0
+
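+Constifying these structures is more than style: a const, statically
+initialized struct of function pointers is placed in read-only memory,
+so the pointers cannot be retargeted at runtime by a stray write or an
+attacker. A tiny freestanding illustration (not kernel code):
+
+	struct ops {
+		void (*fn)(void);
+	};
+
+	static void real_fn(void) { }
+
+	/* Lands in .rodata; writing to my_ops.fn at runtime would fault. */
+	static const struct ops my_ops = { .fn = real_fn };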
--- /dev/null
+From 35cc9aa0daa8cad7022e1c2a6d76448d4d912e79 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 22 Feb 2024 10:33:08 +0000
+Subject: netfilter: nf_tables: allow NFPROTO_INET in
+ nft_(match/target)_validate()
+
+From: Ignat Korchagin <ignat@cloudflare.com>
+
+[ Upstream commit 7e0f122c65912740327e4c54472acaa5f85868cb ]
+
+Commit d0009effa886 ("netfilter: nf_tables: validate NFPROTO_* family") added
+some validation of NFPROTO_* families in the nft_compat module, but it broke
+the ability to use legacy iptables modules in dual-stack nftables.
+
+While with legacy iptables one had to independently manage IPv4 and IPv6
+tables, with nftables it is possible to have dual-stack tables sharing the
+rules. Moreover, it was possible to use rules based on legacy iptables
+match/target modules in dual-stack nftables.
+
+As an example, the program from [2] creates an INET dual-stack family table
+using an xt_bpf based rule, which looks like the following (the actual output
+was generated with a patched nft tool as the current nft tool does not parse
+dual stack tables with legacy match rules, so consider it for illustrative
+purposes only):
+
+table inet testfw {
+ chain input {
+ type filter hook prerouting priority filter; policy accept;
+ bytecode counter packets 0 bytes 0 accept
+ }
+}
+
+After d0009effa886 ("netfilter: nf_tables: validate NFPROTO_* family") we get
+EOPNOTSUPP for the above program.
+
+Fix this by allowing NFPROTO_INET for nft_(match/target)_validate(), but also
+restrict the functions to classic iptables hooks.
+
+Changes in v3:
+ * clarify that upstream nft will not display such configuration properly and
+ that the output was generated with a patched nft tool
+ * remove example program from commit description and link to it instead
+ * no code changes otherwise
+
+Changes in v2:
+ * restrict nft_(match/target)_validate() to classic iptables hooks
+ * rewrite example program to use unmodified libnftnl
+
+Fixes: d0009effa886 ("netfilter: nf_tables: validate NFPROTO_* family")
+Link: https://lore.kernel.org/all/Zc1PfoWN38UuFJRI@calendula/T/#mc947262582c90fec044c7a3398cc92fac7afea72 [1]
+Link: https://lore.kernel.org/all/20240220145509.53357-1-ignat@cloudflare.com/ [2]
+Reported-by: Jordan Griege <jgriege@cloudflare.com>
+Signed-off-by: Ignat Korchagin <ignat@cloudflare.com>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nft_compat.c | 20 ++++++++++++++++++++
+ 1 file changed, 20 insertions(+)
+
+diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
+index 64a2a5f195896..aee046e00bfaf 100644
+--- a/net/netfilter/nft_compat.c
++++ b/net/netfilter/nft_compat.c
+@@ -358,10 +358,20 @@ static int nft_target_validate(const struct nft_ctx *ctx,
+
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET &&
+ ctx->family != NFPROTO_BRIDGE &&
+ ctx->family != NFPROTO_ARP)
+ return -EOPNOTSUPP;
+
++ ret = nft_chain_validate_hooks(ctx->chain,
++ (1 << NF_INET_PRE_ROUTING) |
++ (1 << NF_INET_LOCAL_IN) |
++ (1 << NF_INET_FORWARD) |
++ (1 << NF_INET_LOCAL_OUT) |
++ (1 << NF_INET_POST_ROUTING));
++ if (ret)
++ return ret;
++
+ if (nft_is_base_chain(ctx->chain)) {
+ const struct nft_base_chain *basechain =
+ nft_base_chain(ctx->chain);
+@@ -607,10 +617,20 @@ static int nft_match_validate(const struct nft_ctx *ctx,
+
+ if (ctx->family != NFPROTO_IPV4 &&
+ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET &&
+ ctx->family != NFPROTO_BRIDGE &&
+ ctx->family != NFPROTO_ARP)
+ return -EOPNOTSUPP;
+
++ ret = nft_chain_validate_hooks(ctx->chain,
++ (1 << NF_INET_PRE_ROUTING) |
++ (1 << NF_INET_LOCAL_IN) |
++ (1 << NF_INET_FORWARD) |
++ (1 << NF_INET_LOCAL_OUT) |
++ (1 << NF_INET_POST_ROUTING));
++ if (ret)
++ return ret;
++
+ if (nft_is_base_chain(ctx->chain)) {
+ const struct nft_base_chain *basechain =
+ nft_base_chain(ctx->chain);
+--
+2.43.0
+
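+A dual-stack setup that this fix restores might be created as follows,
+with an inet-family base chain on one of the classic hooks (the table
+and chain names are illustrative; the xt-based expression itself would
+be injected via libnftnl as in [2]):
+
+	nft add table inet testfw
+	nft add chain inet testfw input \
+		'{ type filter hook prerouting priority filter; policy accept; }'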
--- /dev/null
+From c2690174b5a63513a27ed277b2ffae498151a2db Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 26 Nov 2021 13:04:03 +0100
+Subject: netfilter: nfnetlink_queue: silence bogus compiler warning
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit b43c2793f5e9910862e8fe07846b74e45b104501 ]
+
+net/netfilter/nfnetlink_queue.c:601:36: warning: variable 'ctinfo' is
+uninitialized when used here [-Wuninitialized]
+ if (ct && nfnl_ct->build(skb, ct, ctinfo, NFQA_CT, NFQA_CT_INFO) < 0)
+
+ctinfo is only uninitialized if ct == NULL. Init it to 0 to silence this.
+
+Reported-by: kernel test robot <lkp@intel.com>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Stable-dep-of: 62e7151ae3eb ("netfilter: bridge: confirm multicast packets before passing them up the stack")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nfnetlink_queue.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
+index 5329ebf19a18b..f4468ef3d0a94 100644
+--- a/net/netfilter/nfnetlink_queue.c
++++ b/net/netfilter/nfnetlink_queue.c
+@@ -387,7 +387,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
+ struct net_device *indev;
+ struct net_device *outdev;
+ struct nf_conn *ct = NULL;
+- enum ip_conntrack_info ctinfo;
++ enum ip_conntrack_info ctinfo = 0;
+ struct nfnl_ct_hook *nfnl_ct;
+ bool csum_verify;
+ char *secdata = NULL;
+--
+2.43.0
+
--- /dev/null
+From 237b2b023ee1bef92716db824b94220a99b18a0a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 21 Feb 2024 16:40:48 +0900
+Subject: netlink: Fix kernel-infoleak-after-free in __skb_datagram_iter
+
+From: Ryosuke Yasuoka <ryasuoka@redhat.com>
+
+[ Upstream commit 661779e1fcafe1b74b3f3fe8e980c1e207fea1fd ]
+
+syzbot reported the following uninit-value access issue [1]:
+
+netlink_to_full_skb() creates a new `skb` and copies the `skb->data` of
+its first argument onto the new `skb`. The copy size is specified as
+`len` and passed to skb_put_data(). This `len` is derived from
+`skb->end`, which is a buffer offset, not a data offset: the area up to
+`skb->end` contains both data and tailroom. Since the tailroom is not
+initialized when the new `skb` is created, KMSAN detects an
+uninitialized memory area when the data is copied.
+
+This patch resolves the issue by changing the length from `skb->end` to
+`skb->len`, which is the actual amount of data.
+
+BUG: KMSAN: kernel-infoleak-after-free in instrument_copy_to_user include/linux/instrumented.h:114 [inline]
+BUG: KMSAN: kernel-infoleak-after-free in copy_to_user_iter lib/iov_iter.c:24 [inline]
+BUG: KMSAN: kernel-infoleak-after-free in iterate_ubuf include/linux/iov_iter.h:29 [inline]
+BUG: KMSAN: kernel-infoleak-after-free in iterate_and_advance2 include/linux/iov_iter.h:245 [inline]
+BUG: KMSAN: kernel-infoleak-after-free in iterate_and_advance include/linux/iov_iter.h:271 [inline]
+BUG: KMSAN: kernel-infoleak-after-free in _copy_to_iter+0x364/0x2520 lib/iov_iter.c:186
+ instrument_copy_to_user include/linux/instrumented.h:114 [inline]
+ copy_to_user_iter lib/iov_iter.c:24 [inline]
+ iterate_ubuf include/linux/iov_iter.h:29 [inline]
+ iterate_and_advance2 include/linux/iov_iter.h:245 [inline]
+ iterate_and_advance include/linux/iov_iter.h:271 [inline]
+ _copy_to_iter+0x364/0x2520 lib/iov_iter.c:186
+ copy_to_iter include/linux/uio.h:197 [inline]
+ simple_copy_to_iter+0x68/0xa0 net/core/datagram.c:532
+ __skb_datagram_iter+0x123/0xdc0 net/core/datagram.c:420
+ skb_copy_datagram_iter+0x5c/0x200 net/core/datagram.c:546
+ skb_copy_datagram_msg include/linux/skbuff.h:3960 [inline]
+ packet_recvmsg+0xd9c/0x2000 net/packet/af_packet.c:3482
+ sock_recvmsg_nosec net/socket.c:1044 [inline]
+ sock_recvmsg net/socket.c:1066 [inline]
+ sock_read_iter+0x467/0x580 net/socket.c:1136
+ call_read_iter include/linux/fs.h:2014 [inline]
+ new_sync_read fs/read_write.c:389 [inline]
+ vfs_read+0x8f6/0xe00 fs/read_write.c:470
+ ksys_read+0x20f/0x4c0 fs/read_write.c:613
+ __do_sys_read fs/read_write.c:623 [inline]
+ __se_sys_read fs/read_write.c:621 [inline]
+ __x64_sys_read+0x93/0xd0 fs/read_write.c:621
+ do_syscall_x64 arch/x86/entry/common.c:52 [inline]
+ do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83
+ entry_SYSCALL_64_after_hwframe+0x63/0x6b
+
+Uninit was stored to memory at:
+ skb_put_data include/linux/skbuff.h:2622 [inline]
+ netlink_to_full_skb net/netlink/af_netlink.c:181 [inline]
+ __netlink_deliver_tap_skb net/netlink/af_netlink.c:298 [inline]
+ __netlink_deliver_tap+0x5be/0xc90 net/netlink/af_netlink.c:325
+ netlink_deliver_tap net/netlink/af_netlink.c:338 [inline]
+ netlink_deliver_tap_kernel net/netlink/af_netlink.c:347 [inline]
+ netlink_unicast_kernel net/netlink/af_netlink.c:1341 [inline]
+ netlink_unicast+0x10f1/0x1250 net/netlink/af_netlink.c:1368
+ netlink_sendmsg+0x1238/0x13d0 net/netlink/af_netlink.c:1910
+ sock_sendmsg_nosec net/socket.c:730 [inline]
+ __sock_sendmsg net/socket.c:745 [inline]
+ ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2584
+ ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2638
+ __sys_sendmsg net/socket.c:2667 [inline]
+ __do_sys_sendmsg net/socket.c:2676 [inline]
+ __se_sys_sendmsg net/socket.c:2674 [inline]
+ __x64_sys_sendmsg+0x307/0x490 net/socket.c:2674
+ do_syscall_x64 arch/x86/entry/common.c:52 [inline]
+ do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83
+ entry_SYSCALL_64_after_hwframe+0x63/0x6b
+
+Uninit was created at:
+ free_pages_prepare mm/page_alloc.c:1087 [inline]
+ free_unref_page_prepare+0xb0/0xa40 mm/page_alloc.c:2347
+ free_unref_page_list+0xeb/0x1100 mm/page_alloc.c:2533
+ release_pages+0x23d3/0x2410 mm/swap.c:1042
+ free_pages_and_swap_cache+0xd9/0xf0 mm/swap_state.c:316
+ tlb_batch_pages_flush mm/mmu_gather.c:98 [inline]
+ tlb_flush_mmu_free mm/mmu_gather.c:293 [inline]
+ tlb_flush_mmu+0x6f5/0x980 mm/mmu_gather.c:300
+ tlb_finish_mmu+0x101/0x260 mm/mmu_gather.c:392
+ exit_mmap+0x49e/0xd30 mm/mmap.c:3321
+ __mmput+0x13f/0x530 kernel/fork.c:1349
+ mmput+0x8a/0xa0 kernel/fork.c:1371
+ exit_mm+0x1b8/0x360 kernel/exit.c:567
+ do_exit+0xd57/0x4080 kernel/exit.c:858
+ do_group_exit+0x2fd/0x390 kernel/exit.c:1021
+ __do_sys_exit_group kernel/exit.c:1032 [inline]
+ __se_sys_exit_group kernel/exit.c:1030 [inline]
+ __x64_sys_exit_group+0x3c/0x50 kernel/exit.c:1030
+ do_syscall_x64 arch/x86/entry/common.c:52 [inline]
+ do_syscall_64+0x44/0x110 arch/x86/entry/common.c:83
+ entry_SYSCALL_64_after_hwframe+0x63/0x6b
+
+Bytes 3852-3903 of 3904 are uninitialized
+Memory access of size 3904 starts at ffff88812ea1e000
+Data copied to user address 0000000020003280
+
+CPU: 1 PID: 5043 Comm: syz-executor297 Not tainted 6.7.0-rc5-syzkaller-00047-g5bd7ef53ffe5 #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/10/2023
+
+Fixes: 1853c9496460 ("netlink, mmap: transform mmap skb into full skb on taps")
+Reported-and-tested-by: syzbot+34ad5fab48f7bf510349@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=34ad5fab48f7bf510349 [1]
+Signed-off-by: Ryosuke Yasuoka <ryasuoka@redhat.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20240221074053.1794118-1-ryasuoka@redhat.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netlink/af_netlink.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
+index 2169a9c3da1c3..82df02695bbdd 100644
+--- a/net/netlink/af_netlink.c
++++ b/net/netlink/af_netlink.c
+@@ -165,7 +165,7 @@ static inline u32 netlink_group_mask(u32 group)
+ static struct sk_buff *netlink_to_full_skb(const struct sk_buff *skb,
+ gfp_t gfp_mask)
+ {
+- unsigned int len = skb_end_offset(skb);
++ unsigned int len = skb->len;
+ struct sk_buff *new;
+
+ new = alloc_skb(len, gfp_mask);
+--
+2.43.0
+
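+The distinction at the heart of this fix: skb->len is the number of
+valid data bytes, while skb_end_offset() is skb->end - skb->head, i.e.
+data plus uninitialized tailroom. A minimal sketch of the corrected
+copy (illustrative, not the full netlink_to_full_skb()):
+
+	static struct sk_buff *copy_data_only(const struct sk_buff *skb,
+					      gfp_t gfp_mask)
+	{
+		/* Size by valid data, not skb_end_offset(), so no
+		 * uninitialized tailroom is ever copied out. */
+		struct sk_buff *new = alloc_skb(skb->len, gfp_mask);
+
+		if (!new)
+			return NULL;
+		skb_put_data(new, skb->data, skb->len);
+		return new;
+	}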
--- /dev/null
+From 1fa93963ef55c9a02940494d60aa163c7e3b573e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 15 Feb 2024 16:51:33 +0100
+Subject: power: supply: bq27xxx-i2c: Do not free non existing IRQ
+
+From: Hans de Goede <hdegoede@redhat.com>
+
+[ Upstream commit 2df70149e73e79783bcbc7db4fa51ecef0e2022c ]
+
+The bq27xxx i2c-client may not have an IRQ, in which case
+client->irq will be 0. bq27xxx_battery_i2c_probe() already has
+an if (client->irq) check wrapping the request_threaded_irq().
+
+But bq27xxx_battery_i2c_remove() unconditionally calls
+free_irq(client->irq) leading to:
+
+[ 190.310742] ------------[ cut here ]------------
+[ 190.310843] Trying to free already-free IRQ 0
+[ 190.310861] WARNING: CPU: 2 PID: 1304 at kernel/irq/manage.c:1893 free_irq+0x1b8/0x310
+
+Followed by a backtrace when unbinding the driver. Add
+an if (client->irq) to bq27xxx_battery_i2c_remove() mirroring
+probe() to fix this.
+
+Fixes: 444ff00734f3 ("power: supply: bq27xxx: Fix I2C IRQ race on remove")
+Signed-off-by: Hans de Goede <hdegoede@redhat.com>
+Link: https://lore.kernel.org/r/20240215155133.70537-1-hdegoede@redhat.com
+Signed-off-by: Sebastian Reichel <sebastian.reichel@collabora.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/power/supply/bq27xxx_battery_i2c.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/power/supply/bq27xxx_battery_i2c.c b/drivers/power/supply/bq27xxx_battery_i2c.c
+index b722ee2d7e142..4e5d773b3bf8d 100644
+--- a/drivers/power/supply/bq27xxx_battery_i2c.c
++++ b/drivers/power/supply/bq27xxx_battery_i2c.c
+@@ -209,7 +209,9 @@ static int bq27xxx_battery_i2c_remove(struct i2c_client *client)
+ {
+ struct bq27xxx_device_info *di = i2c_get_clientdata(client);
+
+- free_irq(client->irq, di);
++ if (client->irq)
++ free_irq(client->irq, di);
++
+ bq27xxx_battery_teardown(di);
+
+ mutex_lock(&battery_mutex);
+--
+2.43.0
+
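+Probe and remove must mirror each other here. A condensed sketch of the
+now-symmetric pair (simplified from the driver; identifiers such as the
+threaded handler name are taken on trust, and error handling is
+omitted):
+
+	/* probe: the IRQ is optional */
+	if (client->irq)
+		ret = request_threaded_irq(client->irq, NULL,
+					   bq27xxx_battery_irq_handler_thread,
+					   IRQF_ONESHOT, di->name, di);
+
+	/* remove: only free what was actually requested */
+	if (client->irq)
+		free_irq(client->irq, di);
+
+An alternative would be devm_request_threaded_irq() in probe, which
+drops the manual free_irq() entirely, assuming no ordering constraints
+against the rest of the teardown.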
--- /dev/null
+From 24732c67b0a17642fdcd5bd6aa3dd8e80c03f671 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 29 Feb 2024 21:17:23 +0200
+Subject: riscv: Sparse-Memory/vmemmap out-of-bounds fix
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Dimitris Vlachos <dvlachos@ics.forth.gr>
+
+[ Upstream commit a11dd49dcb9376776193e15641f84fcc1e5980c9 ]
+
+Offset vmemmap so that the first page of vmemmap will be mapped
+to the first page of physical memory in order to ensure that
+vmemmap’s bounds will be respected during
+pfn_to_page()/page_to_pfn() operations.
+The conversion macros will produce correct SV39/48/57 addresses
+for every possible/valid DRAM_BASE inside the physical memory limits.
+
+v2: Address Alex's comments
+
+Suggested-by: Alexandre Ghiti <alexghiti@rivosinc.com>
+Signed-off-by: Dimitris Vlachos <dvlachos@ics.forth.gr>
+Reported-by: Dimitris Vlachos <dvlachos@ics.forth.gr>
+Closes: https://lore.kernel.org/linux-riscv/20240202135030.42265-1-csd4492@csd.uoc.gr
+Fixes: d95f1a542c3d ("RISC-V: Implement sparsemem")
+Reviewed-by: Alexandre Ghiti <alexghiti@rivosinc.com>
+Link: https://lore.kernel.org/r/20240229191723.32779-1-dvlachos@ics.forth.gr
+Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/riscv/include/asm/pgtable.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h
+index 397cb945b16eb..9a3d9b68f2ff4 100644
+--- a/arch/riscv/include/asm/pgtable.h
++++ b/arch/riscv/include/asm/pgtable.h
+@@ -58,7 +58,7 @@
+ * Define vmemmap for pfn_to_page & page_to_pfn calls. Needed if kernel
+ * is configured with CONFIG_SPARSEMEM_VMEMMAP enabled.
+ */
+-#define vmemmap ((struct page *)VMEMMAP_START)
++#define vmemmap ((struct page *)VMEMMAP_START - (phys_ram_base >> PAGE_SHIFT))
+
+ #define PCI_IO_SIZE SZ_16M
+ #define PCI_IO_END VMEMMAP_START
+--
+2.43.0
+
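+A worked example with illustrative values: assume PAGE_SHIFT == 12 and
+phys_ram_base == 0x80000000. The definition is then equivalent to
+
+	vmemmap = (struct page *)VMEMMAP_START - 0x80000;
+
+	/* first DRAM page: pfn = 0x80000000 >> 12 = 0x80000 */
+	struct page *first = vmemmap + 0x80000;	/* == VMEMMAP_START */
+
+so pfn_to_page() of the first physical page lands exactly on the first
+page of the vmemmap region, and no address below VMEMMAP_START is ever
+formed.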
--- /dev/null
+From ac5cea766854656b623a23a5d1370b9ea54d0fce Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 27 Feb 2024 20:11:28 +0800
+Subject: rtnetlink: fix error logic of IFLA_BRIDGE_FLAGS writing back
+
+From: Lin Ma <linma@zju.edu.cn>
+
+[ Upstream commit 743ad091fb46e622f1b690385bb15e3cd3daf874 ]
+
+In the commit d73ef2d69c0d ("rtnetlink: let rtnl_bridge_setlink checks
+IFLA_BRIDGE_MODE length"), an adjustment was made to the old loop logic
+in the function `rtnl_bridge_setlink` to enable the loop to also check
+the length of the IFLA_BRIDGE_MODE attribute. However, this adjustment
+removed the `break` statement and broke the logic that writes the
+flags back at the end of this function.
+
+if (have_flags)
+ memcpy(nla_data(attr), &flags, sizeof(flags));
+ // attr should point to IFLA_BRIDGE_FLAGS NLA !!!
+
+Before the mentioned commit, `attr` was guaranteed to be IFLA_BRIDGE_FLAGS.
+However, this is no longer necessarily true, as the updated loop lets
+`attr` point to the last NLA, even an invalid one, which could cause
+out-of-bounds writes.
+
+This patch introduces a new variable `br_flag` to save the NLA pointer
+that points to IFLA_BRIDGE_FLAGS and uses it to resolve the mentioned
+error logic.
+
+Fixes: d73ef2d69c0d ("rtnetlink: let rtnl_bridge_setlink checks IFLA_BRIDGE_MODE length")
+Signed-off-by: Lin Ma <linma@zju.edu.cn>
+Acked-by: Nikolay Aleksandrov <razor@blackwall.org>
+Link: https://lore.kernel.org/r/20240227121128.608110-1-linma@zju.edu.cn
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/core/rtnetlink.c | 11 +++++------
+ 1 file changed, 5 insertions(+), 6 deletions(-)
+
+diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
+index 1b71e5c582bbc..ef218e290dfba 100644
+--- a/net/core/rtnetlink.c
++++ b/net/core/rtnetlink.c
+@@ -4925,10 +4925,9 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ struct net *net = sock_net(skb->sk);
+ struct ifinfomsg *ifm;
+ struct net_device *dev;
+- struct nlattr *br_spec, *attr = NULL;
++ struct nlattr *br_spec, *attr, *br_flags_attr = NULL;
+ int rem, err = -EOPNOTSUPP;
+ u16 flags = 0;
+- bool have_flags = false;
+
+ if (nlmsg_len(nlh) < sizeof(*ifm))
+ return -EINVAL;
+@@ -4946,11 +4945,11 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ br_spec = nlmsg_find_attr(nlh, sizeof(struct ifinfomsg), IFLA_AF_SPEC);
+ if (br_spec) {
+ nla_for_each_nested(attr, br_spec, rem) {
+- if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !have_flags) {
++ if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) {
+ if (nla_len(attr) < sizeof(flags))
+ return -EINVAL;
+
+- have_flags = true;
++ br_flags_attr = attr;
+ flags = nla_get_u16(attr);
+ }
+
+@@ -4994,8 +4993,8 @@ static int rtnl_bridge_setlink(struct sk_buff *skb, struct nlmsghdr *nlh,
+ }
+ }
+
+- if (have_flags)
+- memcpy(nla_data(attr), &flags, sizeof(flags));
++ if (br_flags_attr)
++ memcpy(nla_data(br_flags_attr), &flags, sizeof(flags));
+ out:
+ return err;
+ }
+--
+2.43.0
+
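+The fixed pattern in miniature: remember the attribute pointer at the
+moment it is validated, and write back only through that saved pointer
+(a sketch of the loop, not the full function):
+
+	struct nlattr *attr, *br_flags_attr = NULL;
+	u16 flags = 0;
+	int rem;
+
+	nla_for_each_nested(attr, br_spec, rem) {
+		if (nla_type(attr) == IFLA_BRIDGE_FLAGS && !br_flags_attr) {
+			if (nla_len(attr) < sizeof(flags))
+				return -EINVAL;
+			br_flags_attr = attr;	/* validated right here */
+			flags = nla_get_u16(attr);
+		}
+	}
+	/* ... handlers may update flags ... */
+	if (br_flags_attr)	/* never the loop's final attr */
+		memcpy(nla_data(br_flags_attr), &flags, sizeof(flags));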
--- /dev/null
+netfilter-nf_tables-disallow-timeout-for-anonymous-sets.patch
+mtd-spinand-gigadevice-fix-the-get-ecc-status-issue.patch
+netlink-fix-kernel-infoleak-after-free-in-__skb_data.patch
+net-ip_tunnel-prevent-perpetual-headroom-growth.patch
+tun-fix-xdp_rxq_info-s-queue_index-when-detaching.patch
+cpufreq-intel_pstate-fix-pstate-limits-enforcement-f.patch
+net-veth-clear-gro-when-clearing-xdp-even-when-down.patch
+ipv6-fix-potential-struct-net-leak-in-inet6_rtm_geta.patch
+lan78xx-enable-auto-speed-configuration-for-lan7850-.patch
+net-enable-memcg-accounting-for-veth-queues.patch
+veth-try-harder-when-allocating-queue-memory.patch
+net-usb-dm9601-fix-wrong-return-value-in-dm9601_mdio.patch
+uapi-in6-replace-temporary-label-with-rfc9486.patch
+stmmac-clear-variable-when-destroying-workqueue.patch
+bluetooth-avoid-potential-use-after-free-in-hci_erro.patch
+bluetooth-hci_event-fix-wrongly-recorded-wakeup-bd_a.patch
+bluetooth-hci_event-fix-handling-of-hci_ev_io_capa_r.patch
+bluetooth-enforce-validation-on-max-value-of-connect.patch
+netfilter-nf_tables-allow-nfproto_inet-in-nft_-match.patch
+netfilter-nfnetlink_queue-silence-bogus-compiler-war.patch
+netfilter-core-move-ip_ct_attach-indirection-to-stru.patch
+netfilter-make-function-op-structures-const.patch
+netfilter-let-reset-rules-clean-out-conntrack-entrie.patch
+netfilter-bridge-confirm-multicast-packets-before-pa.patch
+rtnetlink-fix-error-logic-of-ifla_bridge_flags-writi.patch
+igb-extend-ptp-timestamp-adjustments-to-i211.patch
+tls-rx-don-t-store-the-record-type-in-socket-context.patch
+tls-rx-don-t-store-the-decryption-status-in-socket-c.patch
+tls-rx-don-t-issue-wake-ups-when-data-is-decrypted.patch
+tls-rx-refactor-decrypt_skb_update.patch
+tls-hw-rx-use-return-value-of-tls_device_decrypted-t.patch
+tls-rx-drop-unnecessary-arguments-from-tls_setup_fro.patch
+tls-rx-don-t-report-text-length-from-the-bowels-of-d.patch
+tls-rx-wrap-decryption-arguments-in-a-structure.patch
+tls-rx-factor-out-writing-contenttype-to-cmsg.patch
+tls-rx-don-t-track-the-async-count.patch
+tls-rx-move-counting-tlsdecrypterrors-for-sync.patch
+tls-rx-assume-crypto-always-calls-our-callback.patch
+tls-rx-use-async-as-an-in-out-argument.patch
+tls-decrement-decrypt_pending-if-no-async-completion.patch
+efi-capsule-loader-fix-incorrect-allocation-size.patch
+power-supply-bq27xxx-i2c-do-not-free-non-existing-ir.patch
+alsa-drop-leftover-snd-rtctimer-stuff-from-makefile.patch
+fbcon-always-restore-the-old-font-data-in-fbcon_do_s.patch
+afs-fix-endless-loop-in-directory-parsing.patch
+riscv-sparse-memory-vmemmap-out-of-bounds-fix.patch
--- /dev/null
+From dea96a3b376af58a1727fb971f53406a7c2d1e67 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 26 Feb 2024 17:42:32 +0100
+Subject: stmmac: Clear variable when destroying workqueue
+
+From: Jakub Raczynski <j.raczynski@samsung.com>
+
+[ Upstream commit 8af411bbba1f457c33734795f024d0ef26d0963f ]
+
+Currently, when suspending the driver and stopping the workqueue, it is
+checked whether the workqueue is not NULL and, if so, it is destroyed.
+destroy_workqueue() drains and frees the queue, but it does not set the
+workqueue variable to NULL. This can cause a kernel/module panic if
+code later attempts to destroy a workqueue that was never reinitialized.
+
+This scenario is possible when resuming a suspended driver in
+stmmac_resume(), because there is no handling for a failed
+stmmac_hw_setup(), which can fail and return if the DMA engine fails to
+initialize; the workqueue is initialized after the DMA engine.
+Should the DMA engine fail to initialize, resume will proceed normally,
+but the interface won't work and the TX queue will eventually time out,
+causing a 'Reset adapter' error. The reset process then destroys the
+workqueue, and since the workqueue is initialized after the DMA engine
+and can thus have been skipped, this causes a kernel/module panic.
+
+To secure against this possible crash, set workqueue variable to NULL when
+destroying workqueue.
+
+Log/backtrace from crash goes as follows:
+[88.031977]------------[ cut here ]------------
+[88.031985]NETDEV WATCHDOG: eth0 (sxgmac): transmit queue 1 timed out
+[88.032017]WARNING: CPU: 0 PID: 0 at net/sched/sch_generic.c:477 dev_watchdog+0x390/0x398
+ <Skipping backtrace for watchdog timeout>
+[88.032251]---[ end trace e70de432e4d5c2c0 ]---
+[88.032282]sxgmac 16d88000.ethernet eth0: Reset adapter.
+[88.036359]------------[ cut here ]------------
+[88.036519]Call trace:
+[88.036523] flush_workqueue+0x3e4/0x430
+[88.036528] drain_workqueue+0xc4/0x160
+[88.036533] destroy_workqueue+0x40/0x270
+[88.036537] stmmac_fpe_stop_wq+0x4c/0x70
+[88.036541] stmmac_release+0x278/0x280
+[88.036546] __dev_close_many+0xcc/0x158
+[88.036551] dev_close_many+0xbc/0x190
+[88.036555] dev_close.part.0+0x70/0xc0
+[88.036560] dev_close+0x24/0x30
+[88.036564] stmmac_service_task+0x110/0x140
+[88.036569] process_one_work+0x1d8/0x4a0
+[88.036573] worker_thread+0x54/0x408
+[88.036578] kthread+0x164/0x170
+[88.036583] ret_from_fork+0x10/0x20
+[88.036588]---[ end trace e70de432e4d5c2c1 ]---
+[88.036597]Unable to handle kernel NULL pointer dereference at virtual address 0000000000000004
+
+Fixes: 5a5586112b929 ("net: stmmac: support FPE link partner hand-shaking procedure")
+Signed-off-by: Jakub Raczynski <j.raczynski@samsung.com>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+index a1c1e353ca072..b0ab8f6986f8b 100644
+--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
++++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+@@ -3825,8 +3825,10 @@ static void stmmac_fpe_stop_wq(struct stmmac_priv *priv)
+ {
+ set_bit(__FPE_REMOVING, &priv->fpe_task_state);
+
+- if (priv->fpe_wq)
++ if (priv->fpe_wq) {
+ destroy_workqueue(priv->fpe_wq);
++ priv->fpe_wq = NULL;
++ }
+
+ netdev_info(priv->dev, "FPE workqueue stop");
+ }
+--
+2.43.0
+
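+The underlying idiom generalizes: when a teardown path may run more
+than once, destroy the resource and NULL the pointer together. A hedged
+helper sketch (not part of the patch):
+
+	static inline void destroy_workqueue_once(struct workqueue_struct **wq)
+	{
+		if (*wq) {
+			destroy_workqueue(*wq);
+			*wq = NULL;	/* a second call is now a no-op */
+		}
+	}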
--- /dev/null
+From 1ff365db9e76d0b1fa3372803386e06c4e2e34bf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 28 Feb 2024 23:43:57 +0100
+Subject: tls: decrement decrypt_pending if no async completion will be called
+
+From: Sabrina Dubroca <sd@queasysnail.net>
+
+[ Upstream commit f7fa16d49837f947ee59492958f9e6f0e51d9a78 ]
+
+With mixed sync/async decryption, or failures of crypto_aead_decrypt,
+we increment decrypt_pending but we never do the corresponding
+decrement since tls_decrypt_done will not be called. In this case, we
+should decrement decrypt_pending immediately to avoid getting stuck.
+
+For example, the prequeue test gets stuck with mixed
+modes (one async decrypt + one sync decrypt).
+
+Fixes: 94524d8fc965 ("net/tls: Add support for async decryption of tls records")
+Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
+Link: https://lore.kernel.org/r/c56d5fc35543891d5319f834f25622360e1bfbec.1709132643.git.sd@queasysnail.net
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls_sw.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index a1a99f9f093b1..83319a3b8bdd1 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -273,6 +273,8 @@ static int tls_do_decryption(struct sock *sk,
+ return 0;
+
+ ret = crypto_wait_req(ret, &ctx->async_wait);
++ } else if (darg->async) {
++ atomic_dec(&ctx->decrypt_pending);
+ }
+ darg->async = false;
+
+--
+2.43.0
+
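+The invariant being restored: every increment of decrypt_pending must
+be matched by exactly one decrement, either in tls_decrypt_done() when
+the async callback runs, or inline when no callback will come. In
+outline (a sketch of the control flow, not the exact function):
+
+	atomic_inc(&ctx->decrypt_pending);	/* async path only */
+	ret = crypto_aead_decrypt(aead_req);
+	if (ret == -EINPROGRESS) {
+		if (darg->async)
+			return 0;	/* tls_decrypt_done() decrements */
+		ret = crypto_wait_req(ret, &ctx->async_wait);
+	} else if (darg->async) {
+		/* completed (or failed) synchronously: no callback,
+		 * so drop the pending count right here */
+		atomic_dec(&ctx->decrypt_pending);
+	}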
--- /dev/null
+From ee9dec9a54e9e842b9958aa991a0e679e73f8e98 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Apr 2022 20:38:23 -0700
+Subject: tls: hw: rx: use return value of tls_device_decrypted() to carry
+ status
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 71471ca32505afa7c3f7f6a8268716e1ddb81cd4 ]
+
+Instead of tls_device poking into internals of the message
+return 1 from tls_device_decrypted() if the device handled
+the decryption.
+
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: f7fa16d49837 ("tls: decrement decrypt_pending if no async completion will be called")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls_device.c | 7 ++-----
+ net/tls/tls_sw.c | 5 ++---
+ 2 files changed, 4 insertions(+), 8 deletions(-)
+
+diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
+index f23d18e666284..e7c361807590d 100644
+--- a/net/tls/tls_device.c
++++ b/net/tls/tls_device.c
+@@ -936,7 +936,6 @@ int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx,
+ struct sk_buff *skb, struct strp_msg *rxm)
+ {
+ struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx);
+- struct tls_msg *tlm = tls_msg(skb);
+ int is_decrypted = skb->decrypted;
+ int is_encrypted = !is_decrypted;
+ struct sk_buff *skb_iter;
+@@ -951,11 +950,9 @@ int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx,
+ tls_ctx->rx.rec_seq, rxm->full_len,
+ is_encrypted, is_decrypted);
+
+- tlm->decrypted |= is_decrypted;
+-
+ if (unlikely(test_bit(TLS_RX_DEV_DEGRADED, &tls_ctx->flags))) {
+ if (likely(is_encrypted || is_decrypted))
+- return 0;
++ return is_decrypted;
+
+ /* After tls_device_down disables the offload, the next SKB will
+ * likely have initial fragments decrypted, and final ones not
+@@ -970,7 +967,7 @@ int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx,
+ */
+ if (is_decrypted) {
+ ctx->resync_nh_reset = 1;
+- return 0;
++ return is_decrypted;
+ }
+ if (is_encrypted) {
+ tls_device_core_ctrl_rx_resync(tls_ctx, ctx, sk, skb);
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index 7da17dd7c38b9..eed32ef3ca4a0 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1571,9 +1571,8 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ err = tls_device_decrypted(sk, tls_ctx, skb, rxm);
+ if (err < 0)
+ return err;
+-
+- /* skip SW decryption if NIC handled it already */
+- if (tlm->decrypted) {
++ if (err > 0) {
++ tlm->decrypted = 1;
+ *zc = false;
+ goto decrypt_done;
+ }
+--
+2.43.0
+
--- /dev/null
+From 4d8900bf4b8d44dab517788ebb440b614a6a719c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Apr 2022 12:19:12 -0700
+Subject: tls: rx: assume crypto always calls our callback
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 1c699ffa48a15710746989c36a82cbfb07e8d17f ]
+
+If crypto didn't always invoke our callback for async requests,
+we'd not be clearing skb->sk and would crash in the
+skb core when freeing it. This if must be dead code.
+
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: f7fa16d49837 ("tls: decrement decrypt_pending if no async completion will be called")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls_sw.c | 3 ---
+ 1 file changed, 3 deletions(-)
+
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index 85fa49170b4e5..27ac27daec868 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -277,9 +277,6 @@ static int tls_do_decryption(struct sock *sk,
+ if (ret == -EBADMSG)
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR);
+
+- if (async)
+- atomic_dec(&ctx->decrypt_pending);
+-
+ return ret;
+ }
+
+--
+2.43.0
+
--- /dev/null
+From b89682750d6bee89bbcf232970f6d5770424ad76 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Apr 2022 20:38:21 -0700
+Subject: tls: rx: don't issue wake ups when data is decrypted
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 5dbda02d322db7762f1a0348117cde913fb46c13 ]
+
+We inform the applications that data is available when
+the record is received. Decryption happens inline inside
+recvmsg or splice call. Generating another wakeup inside
+the decryption handler seems pointless as someone must
+be actively reading the socket if we are executing this
+code.
+
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: f7fa16d49837 ("tls: decrement decrypt_pending if no async completion will be called")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls_sw.c | 2 --
+ 1 file changed, 2 deletions(-)
+
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index 0a6630bbef53e..5fdc4f5193ee5 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1557,7 +1557,6 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ bool async)
+ {
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+- struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
+ struct strp_msg *rxm = strp_msg(skb);
+ struct tls_msg *tlm = tls_msg(skb);
+@@ -1596,7 +1595,6 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ rxm->full_len -= prot->overhead_size;
+ tls_advance_record_sn(sk, prot, &tls_ctx->rx);
+ tlm->decrypted = 1;
+- ctx->saved_data_ready(sk);
+ } else {
+ *zc = false;
+ }
+--
+2.43.0
+
--- /dev/null
+From c4bd2ea6944d8b91a240bb02187ea60feec48ead Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Apr 2022 11:31:25 -0700
+Subject: tls: rx: don't report text length from the bowels of decrypt
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 9bdf75ccffa690237cd0b472cd598cf6d22873dc ]
+
+We plumb a pointer to chunk all the way to the decryption method.
+It's set to the length of the text when decrypt_skb_update()
+returns.
+
+I think the code is written this way because the original TLS
+implementation passed &chunk to zerocopy_from_iter() and this
+was carried forward as the code got more complex, without
+any refactoring.
+
+The fix for peek() introduced a new variable - to_decrypt
+which for all practical purposes is what chunk is going to
+get set to. Spare ourselves the pointer passing, use to_decrypt.
+
+Use this opportunity to clean things up a little further.
+
+Note that chunk / to_decrypt was mostly needed for the async
+path, since the sync path would access rxm->full_len (decryption
+transforms full_len from record size to text size). Use the
+right source of truth more explicitly.
+
+We have three cases:
+ - async - it's TLS 1.2 only, so chunk == to_decrypt, but we
+ need the min() because to_decrypt is a whole record
+ and we don't want to underflow len. Note that we can't
+ handle partial record by falling back to sync as it
+ would introduce reordering against records in flight.
+ - zc - again, TLS 1.2 only for now, so chunk == to_decrypt,
+ we don't do zc if len < to_decrypt, no need to check again.
+ - normal - it already handles chunk > len, we can factor out the
+ assignment to rxm->full_len and share it with zc.
+
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: f7fa16d49837 ("tls: decrement decrypt_pending if no async completion will be called")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls_sw.c | 33 ++++++++++++++-------------------
+ 1 file changed, 14 insertions(+), 19 deletions(-)
+
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index cf09f147f5a09..fc1fa98d21937 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1415,7 +1415,7 @@ static int tls_setup_from_iter(struct iov_iter *from,
+ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
+ struct iov_iter *out_iov,
+ struct scatterlist *out_sg,
+- int *chunk, bool *zc, bool async)
++ bool *zc, bool async)
+ {
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+@@ -1522,7 +1522,6 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
+ (n_sgout - 1));
+ if (err < 0)
+ goto fallback_to_reg_recv;
+- *chunk = data_len;
+ } else if (out_sg) {
+ memcpy(sgout, out_sg, n_sgout * sizeof(*sgout));
+ } else {
+@@ -1532,7 +1531,6 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
+ fallback_to_reg_recv:
+ sgout = sgin;
+ pages = 0;
+- *chunk = data_len;
+ *zc = false;
+ }
+
+@@ -1551,8 +1549,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
+ }
+
+ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+- struct iov_iter *dest, int *chunk, bool *zc,
+- bool async)
++ struct iov_iter *dest, bool *zc, bool async)
+ {
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
+@@ -1576,7 +1573,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ }
+ }
+
+- err = decrypt_internal(sk, skb, dest, NULL, chunk, zc, async);
++ err = decrypt_internal(sk, skb, dest, NULL, zc, async);
+ if (err < 0) {
+ if (err == -EINPROGRESS)
+ tls_advance_record_sn(sk, prot, &tls_ctx->rx);
+@@ -1603,9 +1600,8 @@ int decrypt_skb(struct sock *sk, struct sk_buff *skb,
+ struct scatterlist *sgout)
+ {
+ bool zc = true;
+- int chunk;
+
+- return decrypt_internal(sk, skb, NULL, sgout, &chunk, &zc, false);
++ return decrypt_internal(sk, skb, NULL, sgout, &zc, false);
+ }
+
+ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
+@@ -1795,9 +1791,8 @@ int tls_sw_recvmsg(struct sock *sk,
+ num_async = 0;
+ while (len && (decrypted + copied < target || ctx->recv_pkt)) {
+ bool retain_skb = false;
++ int to_decrypt, chunk;
+ bool zc = false;
+- int to_decrypt;
+- int chunk = 0;
+ bool async_capable;
+ bool async = false;
+
+@@ -1834,7 +1829,7 @@ int tls_sw_recvmsg(struct sock *sk,
+ async_capable = false;
+
+ err = decrypt_skb_update(sk, skb, &msg->msg_iter,
+- &chunk, &zc, async_capable);
++ &zc, async_capable);
+ if (err < 0 && err != -EINPROGRESS) {
+ tls_err_abort(sk, -EBADMSG);
+ goto recv_end;
+@@ -1872,8 +1867,13 @@ int tls_sw_recvmsg(struct sock *sk,
+ }
+ }
+
+- if (async)
++ if (async) {
++ /* TLS 1.2-only, to_decrypt must be text length */
++ chunk = min_t(int, to_decrypt, len);
+ goto pick_next_record;
++ }
++ /* TLS 1.3 may have updated the length by more than overhead */
++ chunk = rxm->full_len;
+
+ if (!zc) {
+ if (bpf_strp_enabled) {
+@@ -1889,11 +1889,9 @@ int tls_sw_recvmsg(struct sock *sk,
+ }
+ }
+
+- if (rxm->full_len > len) {
++ if (chunk > len) {
+ retain_skb = true;
+ chunk = len;
+- } else {
+- chunk = rxm->full_len;
+ }
+
+ err = skb_copy_datagram_msg(skb, rxm->offset,
+@@ -1908,9 +1906,6 @@ int tls_sw_recvmsg(struct sock *sk,
+ }
+
+ pick_next_record:
+- if (chunk > len)
+- chunk = len;
+-
+ decrypted += chunk;
+ len -= chunk;
+
+@@ -2011,7 +2006,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
+ if (!skb)
+ goto splice_read_end;
+
+- err = decrypt_skb_update(sk, skb, NULL, &chunk, &zc, false);
++ err = decrypt_skb_update(sk, skb, NULL, &zc, false);
+ if (err < 0) {
+ tls_err_abort(sk, -EBADMSG);
+ goto splice_read_end;
+--
+2.43.0
+
--- /dev/null
+From 6262e20920b50d8cc830964df259ff18380046d2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Apr 2022 20:38:17 -0700
+Subject: tls: rx: don't store the decryption status in socket context
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 7dc59c33d62c4520a119051d4486c214ef5caa23 ]
+
+Similar justification to the previous change: the information
+about the decryption status belongs in the skb.
+
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: f7fa16d49837 ("tls: decrement decrypt_pending if no async completion will be called")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/strparser.h | 1 +
+ include/net/tls.h | 1 -
+ net/tls/tls_device.c | 3 ++-
+ net/tls/tls_sw.c | 10 ++++++----
+ 4 files changed, 9 insertions(+), 6 deletions(-)
+
+diff --git a/include/net/strparser.h b/include/net/strparser.h
+index c271543076cf8..a191486eb1e4c 100644
+--- a/include/net/strparser.h
++++ b/include/net/strparser.h
+@@ -72,6 +72,7 @@ struct sk_skb_cb {
+ u64 temp_reg;
+ struct tls_msg {
+ u8 control;
++ u8 decrypted;
+ } tls;
+ };
+
+diff --git a/include/net/tls.h b/include/net/tls.h
+index 24c1b718ceacc..ea0aeae26cf76 100644
+--- a/include/net/tls.h
++++ b/include/net/tls.h
+@@ -147,7 +147,6 @@ struct tls_sw_context_rx {
+
+ struct sk_buff *recv_pkt;
+ u8 async_capable:1;
+- u8 decrypted:1;
+ atomic_t decrypt_pending;
+ /* protect crypto_wait with decrypt_pending*/
+ spinlock_t decrypt_compl_lock;
+diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c
+index 88785196a8966..f23d18e666284 100644
+--- a/net/tls/tls_device.c
++++ b/net/tls/tls_device.c
+@@ -936,6 +936,7 @@ int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx,
+ struct sk_buff *skb, struct strp_msg *rxm)
+ {
+ struct tls_offload_context_rx *ctx = tls_offload_ctx_rx(tls_ctx);
++ struct tls_msg *tlm = tls_msg(skb);
+ int is_decrypted = skb->decrypted;
+ int is_encrypted = !is_decrypted;
+ struct sk_buff *skb_iter;
+@@ -950,7 +951,7 @@ int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx,
+ tls_ctx->rx.rec_seq, rxm->full_len,
+ is_encrypted, is_decrypted);
+
+- ctx->sw.decrypted |= is_decrypted;
++ tlm->decrypted |= is_decrypted;
+
+ if (unlikely(test_bit(TLS_RX_DEV_DEGRADED, &tls_ctx->flags))) {
+ if (likely(is_encrypted || is_decrypted))
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index 82d7c9b036bc7..0a6630bbef53e 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1560,9 +1560,10 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
+ struct strp_msg *rxm = strp_msg(skb);
++ struct tls_msg *tlm = tls_msg(skb);
+ int pad, err = 0;
+
+- if (!ctx->decrypted) {
++ if (!tlm->decrypted) {
+ if (tls_ctx->rx_conf == TLS_HW) {
+ err = tls_device_decrypted(sk, tls_ctx, skb, rxm);
+ if (err < 0)
+@@ -1570,7 +1571,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ }
+
+ /* Still not decrypted after tls_device */
+- if (!ctx->decrypted) {
++ if (!tlm->decrypted) {
+ err = decrypt_internal(sk, skb, dest, NULL, chunk, zc,
+ async);
+ if (err < 0) {
+@@ -1594,7 +1595,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ rxm->offset += prot->prepend_size;
+ rxm->full_len -= prot->overhead_size;
+ tls_advance_record_sn(sk, prot, &tls_ctx->rx);
+- ctx->decrypted = 1;
++ tlm->decrypted = 1;
+ ctx->saved_data_ready(sk);
+ } else {
+ *zc = false;
+@@ -2137,8 +2138,9 @@ static void tls_queue(struct strparser *strp, struct sk_buff *skb)
+ {
+ struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
++ struct tls_msg *tlm = tls_msg(skb);
+
+- ctx->decrypted = 0;
++ tlm->decrypted = 0;
+
+ ctx->recv_pkt = skb;
+ strp_pause(strp);
+--
+2.43.0
+
--- /dev/null
+From f550346a37e8a8e8b3dc78a2e78d0efb3b9cac0a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Apr 2022 20:38:16 -0700
+Subject: tls: rx: don't store the record type in socket context
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit c3f6bb74137c68b515b7e2ff123a80611e801013 ]
+
+The original TLS implementation handled one record at a time.
+It stashed the type of the record inside tls context (per socket
+structure) for convenience. When async crypto support was added
+[1] the author had to use skb->cb to store the type per-message.
+
+The use of skb->cb overlaps with strparser, however, so a hybrid
+approach was taken where the type is stored in the context while
+parsing (since we parse a message at a time) but, once parsed, it's
+copied to skb->cb.
+
+Recently a workaround for sockmaps [2] exposed the previously
+private struct _strp_msg and started a trend of adding user
+fields directly in strparser's header. This is cleaner than
+storing information about an skb in the context.
+
+This change is not strictly necessary, but IMHO the ownership
+of the context field is confusing. Information naturally
+belongs to the skb.
+
+[1] commit 94524d8fc965 ("net/tls: Add support for async decryption of tls records")
+[2] commit b2c4618162ec ("bpf, sockmap: sk_skb data_end access incorrect when src_reg = dst_reg")
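+
+For orientation, a hedged sketch of the resulting access pattern (the
+real definitions and call sites are in the diff below):
+
+	struct tls_msg *tlm = tls_msg(skb);	/* state lives in skb->cb */
+
+	tlm->control = header[0];	/* parser: record type stays with the skb */
+
+	if (tlm->control != TLS_RECORD_TYPE_DATA)	/* readers: no ctx detour */
+		return -EINVAL;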
+
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: f7fa16d49837 ("tls: decrement decrypt_pending if no async completion will be called")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/strparser.h | 3 +++
+ include/net/tls.h | 10 +++-------
+ net/tls/tls_sw.c | 38 +++++++++++++++++---------------------
+ 3 files changed, 23 insertions(+), 28 deletions(-)
+
+diff --git a/include/net/strparser.h b/include/net/strparser.h
+index 732b7097d78e4..c271543076cf8 100644
+--- a/include/net/strparser.h
++++ b/include/net/strparser.h
+@@ -70,6 +70,9 @@ struct sk_skb_cb {
+ * when dst_reg == src_reg.
+ */
+ u64 temp_reg;
++ struct tls_msg {
++ u8 control;
++ } tls;
+ };
+
+ static inline struct strp_msg *strp_msg(struct sk_buff *skb)
+diff --git a/include/net/tls.h b/include/net/tls.h
+index eda0015c5c592..24c1b718ceacc 100644
+--- a/include/net/tls.h
++++ b/include/net/tls.h
+@@ -116,11 +116,6 @@ struct tls_rec {
+ u8 aead_req_ctx[];
+ };
+
+-struct tls_msg {
+- struct strp_msg rxm;
+- u8 control;
+-};
+-
+ struct tx_work {
+ struct delayed_work work;
+ struct sock *sk;
+@@ -151,7 +146,6 @@ struct tls_sw_context_rx {
+ void (*saved_data_ready)(struct sock *sk);
+
+ struct sk_buff *recv_pkt;
+- u8 control;
+ u8 async_capable:1;
+ u8 decrypted:1;
+ atomic_t decrypt_pending;
+@@ -410,7 +404,9 @@ void tls_free_partial_record(struct sock *sk, struct tls_context *ctx);
+
+ static inline struct tls_msg *tls_msg(struct sk_buff *skb)
+ {
+- return (struct tls_msg *)strp_msg(skb);
++ struct sk_skb_cb *scb = (struct sk_skb_cb *)skb->cb;
++
++ return &scb->tls;
+ }
+
+ static inline bool tls_is_partially_sent_record(struct tls_context *ctx)
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index e6f700f67c010..82d7c9b036bc7 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -128,10 +128,10 @@ static int skb_nsg(struct sk_buff *skb, int offset, int len)
+ return __skb_nsg(skb, offset, len, 0);
+ }
+
+-static int padding_length(struct tls_sw_context_rx *ctx,
+- struct tls_prot_info *prot, struct sk_buff *skb)
++static int padding_length(struct tls_prot_info *prot, struct sk_buff *skb)
+ {
+ struct strp_msg *rxm = strp_msg(skb);
++ struct tls_msg *tlm = tls_msg(skb);
+ int sub = 0;
+
+ /* Determine zero-padding length */
+@@ -153,7 +153,7 @@ static int padding_length(struct tls_sw_context_rx *ctx,
+ sub++;
+ back++;
+ }
+- ctx->control = content_type;
++ tlm->control = content_type;
+ }
+ return sub;
+ }
+@@ -187,7 +187,7 @@ static void tls_decrypt_done(struct crypto_async_request *req, int err)
+ struct strp_msg *rxm = strp_msg(skb);
+ int pad;
+
+- pad = padding_length(ctx, prot, skb);
++ pad = padding_length(prot, skb);
+ if (pad < 0) {
+ ctx->async_wait.err = pad;
+ tls_err_abort(skb->sk, pad);
+@@ -1423,6 +1423,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
+ struct strp_msg *rxm = strp_msg(skb);
++ struct tls_msg *tlm = tls_msg(skb);
+ int n_sgin, n_sgout, nsg, mem_size, aead_size, err, pages = 0;
+ struct aead_request *aead_req;
+ struct sk_buff *unused;
+@@ -1500,7 +1501,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
+ /* Prepare AAD */
+ tls_make_aad(aad, rxm->full_len - prot->overhead_size +
+ prot->tail_size,
+- tls_ctx->rx.rec_seq, ctx->control, prot);
++ tls_ctx->rx.rec_seq, tlm->control, prot);
+
+ /* Prepare sgin */
+ sg_init_table(sgin, n_sgin);
+@@ -1585,7 +1586,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ *zc = false;
+ }
+
+- pad = padding_length(ctx, prot, skb);
++ pad = padding_length(prot, skb);
+ if (pad < 0)
+ return pad;
+
+@@ -1817,26 +1818,21 @@ int tls_sw_recvmsg(struct sock *sk,
+ }
+ }
+ goto recv_end;
+- } else {
+- tlm = tls_msg(skb);
+- if (prot->version == TLS_1_3_VERSION)
+- tlm->control = 0;
+- else
+- tlm->control = ctx->control;
+ }
+
+ rxm = strp_msg(skb);
++ tlm = tls_msg(skb);
+
+ to_decrypt = rxm->full_len - prot->overhead_size;
+
+ if (to_decrypt <= len && !is_kvec && !is_peek &&
+- ctx->control == TLS_RECORD_TYPE_DATA &&
++ tlm->control == TLS_RECORD_TYPE_DATA &&
+ prot->version != TLS_1_3_VERSION &&
+ !bpf_strp_enabled)
+ zc = true;
+
+ /* Do not use async mode if record is non-data */
+- if (ctx->control == TLS_RECORD_TYPE_DATA && !bpf_strp_enabled)
++ if (tlm->control == TLS_RECORD_TYPE_DATA && !bpf_strp_enabled)
+ async_capable = ctx->async_capable;
+ else
+ async_capable = false;
+@@ -1851,8 +1847,6 @@ int tls_sw_recvmsg(struct sock *sk,
+ if (err == -EINPROGRESS) {
+ async = true;
+ num_async++;
+- } else if (prot->version == TLS_1_3_VERSION) {
+- tlm->control = ctx->control;
+ }
+
+ /* If the type of records being processed is not known yet,
+@@ -1999,6 +1993,7 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ struct strp_msg *rxm = NULL;
+ struct sock *sk = sock->sk;
++ struct tls_msg *tlm;
+ struct sk_buff *skb;
+ ssize_t copied = 0;
+ bool from_queue;
+@@ -2027,14 +2022,15 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
+ }
+ }
+
++ rxm = strp_msg(skb);
++ tlm = tls_msg(skb);
++
+ /* splice does not support reading control messages */
+- if (ctx->control != TLS_RECORD_TYPE_DATA) {
++ if (tlm->control != TLS_RECORD_TYPE_DATA) {
+ err = -EINVAL;
+ goto splice_read_end;
+ }
+
+- rxm = strp_msg(skb);
+-
+ chunk = min_t(unsigned int, rxm->full_len, len);
+ copied = skb_splice_bits(skb, sk, rxm->offset, pipe, chunk, flags);
+ if (copied < 0)
+@@ -2077,10 +2073,10 @@ bool tls_sw_sock_is_readable(struct sock *sk)
+ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
+ {
+ struct tls_context *tls_ctx = tls_get_ctx(strp->sk);
+- struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
+ char header[TLS_HEADER_SIZE + MAX_IV_SIZE];
+ struct strp_msg *rxm = strp_msg(skb);
++ struct tls_msg *tlm = tls_msg(skb);
+ size_t cipher_overhead;
+ size_t data_len = 0;
+ int ret;
+@@ -2101,7 +2097,7 @@ static int tls_read_size(struct strparser *strp, struct sk_buff *skb)
+ if (ret < 0)
+ goto read_failure;
+
+- ctx->control = header[0];
++ tlm->control = header[0];
+
+ data_len = ((header[4] & 0xFF) | (header[3] << 8));
+
+--
+2.43.0
+
--- /dev/null
+From aaacfbe1c45047270c216374c84a8149b10d630a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Apr 2022 11:31:30 -0700
+Subject: tls: rx: don't track the async count
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 7da18bcc5e4cfd14ea520367546c5697e64ae592 ]
+
+We track both if the last record was handled by async crypto
+and how many records were async. This is not necessary. We
+implicitly assume once crypto goes async it will stay that
+way, otherwise we'd reorder records. So just track if we're
+in async mode, the exact number of records is not necessary.
+
+This change also forces us into "async" mode more consistently
+in case crypto ever decides to interleave async and sync.
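+
+In sketch form (names as in the diff below), the flag is simply sticky
+across receive-loop iterations rather than a per-record count:
+
+	bool async = false;
+
+	while (len && (decrypted + copied < target || ctx->recv_pkt)) {
+		err = decrypt_skb_update(sk, skb, &msg->msg_iter, &darg);
+		if (err == -EINPROGRESS)
+			async = true;	/* once async, stay async */
+		/* ... */
+	}
+
+	if (async) {
+		/* wait for all previously submitted records to complete */
+	}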
+
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: f7fa16d49837 ("tls: decrement decrypt_pending if no async completion will be called")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls_sw.c | 12 +++++-------
+ 1 file changed, 5 insertions(+), 7 deletions(-)
+
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index ca71a9f559b37..d3bbae9af9f41 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1753,13 +1753,13 @@ int tls_sw_recvmsg(struct sock *sk,
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
+ struct sk_psock *psock;
+- int num_async, pending;
+ unsigned char control = 0;
+ ssize_t decrypted = 0;
+ struct strp_msg *rxm;
+ struct tls_msg *tlm;
+ struct sk_buff *skb;
+ ssize_t copied = 0;
++ bool async = false;
+ int target, err = 0;
+ long timeo;
+ bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
+@@ -1791,12 +1791,10 @@ int tls_sw_recvmsg(struct sock *sk,
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+ decrypted = 0;
+- num_async = 0;
+ while (len && (decrypted + copied < target || ctx->recv_pkt)) {
+ struct tls_decrypt_arg darg = {};
+ bool retain_skb = false;
+ int to_decrypt, chunk;
+- bool async;
+
+ skb = tls_wait_data(sk, psock, flags & MSG_DONTWAIT, timeo, &err);
+ if (!skb) {
+@@ -1836,10 +1834,8 @@ int tls_sw_recvmsg(struct sock *sk,
+ goto recv_end;
+ }
+
+- if (err == -EINPROGRESS) {
++ if (err == -EINPROGRESS)
+ async = true;
+- num_async++;
+- }
+
+ /* If the type of records being processed is not known yet,
+ * set it to record type just dequeued. If it is already known,
+@@ -1914,7 +1910,9 @@ int tls_sw_recvmsg(struct sock *sk,
+ }
+
+ recv_end:
+- if (num_async) {
++ if (async) {
++ int pending;
++
+ /* Wait for all previously submitted records to be decrypted */
+ spin_lock_bh(&ctx->decrypt_compl_lock);
+ ctx->async_notify = true;
+--
+2.43.0
+
--- /dev/null
+From 1a773bacd9783962656ab5cba2346b81f34697cc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Apr 2022 11:31:24 -0700
+Subject: tls: rx: drop unnecessary arguments from tls_setup_from_iter()
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit d4bd88e67666c73cfa9d75c282e708890d4f10a7 ]
+
+sk is unused; remove it to make it clear the function
+doesn't poke at the socket.
+
+size_used is always 0 on input and @length on success.
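+
+A hedged sketch of the simplified contract at the call site (as in the
+diff below): on success the whole @length was mapped, so the caller can
+record it directly:
+
+	err = tls_setup_from_iter(out_iov, data_len, &pages,
+				  &sgout[1], n_sgout - 1);
+	if (err < 0)
+		goto fallback_to_reg_recv;
+	*chunk = data_len;	/* previously accumulated via *size_used */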
+
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: f7fa16d49837 ("tls: decrement decrypt_pending if no async completion will be called")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls_sw.c | 14 ++++++--------
+ 1 file changed, 6 insertions(+), 8 deletions(-)
+
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index eed32ef3ca4a0..cf09f147f5a09 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1348,15 +1348,14 @@ static struct sk_buff *tls_wait_data(struct sock *sk, struct sk_psock *psock,
+ return skb;
+ }
+
+-static int tls_setup_from_iter(struct sock *sk, struct iov_iter *from,
++static int tls_setup_from_iter(struct iov_iter *from,
+ int length, int *pages_used,
+- unsigned int *size_used,
+ struct scatterlist *to,
+ int to_max_pages)
+ {
+ int rc = 0, i = 0, num_elem = *pages_used, maxpages;
+ struct page *pages[MAX_SKB_FRAGS];
+- unsigned int size = *size_used;
++ unsigned int size = 0;
+ ssize_t copied, use;
+ size_t offset;
+
+@@ -1399,8 +1398,7 @@ static int tls_setup_from_iter(struct sock *sk, struct iov_iter *from,
+ sg_mark_end(&to[num_elem - 1]);
+ out:
+ if (rc)
+- iov_iter_revert(from, size - *size_used);
+- *size_used = size;
++ iov_iter_revert(from, size);
+ *pages_used = num_elem;
+
+ return rc;
+@@ -1519,12 +1517,12 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
+ sg_init_table(sgout, n_sgout);
+ sg_set_buf(&sgout[0], aad, prot->aad_size);
+
+- *chunk = 0;
+- err = tls_setup_from_iter(sk, out_iov, data_len,
+- &pages, chunk, &sgout[1],
++ err = tls_setup_from_iter(out_iov, data_len,
++ &pages, &sgout[1],
+ (n_sgout - 1));
+ if (err < 0)
+ goto fallback_to_reg_recv;
++ *chunk = data_len;
+ } else if (out_sg) {
+ memcpy(sgout, out_sg, n_sgout * sizeof(*sgout));
+ } else {
+--
+2.43.0
+
--- /dev/null
+From 0691e263e075592a7610cea28a3a235650592b15 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Apr 2022 11:31:28 -0700
+Subject: tls: rx: factor out writing ContentType to cmsg
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 06554f4ffc2595ae52ee80aec4a13bd77d22bed7 ]
+
+cmsg can be filled in during rx_list processing or normal
+receive. Consolidate the code.
+
+We don't need to keep the boolean to track if the cmsg was
+created. 0 is an invalid content type.
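+
+The new helper (see the diff below) uses a tri-state return: negative on
+error, 0 when the record type differs from the one already being
+returned (stop, but not an error), and 1 to keep going. A usage sketch:
+
+	err = tls_record_content_type(msg, tlm, &control);
+	if (err <= 0)		/* error, or content type changed */
+		return err;
+	/* err == 1: first record, or same type; continue copying */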
+
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: f7fa16d49837 ("tls: decrement decrypt_pending if no async completion will be called")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls_sw.c | 91 +++++++++++++++++++-----------------------------
+ 1 file changed, 36 insertions(+), 55 deletions(-)
+
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index c491cde30504e..ca71a9f559b37 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1634,6 +1634,29 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
+ return true;
+ }
+
++static int tls_record_content_type(struct msghdr *msg, struct tls_msg *tlm,
++ u8 *control)
++{
++ int err;
++
++ if (!*control) {
++ *control = tlm->control;
++ if (!*control)
++ return -EBADMSG;
++
++ err = put_cmsg(msg, SOL_TLS, TLS_GET_RECORD_TYPE,
++ sizeof(*control), control);
++ if (*control != TLS_RECORD_TYPE_DATA) {
++ if (err || msg->msg_flags & MSG_CTRUNC)
++ return -EIO;
++ }
++ } else if (*control != tlm->control) {
++ return 0;
++ }
++
++ return 1;
++}
++
+ /* This function traverses the rx_list in tls receive context to copies the
+ * decrypted records into the buffer provided by caller zero copy is not
+ * true. Further, the records are removed from the rx_list if it is not a peek
+@@ -1642,31 +1665,23 @@ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
+ static int process_rx_list(struct tls_sw_context_rx *ctx,
+ struct msghdr *msg,
+ u8 *control,
+- bool *cmsg,
+ size_t skip,
+ size_t len,
+ bool zc,
+ bool is_peek)
+ {
+ struct sk_buff *skb = skb_peek(&ctx->rx_list);
+- u8 ctrl = *control;
+- u8 msgc = *cmsg;
+ struct tls_msg *tlm;
+ ssize_t copied = 0;
+-
+- /* Set the record type in 'control' if caller didn't pass it */
+- if (!ctrl && skb) {
+- tlm = tls_msg(skb);
+- ctrl = tlm->control;
+- }
++ int err;
+
+ while (skip && skb) {
+ struct strp_msg *rxm = strp_msg(skb);
+ tlm = tls_msg(skb);
+
+- /* Cannot process a record of different type */
+- if (ctrl != tlm->control)
+- return 0;
++ err = tls_record_content_type(msg, tlm, control);
++ if (err <= 0)
++ return err;
+
+ if (skip < rxm->full_len)
+ break;
+@@ -1682,27 +1697,12 @@ static int process_rx_list(struct tls_sw_context_rx *ctx,
+
+ tlm = tls_msg(skb);
+
+- /* Cannot process a record of different type */
+- if (ctrl != tlm->control)
+- return 0;
+-
+- /* Set record type if not already done. For a non-data record,
+- * do not proceed if record type could not be copied.
+- */
+- if (!msgc) {
+- int cerr = put_cmsg(msg, SOL_TLS, TLS_GET_RECORD_TYPE,
+- sizeof(ctrl), &ctrl);
+- msgc = true;
+- if (ctrl != TLS_RECORD_TYPE_DATA) {
+- if (cerr || msg->msg_flags & MSG_CTRUNC)
+- return -EIO;
+-
+- *cmsg = msgc;
+- }
+- }
++ err = tls_record_content_type(msg, tlm, control);
++ if (err <= 0)
++ return err;
+
+ if (!zc || (rxm->full_len - skip) > len) {
+- int err = skb_copy_datagram_msg(skb, rxm->offset + skip,
++ err = skb_copy_datagram_msg(skb, rxm->offset + skip,
+ msg, chunk);
+ if (err < 0)
+ return err;
+@@ -1739,7 +1739,6 @@ static int process_rx_list(struct tls_sw_context_rx *ctx,
+ skb = next_skb;
+ }
+
+- *control = ctrl;
+ return copied;
+ }
+
+@@ -1761,7 +1760,6 @@ int tls_sw_recvmsg(struct sock *sk,
+ struct tls_msg *tlm;
+ struct sk_buff *skb;
+ ssize_t copied = 0;
+- bool cmsg = false;
+ int target, err = 0;
+ long timeo;
+ bool is_kvec = iov_iter_is_kvec(&msg->msg_iter);
+@@ -1778,8 +1776,7 @@ int tls_sw_recvmsg(struct sock *sk,
+ bpf_strp_enabled = sk_psock_strp_enabled(psock);
+
+ /* Process pending decrypted records. It must be non-zero-copy */
+- err = process_rx_list(ctx, msg, &control, &cmsg, 0, len, false,
+- is_peek);
++ err = process_rx_list(ctx, msg, &control, 0, len, false, is_peek);
+ if (err < 0) {
+ tls_err_abort(sk, err);
+ goto end;
+@@ -1851,26 +1848,10 @@ int tls_sw_recvmsg(struct sock *sk,
+ * is known just after record is dequeued from stream parser.
+ * For tls1.3, we disable async.
+ */
+-
+- if (!control)
+- control = tlm->control;
+- else if (control != tlm->control)
++ err = tls_record_content_type(msg, tlm, &control);
++ if (err <= 0)
+ goto recv_end;
+
+- if (!cmsg) {
+- int cerr;
+-
+- cerr = put_cmsg(msg, SOL_TLS, TLS_GET_RECORD_TYPE,
+- sizeof(control), &control);
+- cmsg = true;
+- if (control != TLS_RECORD_TYPE_DATA) {
+- if (cerr || msg->msg_flags & MSG_CTRUNC) {
+- err = -EIO;
+- goto recv_end;
+- }
+- }
+- }
+-
+ if (async) {
+ /* TLS 1.2-only, to_decrypt must be text length */
+ chunk = min_t(int, to_decrypt, len);
+@@ -1959,10 +1940,10 @@ int tls_sw_recvmsg(struct sock *sk,
+
+ /* Drain records from the rx_list & copy if required */
+ if (is_peek || is_kvec)
+- err = process_rx_list(ctx, msg, &control, &cmsg, copied,
++ err = process_rx_list(ctx, msg, &control, copied,
+ decrypted, false, is_peek);
+ else
+- err = process_rx_list(ctx, msg, &control, &cmsg, 0,
++ err = process_rx_list(ctx, msg, &control, 0,
+ decrypted, true, is_peek);
+ if (err < 0) {
+ tls_err_abort(sk, err);
+--
+2.43.0
+
--- /dev/null
+From c2fb73ea5bd49d8a9c033c57ad6b98af0a162093 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Apr 2022 12:19:10 -0700
+Subject: tls: rx: move counting TlsDecryptErrors for sync
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 284b4d93daee56dff3e10029ddf2e03227f50dbf ]
+
+Move counting TlsDecryptErrors to tls_do_decryption()
+where differences between sync and async crypto are
+reconciled.
+
+No functional changes, this code just always gave
+me pause.
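+
+A sketch of where the counter now sits (matching the diff below): right
+after crypto_wait_req() has resolved a synchronously-awaited request, so
+every decryption error reported in-line is counted in one place:
+
+	ret = crypto_aead_decrypt(aead_req);
+	if (ret == -EINPROGRESS) {
+		if (async)
+			return ret;	/* async completion handles its own errors */
+		ret = crypto_wait_req(ret, &ctx->async_wait);
+	}
+	if (ret == -EBADMSG)
+		TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR);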
+
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: f7fa16d49837 ("tls: decrement decrypt_pending if no async completion will be called")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls_sw.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index d3bbae9af9f41..85fa49170b4e5 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -274,6 +274,8 @@ static int tls_do_decryption(struct sock *sk,
+
+ ret = crypto_wait_req(ret, &ctx->async_wait);
+ }
++ if (ret == -EBADMSG)
++ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR);
+
+ if (async)
+ atomic_dec(&ctx->decrypt_pending);
+@@ -1583,8 +1585,6 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ if (err < 0) {
+ if (err == -EINPROGRESS)
+ tls_advance_record_sn(sk, prot, &tls_ctx->rx);
+- else if (err == -EBADMSG)
+- TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR);
+ return err;
+ }
+
+--
+2.43.0
+
--- /dev/null
+From a31e78e9ccb122c8276bfbc8343347a95e5e48af Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 7 Apr 2022 20:38:22 -0700
+Subject: tls: rx: refactor decrypt_skb_update()
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 3764ae5ba6615095de86698a00e814513b9ad0d5 ]
+
+Use early return and a jump label to remove two indentation levels.
+No functional changes.
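+
+In miniature, a hedged sketch of the new shape (not the full function):
+
+	if (tlm->decrypted) {
+		*zc = false;
+		return 0;			/* early return drops one level */
+	}
+
+	if (tls_ctx->rx_conf == TLS_HW) {
+		/* ... */
+		if (tlm->decrypted) {
+			*zc = false;
+			goto decrypt_done;	/* jump label drops the other */
+		}
+	}
+
+	err = decrypt_internal(sk, skb, dest, NULL, chunk, zc, async);
+	/* ... error handling ... */
+decrypt_done:
+	/* strip padding, advance offsets and the record sequence number */
+	return 0;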
+
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: f7fa16d49837 ("tls: decrement decrypt_pending if no async completion will be called")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls_sw.c | 66 ++++++++++++++++++++++++------------------------
+ 1 file changed, 33 insertions(+), 33 deletions(-)
+
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index 5fdc4f5193ee5..7da17dd7c38b9 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -1560,46 +1560,46 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
+ struct strp_msg *rxm = strp_msg(skb);
+ struct tls_msg *tlm = tls_msg(skb);
+- int pad, err = 0;
++ int pad, err;
+
+- if (!tlm->decrypted) {
+- if (tls_ctx->rx_conf == TLS_HW) {
+- err = tls_device_decrypted(sk, tls_ctx, skb, rxm);
+- if (err < 0)
+- return err;
+- }
++ if (tlm->decrypted) {
++ *zc = false;
++ return 0;
++ }
+
+- /* Still not decrypted after tls_device */
+- if (!tlm->decrypted) {
+- err = decrypt_internal(sk, skb, dest, NULL, chunk, zc,
+- async);
+- if (err < 0) {
+- if (err == -EINPROGRESS)
+- tls_advance_record_sn(sk, prot,
+- &tls_ctx->rx);
+- else if (err == -EBADMSG)
+- TLS_INC_STATS(sock_net(sk),
+- LINUX_MIB_TLSDECRYPTERROR);
+- return err;
+- }
+- } else {
++ if (tls_ctx->rx_conf == TLS_HW) {
++ err = tls_device_decrypted(sk, tls_ctx, skb, rxm);
++ if (err < 0)
++ return err;
++
++ /* skip SW decryption if NIC handled it already */
++ if (tlm->decrypted) {
+ *zc = false;
++ goto decrypt_done;
+ }
++ }
+
+- pad = padding_length(prot, skb);
+- if (pad < 0)
+- return pad;
+-
+- rxm->full_len -= pad;
+- rxm->offset += prot->prepend_size;
+- rxm->full_len -= prot->overhead_size;
+- tls_advance_record_sn(sk, prot, &tls_ctx->rx);
+- tlm->decrypted = 1;
+- } else {
+- *zc = false;
++ err = decrypt_internal(sk, skb, dest, NULL, chunk, zc, async);
++ if (err < 0) {
++ if (err == -EINPROGRESS)
++ tls_advance_record_sn(sk, prot, &tls_ctx->rx);
++ else if (err == -EBADMSG)
++ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR);
++ return err;
+ }
+
+- return err;
++decrypt_done:
++ pad = padding_length(prot, skb);
++ if (pad < 0)
++ return pad;
++
++ rxm->full_len -= pad;
++ rxm->offset += prot->prepend_size;
++ rxm->full_len -= prot->overhead_size;
++ tls_advance_record_sn(sk, prot, &tls_ctx->rx);
++ tlm->decrypted = 1;
++
++ return 0;
+ }
+
+ int decrypt_skb(struct sock *sk, struct sk_buff *skb,
+--
+2.43.0
+
--- /dev/null
+From bda007d9121ca97630a5bd3ffede8021d4c75177 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 11 Apr 2022 12:19:15 -0700
+Subject: tls: rx: use async as an in-out argument
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 3547a1f9d988d88ecff4fc365d2773037c849f49 ]
+
+Propagating EINPROGRESS through multiple layers of functions is
+error-prone. Use darg->async as an in/out argument, like we
+use darg->zc today. On input it tells the code if async is
+allowed, on output if it took place.
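+
+A hedged sketch of the resulting call site (as in the diff below):
+
+	struct tls_decrypt_arg darg = {};
+
+	darg.async = ctx->async_capable;	/* in: may crypto go async? */
+	err = decrypt_skb_update(sk, skb, &msg->msg_iter, &darg);
+	if (err < 0) {
+		tls_err_abort(sk, -EBADMSG);
+		goto recv_end;
+	}
+	async |= darg.async;			/* out: did it go async? */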
+
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: f7fa16d49837 ("tls: decrement decrypt_pending if no async completion will be called")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls_sw.c | 31 ++++++++++++++++---------------
+ 1 file changed, 16 insertions(+), 15 deletions(-)
+
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index 27ac27daec868..a1a99f9f093b1 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -236,7 +236,7 @@ static int tls_do_decryption(struct sock *sk,
+ char *iv_recv,
+ size_t data_len,
+ struct aead_request *aead_req,
+- bool async)
++ struct tls_decrypt_arg *darg)
+ {
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
+@@ -249,7 +249,7 @@ static int tls_do_decryption(struct sock *sk,
+ data_len + prot->tag_size,
+ (u8 *)iv_recv);
+
+- if (async) {
++ if (darg->async) {
+ /* Using skb->sk to push sk through to crypto async callback
+ * handler. This allows propagating errors up to the socket
+ * if needed. It _must_ be cleared in the async handler
+@@ -269,11 +269,13 @@ static int tls_do_decryption(struct sock *sk,
+
+ ret = crypto_aead_decrypt(aead_req);
+ if (ret == -EINPROGRESS) {
+- if (async)
+- return ret;
++ if (darg->async)
++ return 0;
+
+ ret = crypto_wait_req(ret, &ctx->async_wait);
+ }
++ darg->async = false;
++
+ if (ret == -EBADMSG)
+ TLS_INC_STATS(sock_net(sk), LINUX_MIB_TLSDECRYPTERROR);
+
+@@ -1540,9 +1542,9 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
+
+ /* Prepare and submit AEAD request */
+ err = tls_do_decryption(sk, skb, sgin, sgout, iv,
+- data_len, aead_req, darg->async);
+- if (err == -EINPROGRESS)
+- return err;
++ data_len, aead_req, darg);
++ if (darg->async)
++ return 0;
+
+ /* Release the pages in case iov was mapped to pages */
+ for (; pages > 0; pages--)
+@@ -1579,11 +1581,10 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ }
+
+ err = decrypt_internal(sk, skb, dest, NULL, darg);
+- if (err < 0) {
+- if (err == -EINPROGRESS)
+- tls_advance_record_sn(sk, prot, &tls_ctx->rx);
++ if (err < 0)
+ return err;
+- }
++ if (darg->async)
++ goto decrypt_next;
+
+ decrypt_done:
+ pad = padding_length(prot, skb);
+@@ -1593,8 +1594,9 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ rxm->full_len -= pad;
+ rxm->offset += prot->prepend_size;
+ rxm->full_len -= prot->overhead_size;
+- tls_advance_record_sn(sk, prot, &tls_ctx->rx);
+ tlm->decrypted = 1;
++decrypt_next:
++ tls_advance_record_sn(sk, prot, &tls_ctx->rx);
+
+ return 0;
+ }
+@@ -1826,13 +1828,12 @@ int tls_sw_recvmsg(struct sock *sk,
+ darg.async = false;
+
+ err = decrypt_skb_update(sk, skb, &msg->msg_iter, &darg);
+- if (err < 0 && err != -EINPROGRESS) {
++ if (err < 0) {
+ tls_err_abort(sk, -EBADMSG);
+ goto recv_end;
+ }
+
+- if (err == -EINPROGRESS)
+- async = true;
++ async |= darg.async;
+
+ /* If the type of records being processed is not known yet,
+ * set it to record type just dequeued. If it is already known,
+--
+2.43.0
+
--- /dev/null
+From 890569dace1deb9afd771b5fb88d07455426c5b3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 8 Apr 2022 11:31:26 -0700
+Subject: tls: rx: wrap decryption arguments in a structure
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 4175eac37123a68ebee71f288826339fb89bfec7 ]
+
+We pass zc as a pointer to bool a few functions down as an in/out
+argument. This is error-prone since C will happily evaluate a pointer
+as a boolean (IOW forgetting *zc and writing zc leads to loss of
+developer time). Wrap the arguments into a structure.
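+
+An illustration of the bug class (hypothetical code, not from this
+patch): both spellings compile, but only one is right:
+
+	static void decrypt(struct sk_buff *skb, bool *zc)
+	{
+		if (zc) {	/* bug: tests the pointer, always true here */
+			/* ... */
+		}
+		if (*zc) {	/* intended: tests the flag itself */
+			/* ... */
+		}
+	}
+
+With struct tls_decrypt_arg the flag is always spelled darg->zc, so the
+pointer/flag confusion can no longer slip through silently.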
+
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: f7fa16d49837 ("tls: decrement decrypt_pending if no async completion will be called")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tls/tls_sw.c | 49 ++++++++++++++++++++++++++----------------------
+ 1 file changed, 27 insertions(+), 22 deletions(-)
+
+diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
+index fc1fa98d21937..c491cde30504e 100644
+--- a/net/tls/tls_sw.c
++++ b/net/tls/tls_sw.c
+@@ -44,6 +44,11 @@
+ #include <net/strparser.h>
+ #include <net/tls.h>
+
++struct tls_decrypt_arg {
++ bool zc;
++ bool async;
++};
++
+ noinline void tls_err_abort(struct sock *sk, int err)
+ {
+ WARN_ON_ONCE(err >= 0);
+@@ -1415,7 +1420,7 @@ static int tls_setup_from_iter(struct iov_iter *from,
+ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
+ struct iov_iter *out_iov,
+ struct scatterlist *out_sg,
+- bool *zc, bool async)
++ struct tls_decrypt_arg *darg)
+ {
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_sw_context_rx *ctx = tls_sw_ctx_rx(tls_ctx);
+@@ -1432,7 +1437,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
+ prot->tail_size;
+ int iv_offset = 0;
+
+- if (*zc && (out_iov || out_sg)) {
++ if (darg->zc && (out_iov || out_sg)) {
+ if (out_iov)
+ n_sgout = iov_iter_npages(out_iov, INT_MAX) + 1;
+ else
+@@ -1441,7 +1446,7 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
+ rxm->full_len - prot->prepend_size);
+ } else {
+ n_sgout = 0;
+- *zc = false;
++ darg->zc = false;
+ n_sgin = skb_cow_data(skb, 0, &unused);
+ }
+
+@@ -1531,12 +1536,12 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
+ fallback_to_reg_recv:
+ sgout = sgin;
+ pages = 0;
+- *zc = false;
++ darg->zc = false;
+ }
+
+ /* Prepare and submit AEAD request */
+ err = tls_do_decryption(sk, skb, sgin, sgout, iv,
+- data_len, aead_req, async);
++ data_len, aead_req, darg->async);
+ if (err == -EINPROGRESS)
+ return err;
+
+@@ -1549,7 +1554,8 @@ static int decrypt_internal(struct sock *sk, struct sk_buff *skb,
+ }
+
+ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+- struct iov_iter *dest, bool *zc, bool async)
++ struct iov_iter *dest,
++ struct tls_decrypt_arg *darg)
+ {
+ struct tls_context *tls_ctx = tls_get_ctx(sk);
+ struct tls_prot_info *prot = &tls_ctx->prot_info;
+@@ -1558,7 +1564,7 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ int pad, err;
+
+ if (tlm->decrypted) {
+- *zc = false;
++ darg->zc = false;
+ return 0;
+ }
+
+@@ -1568,12 +1574,12 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ return err;
+ if (err > 0) {
+ tlm->decrypted = 1;
+- *zc = false;
++ darg->zc = false;
+ goto decrypt_done;
+ }
+ }
+
+- err = decrypt_internal(sk, skb, dest, NULL, zc, async);
++ err = decrypt_internal(sk, skb, dest, NULL, darg);
+ if (err < 0) {
+ if (err == -EINPROGRESS)
+ tls_advance_record_sn(sk, prot, &tls_ctx->rx);
+@@ -1599,9 +1605,9 @@ static int decrypt_skb_update(struct sock *sk, struct sk_buff *skb,
+ int decrypt_skb(struct sock *sk, struct sk_buff *skb,
+ struct scatterlist *sgout)
+ {
+- bool zc = true;
++ struct tls_decrypt_arg darg = { .zc = true, };
+
+- return decrypt_internal(sk, skb, NULL, sgout, &zc, false);
++ return decrypt_internal(sk, skb, NULL, sgout, &darg);
+ }
+
+ static bool tls_sw_advance_skb(struct sock *sk, struct sk_buff *skb,
+@@ -1790,11 +1796,10 @@ int tls_sw_recvmsg(struct sock *sk,
+ decrypted = 0;
+ num_async = 0;
+ while (len && (decrypted + copied < target || ctx->recv_pkt)) {
++ struct tls_decrypt_arg darg = {};
+ bool retain_skb = false;
+ int to_decrypt, chunk;
+- bool zc = false;
+- bool async_capable;
+- bool async = false;
++ bool async;
+
+ skb = tls_wait_data(sk, psock, flags & MSG_DONTWAIT, timeo, &err);
+ if (!skb) {
+@@ -1820,16 +1825,15 @@ int tls_sw_recvmsg(struct sock *sk,
+ tlm->control == TLS_RECORD_TYPE_DATA &&
+ prot->version != TLS_1_3_VERSION &&
+ !bpf_strp_enabled)
+- zc = true;
++ darg.zc = true;
+
+ /* Do not use async mode if record is non-data */
+ if (tlm->control == TLS_RECORD_TYPE_DATA && !bpf_strp_enabled)
+- async_capable = ctx->async_capable;
++ darg.async = ctx->async_capable;
+ else
+- async_capable = false;
++ darg.async = false;
+
+- err = decrypt_skb_update(sk, skb, &msg->msg_iter,
+- &zc, async_capable);
++ err = decrypt_skb_update(sk, skb, &msg->msg_iter, &darg);
+ if (err < 0 && err != -EINPROGRESS) {
+ tls_err_abort(sk, -EBADMSG);
+ goto recv_end;
+@@ -1875,7 +1879,7 @@ int tls_sw_recvmsg(struct sock *sk,
+ /* TLS 1.3 may have updated the length by more than overhead */
+ chunk = rxm->full_len;
+
+- if (!zc) {
++ if (!darg.zc) {
+ if (bpf_strp_enabled) {
+ err = sk_psock_tls_strp_read(psock, skb);
+ if (err != __SK_PASS) {
+@@ -1991,7 +1995,6 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
+ int err = 0;
+ long timeo;
+ int chunk;
+- bool zc = false;
+
+ lock_sock(sk);
+
+@@ -2001,12 +2004,14 @@ ssize_t tls_sw_splice_read(struct socket *sock, loff_t *ppos,
+ if (from_queue) {
+ skb = __skb_dequeue(&ctx->rx_list);
+ } else {
++ struct tls_decrypt_arg darg = {};
++
+ skb = tls_wait_data(sk, NULL, flags & SPLICE_F_NONBLOCK, timeo,
+ &err);
+ if (!skb)
+ goto splice_read_end;
+
+- err = decrypt_skb_update(sk, skb, NULL, &zc, false);
++ err = decrypt_skb_update(sk, skb, NULL, &darg);
+ if (err < 0) {
+ tls_err_abort(sk, -EBADMSG);
+ goto splice_read_end;
+--
+2.43.0
+
--- /dev/null
+From ae5b910252b82a29df0eb2f8a1196cd113446330 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 20 Feb 2024 11:12:07 +0800
+Subject: tun: Fix xdp_rxq_info's queue_index when detaching
+
+From: Yunjian Wang <wangyunjian@huawei.com>
+
+[ Upstream commit 2a770cdc4382b457ca3d43d03f0f0064f905a0d0 ]
+
+When a queue (tfile) is detached, we only update tfile's queue_index,
+but do not update xdp_rxq_info's queue_index. This patch fixes it.
+
+Fixes: 8bf5c4ee1889 ("tun: setup xdp_rxq_info")
+Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
+Link: https://lore.kernel.org/r/1708398727-46308-1-git-send-email-wangyunjian@huawei.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/tun.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/net/tun.c b/drivers/net/tun.c
+index 603530e6cd7b9..42bf0a3ec632e 100644
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -654,6 +654,7 @@ static void __tun_detach(struct tun_file *tfile, bool clean)
+ tun->tfiles[tun->numqueues - 1]);
+ ntfile = rtnl_dereference(tun->tfiles[index]);
+ ntfile->queue_index = index;
++ ntfile->xdp_rxq.queue_index = index;
+ rcu_assign_pointer(tun->tfiles[tun->numqueues - 1],
+ NULL);
+
+--
+2.43.0
+
--- /dev/null
+From 29b0929be27d9f67e2bba4ba7a07aa11a6e6cd64 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 26 Feb 2024 13:49:21 +0100
+Subject: uapi: in6: replace temporary label with rfc9486
+
+From: Justin Iurman <justin.iurman@uliege.be>
+
+[ Upstream commit 6a2008641920a9c6fe1abbeb9acbec463215d505 ]
+
+Not really a fix per se, but IPV6_TLV_IOAM is still tagged as "TEMPORARY
+IANA allocation for IOAM", while RFC 9486 is available for some time
+now. Just update the reference.
+
+Fixes: 9ee11f0fff20 ("ipv6: ioam: Data plane support for Pre-allocated Trace")
+Signed-off-by: Justin Iurman <justin.iurman@uliege.be>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240226124921.9097-1-justin.iurman@uliege.be
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/uapi/linux/in6.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/include/uapi/linux/in6.h b/include/uapi/linux/in6.h
+index c4c53a9ab9595..ff8d21f9e95b7 100644
+--- a/include/uapi/linux/in6.h
++++ b/include/uapi/linux/in6.h
+@@ -145,7 +145,7 @@ struct in6_flowlabel_req {
+ #define IPV6_TLV_PADN 1
+ #define IPV6_TLV_ROUTERALERT 5
+ #define IPV6_TLV_CALIPSO 7 /* RFC 5570 */
+-#define IPV6_TLV_IOAM 49 /* TEMPORARY IANA allocation for IOAM */
++#define IPV6_TLV_IOAM 49 /* RFC 9486 */
+ #define IPV6_TLV_JUMBO 194
+ #define IPV6_TLV_HAO 201 /* home address option */
+
+--
+2.43.0
+
--- /dev/null
+From 0fef6a2d8b391073a15483c69a6a0b378aac6569 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 23 Feb 2024 15:59:08 -0800
+Subject: veth: try harder when allocating queue memory
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 1ce7d306ea63f3e379557c79abd88052e0483813 ]
+
+struct veth_rq is pretty large, 832B total without debug
+options enabled. Since the commit under Fixes we try to pre-allocate
+enough queues for every possible CPU. Miao Wang reports that
+this may lead to order-5 allocations which will fail in production.
+
+Let the allocation fall back to vmalloc() and try harder.
+These are the same flags we pass to netdev queue allocation.
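+
+(For scale: 832 B x 128 possible CPUs is already ~104 KiB, which needs
+an order-5 block of physically contiguous pages.) The idiom mirrors the
+diff below: kvcalloc() tries kmalloc first and transparently falls back
+to vmalloc(), and kvfree() releases either kind of memory:
+
+	priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq),
+			    GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
+	if (!priv->rq)
+		return -ENOMEM;
+
+	/* ... and on teardown: */
+	kvfree(priv->rq);	/* handles kmalloc- and vmalloc-backed memory */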
+
+Reported-and-tested-by: Miao Wang <shankerwangmiao@gmail.com>
+Fixes: 9d3684c24a52 ("veth: create by default nr_possible_cpus queues")
+Link: https://lore.kernel.org/all/5F52CAE2-2FB7-4712-95F1-3312FBBFA8DD@gmail.com/
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20240223235908.693010-1-kuba@kernel.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/veth.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/veth.c b/drivers/net/veth.c
+index 87cee614618ca..0102f86d48676 100644
+--- a/drivers/net/veth.c
++++ b/drivers/net/veth.c
+@@ -1303,7 +1303,8 @@ static int veth_alloc_queues(struct net_device *dev)
+ struct veth_priv *priv = netdev_priv(dev);
+ int i;
+
+- priv->rq = kcalloc(dev->num_rx_queues, sizeof(*priv->rq), GFP_KERNEL_ACCOUNT);
++ priv->rq = kvcalloc(dev->num_rx_queues, sizeof(*priv->rq),
++ GFP_KERNEL_ACCOUNT | __GFP_RETRY_MAYFAIL);
+ if (!priv->rq)
+ return -ENOMEM;
+
+@@ -1319,7 +1320,7 @@ static void veth_free_queues(struct net_device *dev)
+ {
+ struct veth_priv *priv = netdev_priv(dev);
+
+- kfree(priv->rq);
++ kvfree(priv->rq);
+ }
+
+ static int veth_dev_init(struct net_device *dev)
+--
+2.43.0
+