--- /dev/null
+From 4b2efb9db0c22a130bbd1275e489b42c02d08050 Mon Sep 17 00:00:00 2001
+From: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+Date: Tue, 10 Dec 2024 14:09:37 +0100
+Subject: accel/ivpu: Fix general protection fault in ivpu_bo_list()
+
+From: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+
+commit 4b2efb9db0c22a130bbd1275e489b42c02d08050 upstream.
+
+Check if ctx is not NULL before accessing its fields.
+
+Fixes: 37dee2a2f433 ("accel/ivpu: Improve buffer object debug logs")
+Cc: stable@vger.kernel.org # v6.8
+Reviewed-by: Karol Wachowski <karol.wachowski@intel.com>
+Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
+Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20241210130939.1575610-2-jacek.lawrynowicz@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/accel/ivpu/ivpu_gem.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/accel/ivpu/ivpu_gem.c
++++ b/drivers/accel/ivpu/ivpu_gem.c
+@@ -406,7 +406,7 @@ static void ivpu_bo_print_info(struct iv
+ mutex_lock(&bo->lock);
+
+ drm_printf(p, "%-9p %-3u 0x%-12llx %-10lu 0x%-8x %-4u",
+- bo, bo->ctx->id, bo->vpu_addr, bo->base.base.size,
++ bo, bo->ctx ? bo->ctx->id : 0, bo->vpu_addr, bo->base.base.size,
+ bo->flags, kref_read(&bo->base.base.refcount));
+
+ if (bo->base.pages)
--- /dev/null
+From 0f6482caa6acdfdfc744db7430771fe7e6c4e787 Mon Sep 17 00:00:00 2001
+From: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+Date: Tue, 10 Dec 2024 14:09:39 +0100
+Subject: accel/ivpu: Fix WARN in ivpu_ipc_send_receive_internal()
+
+From: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+
+commit 0f6482caa6acdfdfc744db7430771fe7e6c4e787 upstream.
+
+Move pm_runtime_set_active() to ivpu_pm_init() so that when
+ivpu_ipc_send_receive_internal() is executed before ivpu_pm_enable(),
+the device already has the correct runtime PM state, even if the last
+resume was not successful.
+
+Fixes: 8ed520ff4682 ("accel/ivpu: Move set autosuspend delay to HW specific code")
+Cc: stable@vger.kernel.org # v6.7+
+Reviewed-by: Karol Wachowski <karol.wachowski@intel.com>
+Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
+Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20241210130939.1575610-4-jacek.lawrynowicz@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/accel/ivpu/ivpu_pm.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/accel/ivpu/ivpu_pm.c
++++ b/drivers/accel/ivpu/ivpu_pm.c
+@@ -364,6 +364,7 @@ void ivpu_pm_init(struct ivpu_device *vd
+
+ pm_runtime_use_autosuspend(dev);
+ pm_runtime_set_autosuspend_delay(dev, delay);
++ pm_runtime_set_active(dev);
+
+ ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
+ }
+@@ -378,7 +379,6 @@ void ivpu_pm_enable(struct ivpu_device *
+ {
+ struct device *dev = vdev->drm.dev;
+
+- pm_runtime_set_active(dev);
+ pm_runtime_allow(dev);
+ pm_runtime_mark_last_busy(dev);
+ pm_runtime_put_autosuspend(dev);
--- /dev/null
+From d75d72a858f0c00ca8ae161b48cdb403807be4de Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 13 Nov 2024 11:11:55 -0500
+Subject: btrfs: fix improper generation check in snapshot delete
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit d75d72a858f0c00ca8ae161b48cdb403807be4de upstream.
+
+We have been using the following check
+
+ if (generation <= root->root_key.offset)
+
+to make decisions about whether or not to visit a node during snapshot
+delete. This works because for normal subvolumes root_key.offset is set
+to 0, and for snapshots it's set to the creation generation. The idea is
+that if the generation of the node is less than or equal to our creation
+generation then we don't need to visit that node: it doesn't belong to
+us, so we can simply drop our reference and move on.
+
+However, reloc roots don't have their generation stored in
+root->root_key.offset; instead, that is the objectid of their
+corresponding fs root. This means we can incorrectly not walk into
+nodes that need to be dropped when deleting a reloc root.
+
+There are a variety of consequences to making the wrong choice in two
+distinct areas.
+
+visit_node_for_delete()
+
+1. False positive. We think we are newer than the block when we really
+ aren't. We don't visit the node; we just drop our reference to it
+ and carry on. This would result in leaked space.
+2. False negative. We do decide to walk down into a block that we
+ should have just dropped our reference to. However this means that
+ the child node will have refs > 1, so we will switch to
+ UPDATE_BACKREF, and then the subsequent walk_down_proc() will notice
+ that btrfs_header_owner(node) != root->root_key.objectid and it'll
+ break out of the loop, and then walk_up_proc() will drop our reference,
+ so this appears to be ok.
+
+do_walk_down()
+
+1. False positive. We are in UPDATE_BACKREF and incorrectly decide that
+ we are done and don't need to update the backref for our lower nodes.
+ This is another case that simply won't happen with relocation, as we
+ only have to do UPDATE_BACKREF if the node below us was shared and
+ didn't have FULL_BACKREF set, and since we don't own that node
+ because we're a reloc root we actually won't end up in this case.
+2. False negative. Again this is tricky because as described above, we
+ simply wouldn't be here from relocation, because we don't own any of
+ the nodes because we never set btrfs_header_owner() to the reloc root
+ objectid, and we always use FULL_BACKREF, we never actually need to
+ set FULL_BACKREF on any children.
+
+Having spent a lot of time stressing relocation/snapshot delete recently
+I've not seen this pop in practice. But this is objectively incorrect,
+so fix this to get the correct starting generation based on the root
+we're dropping to keep me from thinking there's a problem here.
+
+CC: stable@vger.kernel.org
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ctree.h | 19 +++++++++++++++++++
+ fs/btrfs/extent-tree.c | 6 +++---
+ 2 files changed, 22 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -371,6 +371,25 @@ static inline void btrfs_set_root_last_t
+ }
+
+ /*
++ * Return the generation this root started with.
++ *
++ * Every normal root is created with root->root_key.offset set to its
++ * originating generation. If it is a snapshot it is the generation when the
++ * snapshot was created.
++ *
++ * However for TREE_RELOC roots root_key.offset is the objectid of the owning
++ * tree root. Thankfully we copy the root item of the owning tree root, which
++ * has its last_snapshot set to what we would have root_key.offset set to, so
++ * return that if this is a TREE_RELOC root.
++ */
++static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root)
++{
++ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
++ return btrfs_root_last_snapshot(&root->root_item);
++ return root->root_key.offset;
++}
++
++/*
+ * Structure that conveys information about an extent that is going to replace
+ * all the extents in a file range.
+ */
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -5308,7 +5308,7 @@ static bool visit_node_for_delete(struct
+ * reference to it.
+ */
+ generation = btrfs_node_ptr_generation(eb, slot);
+- if (!wc->update_ref || generation <= root->root_key.offset)
++ if (!wc->update_ref || generation <= btrfs_root_origin_generation(root))
+ return false;
+
+ /*
+@@ -5363,7 +5363,7 @@ static noinline void reada_walk_down(str
+ goto reada;
+
+ if (wc->stage == UPDATE_BACKREF &&
+- generation <= root->root_key.offset)
++ generation <= btrfs_root_origin_generation(root))
+ continue;
+
+ /* We don't lock the tree block, it's OK to be racy here */
+@@ -5706,7 +5706,7 @@ static noinline int do_walk_down(struct
+ * for the subtree
+ */
+ if (wc->stage == UPDATE_BACKREF &&
+- generation <= root->root_key.offset) {
++ generation <= btrfs_root_origin_generation(root)) {
+ wc->lookup_info = 1;
+ return 1;
+ }
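+
+For illustration, with made-up numbers: a reloc root for fs root 257 has
+root_key.offset == 257 (an objectid, not a generation), while its copied
+root item has last_snapshot == 900. For a node with generation 500, the
+old check evaluated 500 <= 257 (false), whereas
+btrfs_root_origin_generation() makes it 500 <= 900 (true), correctly
+treating the node as predating the root.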
--- /dev/null
+From be691b5e593f2cc8cef67bbc59c1fb91b74a86a9 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Mon, 4 Nov 2024 07:26:33 +0100
+Subject: btrfs: split bios to the fs sector size boundary
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit be691b5e593f2cc8cef67bbc59c1fb91b74a86a9 upstream.
+
+Btrfs, like other file systems, can't really deal with I/O that is not
+aligned to its internal block size (which, for historical reasons, is
+somewhat confusingly called the sector size in btrfs), but the block
+layer split helper doesn't even know about that.
+
+Round down the split boundary so that all I/Os are aligned.
+
+Fixes: d5e4377d5051 ("btrfs: split zone append bios in btrfs_submit_bio")
+CC: stable@vger.kernel.org # 6.12
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/bio.c | 10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/bio.c
++++ b/fs/btrfs/bio.c
+@@ -649,8 +649,14 @@ static u64 btrfs_append_map_length(struc
+ map_length = min(map_length, bbio->fs_info->max_zone_append_size);
+ sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits,
+ &nr_segs, map_length);
+- if (sector_offset)
+- return sector_offset << SECTOR_SHIFT;
++ if (sector_offset) {
++ /*
++ * bio_split_rw_at() could split at a size smaller than our
++ * sectorsize and thus cause unaligned I/Os. Fix that by
++ * always rounding down to the nearest boundary.
++ */
++ return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize);
++ }
+ return map_length;
+ }
+
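+For illustration, a minimal userspace sketch of the rounding (the 9-sector
+split result is made up; ALIGN_DOWN here mirrors the kernel macro for
+power-of-two alignments, and SECTOR_SHIFT is 9):
+
+  #include <stdio.h>
+
+  #define SECTOR_SHIFT 9
+  #define ALIGN_DOWN(x, a) ((x) & ~((unsigned long long)(a) - 1))
+
+  int main(void)
+  {
+          unsigned long long sector_offset = 9;   /* from bio_split_rw_at() */
+          unsigned long long sectorsize = 4096;   /* fs block ("sector") size */
+          unsigned long long bytes = sector_offset << SECTOR_SHIFT; /* 4608 */
+
+          /* 4608 is not a multiple of 4096; round down to the boundary. */
+          printf("%llu -> %llu\n", bytes, ALIGN_DOWN(bytes, sectorsize));
+          return 0;
+  }
+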
--- /dev/null
+From dfb92681a19e1d5172420baa242806414b3eff6f Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 4 Dec 2024 13:30:46 +1030
+Subject: btrfs: tree-checker: reject inline extent items with 0 ref count
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit dfb92681a19e1d5172420baa242806414b3eff6f upstream.
+
+[BUG]
+There is a bug report in the mailing list where btrfs_run_delayed_refs()
+failed to drop the ref count for logical 25870311358464 num_bytes
+2113536.
+
+The involved leaf dump looks like this:
+
+ item 166 key (25870311358464 168 2113536) itemoff 10091 itemsize 50
+ extent refs 1 gen 84178 flags 1
+ ref#0: shared data backref parent 32399126528000 count 0 <<<
+ ref#1: shared data backref parent 31808973717504 count 1
+
+Notice the count number is 0.
+
+[CAUSE]
+There is no concrete evidence yet, but considering that 0 and 1 differ by
+a single flipped bit, it's possible that a hardware memory bitflip is
+involved, corrupting the on-disk extent tree.
+
+[FIX]
+To prevent reading such a corrupted extent item, or writing such a
+damaged item back to disk, enhance the handling of the
+BTRFS_EXTENT_DATA_REF_KEY and BTRFS_SHARED_DATA_REF_KEY keys, for both
+inline refs and keyed items, to detect a 0 ref count and reject it.
+
+CC: stable@vger.kernel.org # 5.4+
+Link: https://lore.kernel.org/linux-btrfs/7c69dd49-c346-4806-86e7-e6f863a66f48@app.fastmail.com/
+Reported-by: Frankie Fisher <frankie@terrorise.me.uk>
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-checker.c | 27 ++++++++++++++++++++++++++-
+ 1 file changed, 26 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -1527,6 +1527,11 @@ static int check_extent_item(struct exte
+ dref_offset, fs_info->sectorsize);
+ return -EUCLEAN;
+ }
++ if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) {
++ extent_err(leaf, slot,
++ "invalid data ref count, should have non-zero value");
++ return -EUCLEAN;
++ }
+ inline_refs += btrfs_extent_data_ref_count(leaf, dref);
+ break;
+ /* Contains parent bytenr and ref count */
+@@ -1539,6 +1544,11 @@ static int check_extent_item(struct exte
+ inline_offset, fs_info->sectorsize);
+ return -EUCLEAN;
+ }
++ if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) {
++ extent_err(leaf, slot,
++ "invalid shared data ref count, should have non-zero value");
++ return -EUCLEAN;
++ }
+ inline_refs += btrfs_shared_data_ref_count(leaf, sref);
+ break;
+ case BTRFS_EXTENT_OWNER_REF_KEY:
+@@ -1611,8 +1621,18 @@ static int check_simple_keyed_refs(struc
+ {
+ u32 expect_item_size = 0;
+
+- if (key->type == BTRFS_SHARED_DATA_REF_KEY)
++ if (key->type == BTRFS_SHARED_DATA_REF_KEY) {
++ struct btrfs_shared_data_ref *sref;
++
++ sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref);
++ if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) {
++ extent_err(leaf, slot,
++ "invalid shared data backref count, should have non-zero value");
++ return -EUCLEAN;
++ }
++
+ expect_item_size = sizeof(struct btrfs_shared_data_ref);
++ }
+
+ if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) {
+ generic_err(leaf, slot,
+@@ -1689,6 +1709,11 @@ static int check_extent_data_ref(struct
+ offset, leaf->fs_info->sectorsize);
+ return -EUCLEAN;
+ }
++ if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) {
++ extent_err(leaf, slot,
++ "invalid extent data backref count, should have non-zero value");
++ return -EUCLEAN;
++ }
+ }
+ return 0;
+ }
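+
+For illustration, with these checks the leaf from the report above is now
+rejected up front: ref#0 is a shared data backref with count 0, so
+check_extent_item() returns -EUCLEAN ("invalid shared data ref count,
+should have non-zero value") instead of letting the corrupted item be
+used or written back.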
+++ /dev/null
-From da4d8c83358163df9a4addaeba0ef8bcb03b22e8 Mon Sep 17 00:00:00 2001
-From: Davidlohr Bueso <dave@stgolabs.net>
-Date: Fri, 15 Nov 2024 09:00:32 -0800
-Subject: cxl/pci: Fix potential bogus return value upon successful probing
-
-From: Davidlohr Bueso <dave@stgolabs.net>
-
-commit da4d8c83358163df9a4addaeba0ef8bcb03b22e8 upstream.
-
-If cxl_pci_ras_unmask() returns non-zero, cxl_pci_probe() will end up
-returning that value, instead of zero.
-
-Fixes: 248529edc86f ("cxl: add RAS status unmasking for CXL")
-Reviewed-by: Fan Ni <fan.ni@samsung.com>
-Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
-Reviewed-by: Ira Weiny <ira.weiny@intel.com>
-Link: https://patch.msgid.link/20241115170032.108445-1-dave@stgolabs.net
-Signed-off-by: Dave Jiang <dave.jiang@intel.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/cxl/pci.c | 3 +--
- 1 file changed, 1 insertion(+), 2 deletions(-)
-
-diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
-index 0241d1d7133a..26ab06c9deff 100644
---- a/drivers/cxl/pci.c
-+++ b/drivers/cxl/pci.c
-@@ -1032,8 +1032,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
- if (rc)
- return rc;
-
-- rc = cxl_pci_ras_unmask(pdev);
-- if (rc)
-+ if (cxl_pci_ras_unmask(pdev))
- dev_dbg(&pdev->dev, "No RAS reporting unmasked\n");
-
- pci_save_state(pdev);
---
-2.47.1
-
--- /dev/null
+From 07a756a49f4b4290b49ea46e089cbe6f79ff8d26 Mon Sep 17 00:00:00 2001
+From: Michael Kelley <mhklinux@outlook.com>
+Date: Wed, 6 Nov 2024 07:42:47 -0800
+Subject: Drivers: hv: util: Avoid accessing a ringbuffer not initialized yet
+
+From: Michael Kelley <mhklinux@outlook.com>
+
+commit 07a756a49f4b4290b49ea46e089cbe6f79ff8d26 upstream.
+
+If the KVP (or VSS) daemon starts before the VMBus channel's ringbuffer is
+fully initialized, we can hit the panic below:
+
+hv_utils: Registering HyperV Utility Driver
+hv_vmbus: registering driver hv_utils
+...
+BUG: kernel NULL pointer dereference, address: 0000000000000000
+CPU: 44 UID: 0 PID: 2552 Comm: hv_kvp_daemon Tainted: G E 6.11.0-rc3+ #1
+RIP: 0010:hv_pkt_iter_first+0x12/0xd0
+Call Trace:
+...
+ vmbus_recvpacket
+ hv_kvp_onchannelcallback
+ vmbus_on_event
+ tasklet_action_common
+ tasklet_action
+ handle_softirqs
+ irq_exit_rcu
+ sysvec_hyperv_stimer0
+ </IRQ>
+ <TASK>
+ asm_sysvec_hyperv_stimer0
+...
+ kvp_register_done
+ hvt_op_read
+ vfs_read
+ ksys_read
+ __x64_sys_read
+
+This can happen because the KVP/VSS channel callback can be invoked
+even before the channel is fully opened:
+1) as soon as hv_kvp_init() -> hvutil_transport_init() creates
+/dev/vmbus/hv_kvp, the kvp daemon can open the device file immediately and
+register itself with the driver by writing a KVP_OP_REGISTER1 message to the
+file (which is handled by kvp_on_msg() -> kvp_handle_handshake()) and
+reading the file for the driver's response, which is handled by
+hvt_op_read(), which calls hvt->on_read(), i.e. kvp_register_done().
+
+2) the problem with kvp_register_done() is that it can cause the
+channel callback to be called even before the channel is fully opened,
+and when the channel callback is starting to run, util_probe()->
+vmbus_open() may not have initialized the ringbuffer yet, so the
+callback can hit the NULL pointer dereference shown above.
+
+To reproduce the panic consistently, we can add a "ssleep(10)" for KVP in
+__vmbus_open(), just before the first hv_ringbuffer_init(), and then we
+unload and reload the driver hv_utils, and run the daemon manually within
+the 10 seconds.
+
+Fix the panic by reordering the steps in util_probe() so the char dev
+entry used by the KVP or VSS daemon is not created until after
+vmbus_open() has completed. This reordering prevents the race condition
+from happening.
+
+Reported-by: Dexuan Cui <decui@microsoft.com>
+Fixes: e0fa3e5e7df6 ("Drivers: hv: utils: fix a race on userspace daemons registration")
+Cc: stable@vger.kernel.org
+Signed-off-by: Michael Kelley <mhklinux@outlook.com>
+Acked-by: Wei Liu <wei.liu@kernel.org>
+Link: https://lore.kernel.org/r/20241106154247.2271-3-mhklinux@outlook.com
+Signed-off-by: Wei Liu <wei.liu@kernel.org>
+Message-ID: <20241106154247.2271-3-mhklinux@outlook.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/hv/hv_kvp.c | 6 ++++++
+ drivers/hv/hv_snapshot.c | 6 ++++++
+ drivers/hv/hv_util.c | 9 +++++++++
+ drivers/hv/hyperv_vmbus.h | 2 ++
+ include/linux/hyperv.h | 1 +
+ 5 files changed, 24 insertions(+)
+
+--- a/drivers/hv/hv_kvp.c
++++ b/drivers/hv/hv_kvp.c
+@@ -767,6 +767,12 @@ hv_kvp_init(struct hv_util_service *srv)
+ */
+ kvp_transaction.state = HVUTIL_DEVICE_INIT;
+
++ return 0;
++}
++
++int
++hv_kvp_init_transport(void)
++{
+ hvt = hvutil_transport_init(kvp_devname, CN_KVP_IDX, CN_KVP_VAL,
+ kvp_on_msg, kvp_on_reset);
+ if (!hvt)
+--- a/drivers/hv/hv_snapshot.c
++++ b/drivers/hv/hv_snapshot.c
+@@ -388,6 +388,12 @@ hv_vss_init(struct hv_util_service *srv)
+ */
+ vss_transaction.state = HVUTIL_DEVICE_INIT;
+
++ return 0;
++}
++
++int
++hv_vss_init_transport(void)
++{
+ hvt = hvutil_transport_init(vss_devname, CN_VSS_IDX, CN_VSS_VAL,
+ vss_on_msg, vss_on_reset);
+ if (!hvt) {
+--- a/drivers/hv/hv_util.c
++++ b/drivers/hv/hv_util.c
+@@ -141,6 +141,7 @@ static struct hv_util_service util_heart
+ static struct hv_util_service util_kvp = {
+ .util_cb = hv_kvp_onchannelcallback,
+ .util_init = hv_kvp_init,
++ .util_init_transport = hv_kvp_init_transport,
+ .util_pre_suspend = hv_kvp_pre_suspend,
+ .util_pre_resume = hv_kvp_pre_resume,
+ .util_deinit = hv_kvp_deinit,
+@@ -149,6 +150,7 @@ static struct hv_util_service util_kvp =
+ static struct hv_util_service util_vss = {
+ .util_cb = hv_vss_onchannelcallback,
+ .util_init = hv_vss_init,
++ .util_init_transport = hv_vss_init_transport,
+ .util_pre_suspend = hv_vss_pre_suspend,
+ .util_pre_resume = hv_vss_pre_resume,
+ .util_deinit = hv_vss_deinit,
+@@ -613,6 +615,13 @@ static int util_probe(struct hv_device *
+ if (ret)
+ goto error;
+
++ if (srv->util_init_transport) {
++ ret = srv->util_init_transport();
++ if (ret) {
++ vmbus_close(dev->channel);
++ goto error;
++ }
++ }
+ return 0;
+
+ error:
+--- a/drivers/hv/hyperv_vmbus.h
++++ b/drivers/hv/hyperv_vmbus.h
+@@ -370,12 +370,14 @@ void vmbus_on_event(unsigned long data);
+ void vmbus_on_msg_dpc(unsigned long data);
+
+ int hv_kvp_init(struct hv_util_service *srv);
++int hv_kvp_init_transport(void);
+ void hv_kvp_deinit(void);
+ int hv_kvp_pre_suspend(void);
+ int hv_kvp_pre_resume(void);
+ void hv_kvp_onchannelcallback(void *context);
+
+ int hv_vss_init(struct hv_util_service *srv);
++int hv_vss_init_transport(void);
+ void hv_vss_deinit(void);
+ int hv_vss_pre_suspend(void);
+ int hv_vss_pre_resume(void);
+--- a/include/linux/hyperv.h
++++ b/include/linux/hyperv.h
+@@ -1559,6 +1559,7 @@ struct hv_util_service {
+ void *channel;
+ void (*util_cb)(void *);
+ int (*util_init)(struct hv_util_service *);
++ int (*util_init_transport)(void);
+ void (*util_deinit)(void);
+ int (*util_pre_suspend)(void);
+ int (*util_pre_resume)(void);
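+
+For illustration, the resulting probe ordering as a condensed sketch
+(condensed from the patch; allocation and other error paths elided):
+
+  static int util_probe(struct hv_device *dev, ...)
+  {
+          ...
+          ret = srv->util_init(srv);           /* internal state only; no char dev yet */
+          ...
+          ret = vmbus_open(dev->channel, ...); /* ring buffer now fully initialized */
+          if (ret)
+                  goto error;
+
+          if (srv->util_init_transport) {
+                  /* Only now create /dev/vmbus/hv_kvp (or hv_vss), so the
+                   * daemon cannot trigger the channel callback too early.
+                   */
+                  ret = srv->util_init_transport();
+                  if (ret) {
+                          vmbus_close(dev->channel);
+                          goto error;
+                  }
+          }
+          return 0;
+  }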
--- /dev/null
+From 41be00f839e9ee7753892a73a36ce4c14c6f5cbf Mon Sep 17 00:00:00 2001
+From: Alex Deucher <alexander.deucher@amd.com>
+Date: Thu, 12 Dec 2024 17:04:58 -0500
+Subject: drm/amdgpu/gfx12: fix IP version check
+
+From: Alex Deucher <alexander.deucher@amd.com>
+
+commit 41be00f839e9ee7753892a73a36ce4c14c6f5cbf upstream.
+
+Use the helper function rather than reading it directly.
+
+Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+(cherry picked from commit f1fd1d0f40272948aa6ab82a3a82ecbbc76dff53)
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+@@ -4105,7 +4105,7 @@ static int gfx_v12_0_set_clockgating_sta
+ if (amdgpu_sriov_vf(adev))
+ return 0;
+
+- switch (adev->ip_versions[GC_HWIP][0]) {
++ switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+ case IP_VERSION(12, 0, 0):
+ case IP_VERSION(12, 0, 1):
+ gfx_v12_0_update_gfx_clock_gating(adev,
--- /dev/null
+From 6ebc5b92190e01dd48313b68cbf752c9adcfefa8 Mon Sep 17 00:00:00 2001
+From: Alex Deucher <alexander.deucher@amd.com>
+Date: Thu, 12 Dec 2024 17:03:20 -0500
+Subject: drm/amdgpu/mmhub4.1: fix IP version check
+
+From: Alex Deucher <alexander.deucher@amd.com>
+
+commit 6ebc5b92190e01dd48313b68cbf752c9adcfefa8 upstream.
+
+Use the helper function rather than reading it directly.
+
+Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+(cherry picked from commit 63bfd24088b42c6f55c2096bfc41b50213d419b2)
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c
+index 0fbc3be81f14..f2ab5001b492 100644
+--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c
+@@ -108,7 +108,7 @@ mmhub_v4_1_0_print_l2_protection_fault_status(struct amdgpu_device *adev,
+ dev_err(adev->dev,
+ "MMVM_L2_PROTECTION_FAULT_STATUS_LO32:0x%08X\n",
+ status);
+- switch (adev->ip_versions[MMHUB_HWIP][0]) {
++ switch (amdgpu_ip_version(adev, MMHUB_HWIP, 0)) {
+ case IP_VERSION(4, 1, 0):
+ mmhub_cid = mmhub_client_ids_v4_1_0[cid][rw];
+ break;
+--
+2.47.1
+
--- /dev/null
+From 3abb660f9e18925468685591a3702bda05faba4f Mon Sep 17 00:00:00 2001
+From: Alex Deucher <alexander.deucher@amd.com>
+Date: Thu, 12 Dec 2024 16:49:20 -0500
+Subject: drm/amdgpu/nbio7.0: fix IP version check
+
+From: Alex Deucher <alexander.deucher@amd.com>
+
+commit 3abb660f9e18925468685591a3702bda05faba4f upstream.
+
+Use the helper function rather than reading it directly.
+
+Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+(cherry picked from commit 0ec43fbece784215d3c4469973e4556d70bce915)
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
+index 49e953f86ced..d1032e9992b4 100644
+--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
+@@ -278,7 +278,7 @@ static void nbio_v7_0_init_registers(struct amdgpu_device *adev)
+ {
+ uint32_t data;
+
+- switch (adev->ip_versions[NBIO_HWIP][0]) {
++ switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) {
+ case IP_VERSION(2, 5, 0):
+ data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4) & ~BIT(23);
+ WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4, data);
+--
+2.47.1
+
--- /dev/null
+From cc252bb592638e0f7aea40d580186c36d89526b8 Mon Sep 17 00:00:00 2001
+From: Steven Rostedt <rostedt@goodmis.org>
+Date: Wed, 11 Dec 2024 13:53:35 -0500
+Subject: fgraph: Still initialize idle shadow stacks when starting
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+commit cc252bb592638e0f7aea40d580186c36d89526b8 upstream.
+
+A bug was discovered where the idle shadow stacks were not initialized
+for offline CPUs when starting function graph tracer, and when they came
+online they were not traced due to the missing shadow stack. To fix
+this, the idle task shadow stack initialization was moved to using the
+CPU hotplug callbacks. But it removed the initialization when the
+function graph was enabled. The problem here is that the hotplug
+callbacks are called when the CPUs come online, but the idle shadow
+stack initialization only happens if function graph is currently
+active. This caused CPUs that were already online when function graph
+was enabled to never get their idle shadow stacks initialized.
+
+The idle shadow stack initialization still needs to be done when the
+function graph is registered, as they will not be allocated if function
+graph is not registered.
+
+Cc: stable@vger.kernel.org
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Link: https://lore.kernel.org/20241211135335.094ba282@batman.local.home
+Fixes: 2c02f7375e65 ("fgraph: Use CPU hotplug mechanism to initialize idle shadow stacks")
+Reported-by: Linus Walleij <linus.walleij@linaro.org>
+Tested-by: Linus Walleij <linus.walleij@linaro.org>
+Closes: https://lore.kernel.org/all/CACRpkdaTBrHwRbbrphVy-=SeDz6MSsXhTKypOtLrTQ+DgGAOcQ@mail.gmail.com/
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/fgraph.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/kernel/trace/fgraph.c
++++ b/kernel/trace/fgraph.c
+@@ -1160,7 +1160,7 @@ void fgraph_update_pid_func(void)
+ static int start_graph_tracing(void)
+ {
+ unsigned long **ret_stack_list;
+- int ret;
++ int ret, cpu;
+
+ ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE,
+ sizeof(*ret_stack_list), GFP_KERNEL);
+@@ -1168,6 +1168,12 @@ static int start_graph_tracing(void)
+ if (!ret_stack_list)
+ return -ENOMEM;
+
++ /* The cpu_boot init_task->ret_stack will never be freed */
++ for_each_online_cpu(cpu) {
++ if (!idle_task(cpu)->ret_stack)
++ ftrace_graph_init_idle_task(idle_task(cpu), cpu);
++ }
++
+ do {
+ ret = alloc_retstack_tasklist(ret_stack_list);
+ } while (ret == -EAGAIN);
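+
+For illustration, the ordering the fix restores (CPU numbers are made up):
+
+  t0: CPUs 0-3 online, function graph off -> hotplug callbacks ran while
+      fgraph was inactive, so no idle shadow stacks were allocated
+  t1: function graph enabled              -> start_graph_tracing() now walks
+      for_each_online_cpu() and initializes each idle task's ret_stack
+  t2: CPU 4 comes online later            -> the CPU hotplug callback
+      initializes its idle shadow stack as before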
--- /dev/null
+From dbd2ca9367eb19bc5e269b8c58b0b1514ada9156 Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Thu, 19 Dec 2024 19:52:58 +0000
+Subject: io_uring: check if iowq is killed before queuing
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit dbd2ca9367eb19bc5e269b8c58b0b1514ada9156 upstream.
+
+task work can be executed after the task has gone through io_uring
+termination, whether it's the final task_work run or the fallback path.
+In this case, task work will find ->io_wq being already killed and
+null'ed, which is a problem if it then tries to forward the request to
+io_queue_iowq(). Make io_queue_iowq() fail requests in this case.
+
+Note that it also checks PF_KTHREAD, because the user can first close
+a DEFER_TASKRUN ring and shortly afterwards kill the task, in which case
+the ->io_wq check alone would race.
+
+Cc: stable@vger.kernel.org
+Fixes: 50c52250e2d74 ("block: implement async io_uring discard cmd")
+Fixes: 773af69121ecc ("io_uring: always reissue from task_work context")
+Reported-by: Will <willsroot@protonmail.com>
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/63312b4a2c2bb67ad67b857d17a300e1d3b078e8.1734637909.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/io_uring.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -515,7 +515,11 @@ static void io_queue_iowq(struct io_kioc
+ struct io_uring_task *tctx = req->task->io_uring;
+
+ BUG_ON(!tctx);
+- BUG_ON(!tctx->io_wq);
++
++ if ((current->flags & PF_KTHREAD) || !tctx->io_wq) {
++ io_req_task_queue_fail(req, -ECANCELED);
++ return;
++ }
+
+ /* init ->work of the whole link before punting */
+ io_prep_async_link(req);
--- /dev/null
+From 12d908116f7efd34f255a482b9afc729d7a5fb78 Mon Sep 17 00:00:00 2001
+From: Jann Horn <jannh@google.com>
+Date: Wed, 18 Dec 2024 17:56:25 +0100
+Subject: io_uring: Fix registered ring file refcount leak
+
+From: Jann Horn <jannh@google.com>
+
+commit 12d908116f7efd34f255a482b9afc729d7a5fb78 upstream.
+
+Currently, io_uring_unreg_ringfd() (which cleans up registered rings) is
+only called on exit, but __io_uring_free (which frees the tctx in which the
+registered ring pointers are stored) is also called on execve (via
+begin_new_exec -> io_uring_task_cancel -> __io_uring_cancel ->
+io_uring_cancel_generic -> __io_uring_free).
+
+This means: A process going through execve while having registered rings
+will leak references to the rings' `struct file`.
+
+Fix it by zapping registered rings on execve(). This is implemented by
+moving the io_uring_unreg_ringfd() from io_uring_files_cancel() into its
+callee __io_uring_cancel(), which is called from io_uring_task_cancel() on
+execve.
+
+This could probably be exploited *on 32-bit kernels* by leaking 2^32
+references to the same ring, because the file refcount is stored in a
+pointer-sized field and get_file() doesn't have protection against
+refcount overflow, just a WARN_ONCE(); but on 64-bit it should have no
+impact beyond a memory leak.
+
+Cc: stable@vger.kernel.org
+Fixes: e7a6c00dc77a ("io_uring: add support for registering ring file descriptors")
+Signed-off-by: Jann Horn <jannh@google.com>
+Link: https://lore.kernel.org/r/20241218-uring-reg-ring-cleanup-v1-1-8f63e999045b@google.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/io_uring.h | 4 +---
+ io_uring/io_uring.c | 1 +
+ 2 files changed, 2 insertions(+), 3 deletions(-)
+
+--- a/include/linux/io_uring.h
++++ b/include/linux/io_uring.h
+@@ -15,10 +15,8 @@ bool io_is_uring_fops(struct file *file)
+
+ static inline void io_uring_files_cancel(void)
+ {
+- if (current->io_uring) {
+- io_uring_unreg_ringfd();
++ if (current->io_uring)
+ __io_uring_cancel(false);
+- }
+ }
+ static inline void io_uring_task_cancel(void)
+ {
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -3230,6 +3230,7 @@ end_wait:
+
+ void __io_uring_cancel(bool cancel_all)
+ {
++ io_uring_unreg_ringfd();
+ io_uring_cancel_generic(cancel_all, NULL);
+ }
+
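+For illustration, a minimal sketch of the leaking sequence (liburing calls;
+error handling elided; a hypothetical reproducer, not one from the report):
+
+  #include <liburing.h>
+  #include <unistd.h>
+
+  int main(void)
+  {
+          struct io_uring ring;
+
+          io_uring_queue_init(8, &ring, 0);
+          /* Stores the ring file pointer in current->io_uring (tctx). */
+          io_uring_register_ring_fd(&ring);
+          /*
+           * Before the fix: execve() frees the tctx via __io_uring_free()
+           * without calling io_uring_unreg_ringfd(), leaking a reference
+           * to the ring's struct file.
+           */
+          execlp("true", "true", (char *)NULL);
+          return 1;
+  }
+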
--- /dev/null
+From 9b42d1e8e4fe9dc631162c04caa69b0d1860b0f0 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Wed, 27 Nov 2024 16:43:39 -0800
+Subject: KVM: x86: Play nice with protected guests in complete_hypercall_exit()
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 9b42d1e8e4fe9dc631162c04caa69b0d1860b0f0 upstream.
+
+Use is_64_bit_hypercall() instead of is_64_bit_mode() to detect a 64-bit
+hypercall when completing said hypercall. For guests with protected state,
+e.g. SEV-ES and SEV-SNP, KVM must assume the hypercall was made in 64-bit
+mode as the vCPU state needed to detect 64-bit mode is unavailable.
+
+Hacking the sev_smoke_test selftest to generate a KVM_HC_MAP_GPA_RANGE
+hypercall via VMGEXIT trips the WARN:
+
+ ------------[ cut here ]------------
+ WARNING: CPU: 273 PID: 326626 at arch/x86/kvm/x86.h:180 complete_hypercall_exit+0x44/0xe0 [kvm]
+ Modules linked in: kvm_amd kvm ... [last unloaded: kvm]
+ CPU: 273 UID: 0 PID: 326626 Comm: sev_smoke_test Not tainted 6.12.0-smp--392e932fa0f3-feat #470
+ Hardware name: Google Astoria/astoria, BIOS 0.20240617.0-0 06/17/2024
+ RIP: 0010:complete_hypercall_exit+0x44/0xe0 [kvm]
+ Call Trace:
+ <TASK>
+ kvm_arch_vcpu_ioctl_run+0x2400/0x2720 [kvm]
+ kvm_vcpu_ioctl+0x54f/0x630 [kvm]
+ __se_sys_ioctl+0x6b/0xc0
+ do_syscall_64+0x83/0x160
+ entry_SYSCALL_64_after_hwframe+0x76/0x7e
+ </TASK>
+ ---[ end trace 0000000000000000 ]---
+
+Fixes: b5aead0064f3 ("KVM: x86: Assume a 64-bit hypercall for guests with protected state")
+Cc: stable@vger.kernel.org
+Cc: Tom Lendacky <thomas.lendacky@amd.com>
+Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
+Reviewed-by: Nikunj A Dadhania <nikunj@amd.com>
+Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
+Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
+Reviewed-by: Kai Huang <kai.huang@intel.com>
+Link: https://lore.kernel.org/r/20241128004344.4072099-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -9991,7 +9991,7 @@ static int complete_hypercall_exit(struc
+ {
+ u64 ret = vcpu->run->hypercall.ret;
+
+- if (!is_64_bit_mode(vcpu))
++ if (!is_64_bit_hypercall(vcpu))
+ ret = (u32)ret;
+ kvm_rax_write(vcpu, ret);
+ ++vcpu->stat.hypercalls;
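+
+For reference, the helper being switched to treats protected guests as
+64-bit unconditionally. A sketch of it (paraphrased from
+arch/x86/kvm/x86.h; the exact comment wording may differ):
+
+  static inline bool is_64_bit_hypercall(struct kvm_vcpu *vcpu)
+  {
+          /* With protected guest state, CS is inaccessible; assume 64-bit. */
+          return vcpu->arch.guest_state_protected || is_64_bit_mode(vcpu);
+  }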
--- /dev/null
+From 41856638e6c4ed51d8aa9e54f70059d1e357b46e Mon Sep 17 00:00:00 2001
+From: Heiko Carstens <hca@linux.ibm.com>
+Date: Fri, 29 Nov 2024 17:39:27 +0100
+Subject: s390/mm: Fix DirectMap accounting
+
+From: Heiko Carstens <hca@linux.ibm.com>
+
+commit 41856638e6c4ed51d8aa9e54f70059d1e357b46e upstream.
+
+With uncoupling of physical and virtual address spaces population of
+the identity mapping was changed to use the type POPULATE_IDENTITY
+instead of POPULATE_DIRECT. This breaks DirectMap accounting:
+
+> cat /proc/meminfo
+DirectMap4k: 55296 kB
+DirectMap1M: 18446744073709496320 kB
+
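+(For illustration: the bogus value is a negative counter printed as
+unsigned, 18446744073709496320 = 2^64 - 55296, i.e. -55296 kB. Presumably
+runtime splitting of 1M mappings into 4K pages debited a DirectMap1M
+counter that was never credited at boot; note that -55296 kB matches the
+DirectMap4k figure above.)
+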
+Adjust all locations of update_page_count() in vmem.c to use
+POPULATE_IDENTITY instead of POPULATE_DIRECT as well. With this
+accounting is correct again:
+
+> cat /proc/meminfo
+DirectMap4k: 54264 kB
+DirectMap1M: 8334336 kB
+
+Fixes: c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces")
+Cc: stable@vger.kernel.org
+Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
+Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
+Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/s390/boot/vmem.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/arch/s390/boot/vmem.c
++++ b/arch/s390/boot/vmem.c
+@@ -306,7 +306,7 @@ static void pgtable_pte_populate(pmd_t *
+ pages++;
+ }
+ }
+- if (mode == POPULATE_DIRECT)
++ if (mode == POPULATE_IDENTITY)
+ update_page_count(PG_DIRECT_MAP_4K, pages);
+ }
+
+@@ -339,7 +339,7 @@ static void pgtable_pmd_populate(pud_t *
+ }
+ pgtable_pte_populate(pmd, addr, next, mode);
+ }
+- if (mode == POPULATE_DIRECT)
++ if (mode == POPULATE_IDENTITY)
+ update_page_count(PG_DIRECT_MAP_1M, pages);
+ }
+
+@@ -372,7 +372,7 @@ static void pgtable_pud_populate(p4d_t *
+ }
+ pgtable_pmd_populate(pud, addr, next, mode);
+ }
+- if (mode == POPULATE_DIRECT)
++ if (mode == POPULATE_IDENTITY)
+ update_page_count(PG_DIRECT_MAP_2G, pages);
+ }
+
--- /dev/null
+From 29d44cce324dab2bd86c447071a596262e7109b6 Mon Sep 17 00:00:00 2001
+From: Tiezhu Yang <yangtiezhu@loongson.cn>
+Date: Thu, 19 Dec 2024 19:15:06 +0800
+Subject: selftests/bpf: Use asm constraint "m" for LoongArch
+
+From: Tiezhu Yang <yangtiezhu@loongson.cn>
+
+commit 29d44cce324dab2bd86c447071a596262e7109b6 upstream.
+
+Currently, LoongArch LLVM does not support the constraint "o" and there
+is no plan to support it; it only supports the similar constraint "m".
+Change the constraint from "nor" in the "else" case to the arch-specific
+"nmr" to avoid build errors such as "unexpected asm memory constraint"
+on LoongArch.
+
+Fixes: 630301b0d59d ("selftests/bpf: Add basic USDT selftests")
+Suggested-by: Weining Lu <luweining@loongson.cn>
+Suggested-by: Li Chen <chenli@loongson.cn>
+Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Reviewed-by: Huacai Chen <chenhuacai@loongson.cn>
+Cc: stable@vger.kernel.org
+Link: https://llvm.org/docs/LangRef.html#supported-constraint-code-list
+Link: https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp#L172
+Link: https://lore.kernel.org/bpf/20241219111506.20643-1-yangtiezhu@loongson.cn
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/sdt.h | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/tools/testing/selftests/bpf/sdt.h
++++ b/tools/testing/selftests/bpf/sdt.h
+@@ -102,6 +102,8 @@
+ # define STAP_SDT_ARG_CONSTRAINT nZr
+ # elif defined __arm__
+ # define STAP_SDT_ARG_CONSTRAINT g
++# elif defined __loongarch__
++# define STAP_SDT_ARG_CONSTRAINT nmr
+ # else
+ # define STAP_SDT_ARG_CONSTRAINT nor
+ # endif
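+
+For illustration, a minimal sketch of where the constraint string ends up
+(sdt.h expands it into the operand constraints of the probe's inline asm;
+this standalone example is an assumption of this note, not code from sdt.h):
+
+  long val = 42;
+
+  int main(void)
+  {
+          /* "n" = immediate, "m" = memory, "r" = register. LoongArch LLVM
+           * rejects "o" (offsettable memory), hence "nmr" instead of "nor".
+           */
+          asm volatile ("" :: "nmr" (val));
+          return 0;
+  }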
--- /dev/null
+From 6a75f19af16ff482cfd6085c77123aa0f464f8dd Mon Sep 17 00:00:00 2001
+From: "Isaac J. Manjarres" <isaacmanjarres@google.com>
+Date: Thu, 5 Dec 2024 11:29:41 -0800
+Subject: selftests/memfd: run sysctl tests when PID namespace support is enabled
+
+From: Isaac J. Manjarres <isaacmanjarres@google.com>
+
+commit 6a75f19af16ff482cfd6085c77123aa0f464f8dd upstream.
+
+The sysctl tests for vm.memfd_noexec rely on the kernel to support PID
+namespaces (i.e. the kernel is built with CONFIG_PID_NS=y). If the
+kernel the test runs on does not support PID namespaces, the first sysctl
+test will fail when attempting to spawn a new thread in a new PID
+namespace and abort the test run, preventing the remaining tests from
+being run.
+
+This is not desirable, as a kernel built without PID namespace support
+can still use the other features provided by memfd. Therefore, only run the
+sysctl tests if the kernel supports PID namespaces. Otherwise, skip those
+tests and emit an informative message to let the user know why the sysctl
+tests are not being run.
+
+Link: https://lkml.kernel.org/r/20241205192943.3228757-1-isaacmanjarres@google.com
+Fixes: 11f75a01448f ("selftests/memfd: add tests for MFD_NOEXEC_SEAL MFD_EXEC")
+Signed-off-by: Isaac J. Manjarres <isaacmanjarres@google.com>
+Reviewed-by: Jeff Xu <jeffxu@google.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Kalesh Singh <kaleshsingh@google.com>
+Cc: <stable@vger.kernel.org> [6.6+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/memfd/memfd_test.c | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/tools/testing/selftests/memfd/memfd_test.c
++++ b/tools/testing/selftests/memfd/memfd_test.c
+@@ -9,6 +9,7 @@
+ #include <fcntl.h>
+ #include <linux/memfd.h>
+ #include <sched.h>
++#include <stdbool.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <signal.h>
+@@ -1557,6 +1558,11 @@ static void test_share_fork(char *banner
+ close(fd);
+ }
+
++static bool pid_ns_supported(void)
++{
++ return access("/proc/self/ns/pid", F_OK) == 0;
++}
++
+ int main(int argc, char **argv)
+ {
+ pid_t pid;
+@@ -1591,8 +1597,12 @@ int main(int argc, char **argv)
+ test_seal_grow();
+ test_seal_resize();
+
+- test_sysctl_simple();
+- test_sysctl_nested();
++ if (pid_ns_supported()) {
++ test_sysctl_simple();
++ test_sysctl_nested();
++ } else {
++ printf("PID namespaces are not supported; skipping sysctl tests\n");
++ }
+
+ test_share_dup("SHARE-DUP", "");
+ test_share_mmap("SHARE-MMAP", "");
mm-page_alloc-don-t-call-pfn_to_page-on-possibly-non-existent-pfn-in-split_large_buddy.patch
ring-buffer-fix-overflow-in-__rb_map_vma.patch
alloc_tag-fix-set_codetag_empty-when-config_mem_alloc_profiling_debug.patch
-cxl-pci-fix-potential-bogus-return-value-upon-successful-probing.patch
+btrfs-split-bios-to-the-fs-sector-size-boundary.patch
+btrfs-fix-improper-generation-check-in-snapshot-delete.patch
+btrfs-tree-checker-reject-inline-extent-items-with-0-ref-count.patch
+s390-mm-fix-directmap-accounting.patch
+drm-amdgpu-nbio7.0-fix-ip-version-check.patch
+drm-amdgpu-gfx12-fix-ip-version-check.patch
+drm-amdgpu-mmhub4.1-fix-ip-version-check.patch
+fgraph-still-initialize-idle-shadow-stacks-when-starting.patch
+drivers-hv-util-avoid-accessing-a-ringbuffer-not-initialized-yet.patch
+tools-hv-fix-a-complier-warning-in-the-fcopy-uio-daemon.patch
+x86-hyperv-fix-hv-tsc-page-based-sched_clock-for-hibernation.patch
+kvm-x86-play-nice-with-protected-guests-in-complete_hypercall_exit.patch
+smb-client-fix-tcp-timers-deadlock-after-rmmod.patch
+accel-ivpu-fix-general-protection-fault-in-ivpu_bo_list.patch
+accel-ivpu-fix-warn-in-ivpu_ipc_send_receive_internal.patch
+tracing-fix-test_event_printk-to-process-entire-print-argument.patch
+tracing-add-missing-helper-functions-in-event-pointer-dereference-check.patch
+tracing-add-s-check-in-test_event_printk.patch
+tracing-check-s-dereference-via-the-field-and-not-the-tp_printk-format.patch
+selftests-memfd-run-sysctl-tests-when-pid-namespace-support-is-enabled.patch
+selftests-bpf-use-asm-constraint-m-for-loongarch.patch
+io_uring-fix-registered-ring-file-refcount-leak.patch
+io_uring-check-if-iowq-is-killed-before-queuing.patch
--- /dev/null
+From e9f2517a3e18a54a3943c098d2226b245d488801 Mon Sep 17 00:00:00 2001
+From: Enzo Matsumiya <ematsumiya@suse.de>
+Date: Tue, 10 Dec 2024 18:15:12 -0300
+Subject: smb: client: fix TCP timers deadlock after rmmod
+
+From: Enzo Matsumiya <ematsumiya@suse.de>
+
+commit e9f2517a3e18a54a3943c098d2226b245d488801 upstream.
+
+Commit ef7134c7fc48 ("smb: client: Fix use-after-free of network namespace.")
+fixed a netns UAF by manually enabling socket refcounting
+(sk->sk_net_refcnt=1 and sock_inuse_add(net, 1)).
+
+The reason the patch worked for that bug was because we now hold
+references to the netns (get_net_track() gets a ref internally)
+and they're properly released (internally, on __sk_destruct()),
+but only because sk->sk_net_refcnt was set.
+
+Problem:
+(this happens regardless of CONFIG_NET_NS_REFCNT_TRACKER and regardless
+of whether the namespace is init_net or another one)
+
+Setting sk->sk_net_refcnt=1 *manually* and *after* socket creation is not
+only out of cifs scope, but also technically wrong -- it's set conditionally
+based on user (=1) vs kernel (=0) sockets. And net/ implementations
+seem to base their user vs kernel space operations on it.
+
+e.g. upon TCP socket close, the TCP timers are not cleared because
+sk->sk_net_refcnt=1:
+(cf. commit 151c9c724d05 ("tcp: properly terminate timers for kernel sockets"))
+
+net/ipv4/tcp.c:
+ void tcp_close(struct sock *sk, long timeout)
+ {
+ lock_sock(sk);
+ __tcp_close(sk, timeout);
+ release_sock(sk);
+ if (!sk->sk_net_refcnt)
+ inet_csk_clear_xmit_timers_sync(sk);
+ sock_put(sk);
+ }
+
+Which will throw a lockdep warning and then, as expected, deadlock on
+tcp_write_timer().
+
+A way to reproduce this is by running the reproducer from ef7134c7fc48
+and then 'rmmod cifs'. A few seconds later, the deadlock/lockdep
+warning shows up.
+
+Fix:
+We shouldn't mess with socket internals ourselves, so do not set
+sk_net_refcnt manually.
+
+Also change __sock_create() to sock_create_kern() for explicitness.
+
+As for non-init_net network namespaces, we deal with them the best way
+we can -- hold an extra netns reference for server->ssocket and drop it
+when it's released. This ensures that the netns still exists whenever
+we need to create/destroy server->ssocket, but is not directly tied to
+it.
+
+Fixes: ef7134c7fc48 ("smb: client: Fix use-after-free of network namespace.")
+Cc: stable@vger.kernel.org
+Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/smb/client/connect.c | 36 ++++++++++++++++++++++++++----------
+ 1 file changed, 26 insertions(+), 10 deletions(-)
+
+--- a/fs/smb/client/connect.c
++++ b/fs/smb/client/connect.c
+@@ -987,9 +987,13 @@ clean_demultiplex_info(struct TCP_Server
+ msleep(125);
+ if (cifs_rdma_enabled(server))
+ smbd_destroy(server);
++
+ if (server->ssocket) {
+ sock_release(server->ssocket);
+ server->ssocket = NULL;
++
++ /* Release netns reference for the socket. */
++ put_net(cifs_net_ns(server));
+ }
+
+ if (!list_empty(&server->pending_mid_q)) {
+@@ -1037,6 +1041,7 @@ clean_demultiplex_info(struct TCP_Server
+ */
+ }
+
++ /* Release netns reference for this server. */
+ put_net(cifs_net_ns(server));
+ kfree(server->leaf_fullpath);
+ kfree(server);
+@@ -1713,6 +1718,8 @@ cifs_get_tcp_session(struct smb3_fs_cont
+
+ tcp_ses->ops = ctx->ops;
+ tcp_ses->vals = ctx->vals;
++
++ /* Grab netns reference for this server. */
+ cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
+
+ tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId);
+@@ -1844,6 +1851,7 @@ smbd_connected:
+ out_err_crypto_release:
+ cifs_crypto_secmech_release(tcp_ses);
+
++ /* Release netns reference for this server. */
+ put_net(cifs_net_ns(tcp_ses));
+
+ out_err:
+@@ -1852,8 +1860,10 @@ out_err:
+ cifs_put_tcp_session(tcp_ses->primary_server, false);
+ kfree(tcp_ses->hostname);
+ kfree(tcp_ses->leaf_fullpath);
+- if (tcp_ses->ssocket)
++ if (tcp_ses->ssocket) {
+ sock_release(tcp_ses->ssocket);
++ put_net(cifs_net_ns(tcp_ses));
++ }
+ kfree(tcp_ses);
+ }
+ return ERR_PTR(rc);
+@@ -3111,20 +3121,20 @@ generic_ip_connect(struct TCP_Server_Inf
+ socket = server->ssocket;
+ } else {
+ struct net *net = cifs_net_ns(server);
+- struct sock *sk;
+
+- rc = __sock_create(net, sfamily, SOCK_STREAM,
+- IPPROTO_TCP, &server->ssocket, 1);
++ rc = sock_create_kern(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket);
+ if (rc < 0) {
+ cifs_server_dbg(VFS, "Error %d creating socket\n", rc);
+ return rc;
+ }
+
+- sk = server->ssocket->sk;
+- __netns_tracker_free(net, &sk->ns_tracker, false);
+- sk->sk_net_refcnt = 1;
+- get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
+- sock_inuse_add(net, 1);
++ /*
++ * Grab netns reference for the socket.
++ *
++ * It'll be released here, on error, or in clean_demultiplex_info() upon server
++ * teardown.
++ */
++ get_net(net);
+
+ /* BB other socket options to set KEEPALIVE, NODELAY? */
+ cifs_dbg(FYI, "Socket created\n");
+@@ -3138,8 +3148,10 @@ generic_ip_connect(struct TCP_Server_Inf
+ }
+
+ rc = bind_socket(server);
+- if (rc < 0)
++ if (rc < 0) {
++ put_net(cifs_net_ns(server));
+ return rc;
++ }
+
+ /*
+ * Eventually check for other socket options to change from
+@@ -3176,6 +3188,7 @@ generic_ip_connect(struct TCP_Server_Inf
+ if (rc < 0) {
+ cifs_dbg(FYI, "Error %d connecting to server\n", rc);
+ trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc);
++ put_net(cifs_net_ns(server));
+ sock_release(socket);
+ server->ssocket = NULL;
+ return rc;
+@@ -3184,6 +3197,9 @@ generic_ip_connect(struct TCP_Server_Inf
+ if (sport == htons(RFC1001_PORT))
+ rc = ip_rfc1001_connect(server);
+
++ if (rc < 0)
++ put_net(cifs_net_ns(server));
++
+ return rc;
+ }
+
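+For illustration, the resulting get_net()/put_net() pairing (condensed
+from the patch):
+
+  server ref:  get_net() in cifs_get_tcp_session()
+               -> put_net() in clean_demultiplex_info() (or on out_err)
+  socket ref:  get_net() in generic_ip_connect() after sock_create_kern()
+               -> put_net() on bind/connect failure, or when
+                  server->ssocket is sock_release()'d
+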
--- /dev/null
+From cb1b78f1c726c938bd47497c1ab16b01ce967f37 Mon Sep 17 00:00:00 2001
+From: Dexuan Cui <decui@microsoft.com>
+Date: Tue, 10 Sep 2024 00:44:32 +0000
+Subject: tools: hv: Fix a compiler warning in the fcopy uio daemon
+
+From: Dexuan Cui <decui@microsoft.com>
+
+commit cb1b78f1c726c938bd47497c1ab16b01ce967f37 upstream.
+
+hv_fcopy_uio_daemon.c:436:53: warning: '%s' directive output may be truncated
+writing up to 14 bytes into a region of size 10 [-Wformat-truncation=]
+ 436 | snprintf(uio_dev_path, sizeof(uio_dev_path), "/dev/%s", uio_name);
+
+Also add 'static' to the array 'desc[]'.
+
+Fixes: 82b0945ce2c2 ("tools: hv: Add new fcopy application based on uio driver")
+Cc: stable@vger.kernel.org # 6.10+
+Signed-off-by: Dexuan Cui <decui@microsoft.com>
+Reviewed-by: Saurabh Sengar <ssengar@linux.microsoft.com>
+Link: https://lore.kernel.org/r/20240910004433.50254-1-decui@microsoft.com
+Signed-off-by: Wei Liu <wei.liu@kernel.org>
+Message-ID: <20240910004433.50254-1-decui@microsoft.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/hv/hv_fcopy_uio_daemon.c | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+diff --git a/tools/hv/hv_fcopy_uio_daemon.c b/tools/hv/hv_fcopy_uio_daemon.c
+index 7a00f3066a98..12743d7f164f 100644
+--- a/tools/hv/hv_fcopy_uio_daemon.c
++++ b/tools/hv/hv_fcopy_uio_daemon.c
+@@ -35,8 +35,6 @@
+ #define WIN8_SRV_MINOR 1
+ #define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR)
+
+-#define MAX_FOLDER_NAME 15
+-#define MAX_PATH_LEN 15
+ #define FCOPY_UIO "/sys/bus/vmbus/devices/eb765408-105f-49b6-b4aa-c123b64d17d4/uio"
+
+ #define FCOPY_VER_COUNT 1
+@@ -51,7 +49,7 @@ static const int fw_versions[] = {
+
+ #define HV_RING_SIZE 0x4000 /* 16KB ring buffer size */
+
+-unsigned char desc[HV_RING_SIZE];
++static unsigned char desc[HV_RING_SIZE];
+
+ static int target_fd;
+ static char target_fname[PATH_MAX];
+@@ -409,8 +407,8 @@ int main(int argc, char *argv[])
+ struct vmbus_br txbr, rxbr;
+ void *ring;
+ uint32_t len = HV_RING_SIZE;
+- char uio_name[MAX_FOLDER_NAME] = {0};
+- char uio_dev_path[MAX_PATH_LEN] = {0};
++ char uio_name[NAME_MAX] = {0};
++ char uio_dev_path[PATH_MAX] = {0};
+
+ static struct option long_options[] = {
+ {"help", no_argument, 0, 'h' },
+--
+2.47.1
+
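+For illustration, a minimal case reproducing the warning (buffer sizes
+mirror the removed MAX_FOLDER_NAME/MAX_PATH_LEN; compile with gcc -Wall):
+
+  #include <stdio.h>
+
+  int main(void)
+  {
+          char uio_name[15] = "uio0";  /* up to 14 chars + NUL */
+          char uio_dev_path[15];
+
+          /* "/dev/" takes 5 of 15 bytes, leaving 10; %s may need 14. */
+          snprintf(uio_dev_path, sizeof(uio_dev_path), "/dev/%s", uio_name);
+          return 0;
+  }
+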
--- /dev/null
+From 917110481f6bc1c96b1e54b62bb114137fbc6d17 Mon Sep 17 00:00:00 2001
+From: Steven Rostedt <rostedt@goodmis.org>
+Date: Mon, 16 Dec 2024 21:41:20 -0500
+Subject: tracing: Add missing helper functions in event pointer dereference check
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+commit 917110481f6bc1c96b1e54b62bb114137fbc6d17 upstream.
+
+The process_pointer() helper function looks to see if various trace event
+macros are used. These macros are for storing data in the event. This
+makes it safe to dereference as the dereference will then point into the
+event on the ring buffer where the content of the data stays with the
+event itself.
+
+A few helper functions were missing. Those were:
+
+ __get_rel_dynamic_array()
+ __get_dynamic_array_len()
+ __get_rel_dynamic_array_len()
+ __get_rel_sockaddr()
+
+Also add a helper function find_print_string() to not need to use a middle
+man variable to test if the string exists.
+
+Cc: stable@vger.kernel.org
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Al Viro <viro@ZenIV.linux.org.uk>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Link: https://lore.kernel.org/20241217024720.521836792@goodmis.org
+Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/trace_events.c | 21 +++++++++++++++++++--
+ 1 file changed, 19 insertions(+), 2 deletions(-)
+
+--- a/kernel/trace/trace_events.c
++++ b/kernel/trace/trace_events.c
+@@ -274,6 +274,15 @@ static bool test_field(const char *fmt,
+ return false;
+ }
+
++/* Look for a string within an argument */
++static bool find_print_string(const char *arg, const char *str, const char *end)
++{
++ const char *r;
++
++ r = strstr(arg, str);
++ return r && r < end;
++}
++
+ /* Return true if the argument pointer is safe */
+ static bool process_pointer(const char *fmt, int len, struct trace_event_call *call)
+ {
+@@ -292,9 +301,17 @@ static bool process_pointer(const char *
+ a = strchr(fmt, '&');
+ if ((a && (a < r)) || test_field(r, call))
+ return true;
+- } else if ((r = strstr(fmt, "__get_dynamic_array(")) && r < e) {
++ } else if (find_print_string(fmt, "__get_dynamic_array(", e)) {
++ return true;
++ } else if (find_print_string(fmt, "__get_rel_dynamic_array(", e)) {
++ return true;
++ } else if (find_print_string(fmt, "__get_dynamic_array_len(", e)) {
++ return true;
++ } else if (find_print_string(fmt, "__get_rel_dynamic_array_len(", e)) {
++ return true;
++ } else if (find_print_string(fmt, "__get_sockaddr(", e)) {
+ return true;
+- } else if ((r = strstr(fmt, "__get_sockaddr(")) && r < e) {
++ } else if (find_print_string(fmt, "__get_rel_sockaddr(", e)) {
+ return true;
+ }
+ return false;
--- /dev/null
+From 65a25d9f7ac02e0cf361356e834d1c71d36acca9 Mon Sep 17 00:00:00 2001
+From: Steven Rostedt <rostedt@goodmis.org>
+Date: Mon, 16 Dec 2024 21:41:21 -0500
+Subject: tracing: Add "%s" check in test_event_printk()
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+commit 65a25d9f7ac02e0cf361356e834d1c71d36acca9 upstream.
+
+The test_event_printk() code makes sure that when a trace event is
+registered, any dereferenced pointers in the event's TP_printk() point
+to content in the ring buffer. But currently it does not handle
+"%s", as there are cases where the string pointer saved in the ring buffer
+points to a static string in the kernel that will never be freed. As that
+is a valid case, the pointer needs to be checked at runtime.
+
+Currently the runtime check is done via trace_check_vprintf(), but to not
+have to replicate everything in vsnprintf() it does some logic with the
+va_list that may not be reliable across architectures. In order to get rid
+of that logic, more work needs to be done in test_event_printk(). Some
+of the strings can be validated at this time when it is obvious the string
+is valid because the string will be saved in the ring buffer content.
+
+Do all the validation of strings in the ring buffer at boot in
+test_event_printk(), and make sure that the fields of the strings that
+point into the kernel are accessible. This will allow adding checks at
+runtime that will validate the fields themselves and not rely on parsing
+the TP_printk() format at runtime.
+
+Cc: stable@vger.kernel.org
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Al Viro <viro@ZenIV.linux.org.uk>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Link: https://lore.kernel.org/20241217024720.685917008@goodmis.org
+Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/trace_events.c | 104 +++++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 89 insertions(+), 15 deletions(-)
+
+--- a/kernel/trace/trace_events.c
++++ b/kernel/trace/trace_events.c
+@@ -244,19 +244,16 @@ int trace_event_get_offsets(struct trace
+ return tail->offset + tail->size;
+ }
+
+-/*
+- * Check if the referenced field is an array and return true,
+- * as arrays are OK to dereference.
+- */
+-static bool test_field(const char *fmt, struct trace_event_call *call)
++
++static struct trace_event_fields *find_event_field(const char *fmt,
++ struct trace_event_call *call)
+ {
+ struct trace_event_fields *field = call->class->fields_array;
+- const char *array_descriptor;
+ const char *p = fmt;
+ int len;
+
+ if (!(len = str_has_prefix(fmt, "REC->")))
+- return false;
++ return NULL;
+ fmt += len;
+ for (p = fmt; *p; p++) {
+ if (!isalnum(*p) && *p != '_')
+@@ -267,11 +264,26 @@ static bool test_field(const char *fmt,
+ for (; field->type; field++) {
+ if (strncmp(field->name, fmt, len) || field->name[len])
+ continue;
+- array_descriptor = strchr(field->type, '[');
+- /* This is an array and is OK to dereference. */
+- return array_descriptor != NULL;
++
++ return field;
+ }
+- return false;
++ return NULL;
++}
++
++/*
++ * Check if the referenced field is an array and return true,
++ * as arrays are OK to dereference.
++ */
++static bool test_field(const char *fmt, struct trace_event_call *call)
++{
++ struct trace_event_fields *field;
++
++ field = find_event_field(fmt, call);
++ if (!field)
++ return false;
++
++ /* This is an array and is OK to dereference. */
++ return strchr(field->type, '[') != NULL;
+ }
+
+ /* Look for a string within an argument */
+@@ -317,6 +329,53 @@ static bool process_pointer(const char *
+ return false;
+ }
+
++/* Return true if the string is safe */
++static bool process_string(const char *fmt, int len, struct trace_event_call *call)
++{
++ const char *r, *e, *s;
++
++ e = fmt + len;
++
++ /*
++ * There are several helper functions that return strings.
++ * If the argument contains a function, then assume its field is valid.
++ * It is considered that the argument has a function if it has:
++ * alphanumeric or '_' before a parenthesis.
++ */
++ s = fmt;
++ do {
++ r = strstr(s, "(");
++ if (!r || r >= e)
++ break;
++ for (int i = 1; r - i >= s; i++) {
++ char ch = *(r - i);
++ if (isspace(ch))
++ continue;
++ if (isalnum(ch) || ch == '_')
++ return true;
++ /* Anything else, this isn't a function */
++ break;
++ }
++ /* A function could be wrapped in parentheses, try the next one */
++ s = r + 1;
++ } while (s < e);
++
++ /*
++ * If there are any strings in the argument, consider this arg OK as it
++ * could be: REC->field ? "foo" : "bar" and we don't want to get into
++ * verifying that logic here.
++ */
++ if (find_print_string(fmt, "\"", e))
++ return true;
++
++ /* Dereferenced strings are also valid like any other pointer */
++ if (process_pointer(fmt, len, call))
++ return true;
++
++ /* Make sure the field is found, and consider it OK for now if it is */
++ return find_event_field(fmt, call) != NULL;
++}
++
+ /*
+ * Examine the print fmt of the event looking for unsafe dereference
+ * pointers using %p* that could be recorded in the trace event and
+@@ -326,6 +385,7 @@ static bool process_pointer(const char *
+ static void test_event_printk(struct trace_event_call *call)
+ {
+ u64 dereference_flags = 0;
++ u64 string_flags = 0;
+ bool first = true;
+ const char *fmt;
+ int parens = 0;
+@@ -416,8 +476,16 @@ static void test_event_printk(struct tra
+ star = true;
+ continue;
+ }
+- if ((fmt[i + j] == 's') && star)
+- arg++;
++ if ((fmt[i + j] == 's')) {
++ if (star)
++ arg++;
++ if (WARN_ONCE(arg == 63,
++ "Too many args for event: %s",
++ trace_event_name(call)))
++ return;
++ dereference_flags |= 1ULL << arg;
++ string_flags |= 1ULL << arg;
++ }
+ break;
+ }
+ break;
+@@ -464,7 +532,10 @@ static void test_event_printk(struct tra
+ }
+
+ if (dereference_flags & (1ULL << arg)) {
+- if (process_pointer(fmt + start_arg, e - start_arg, call))
++ if (string_flags & (1ULL << arg)) {
++ if (process_string(fmt + start_arg, e - start_arg, call))
++ dereference_flags &= ~(1ULL << arg);
++ } else if (process_pointer(fmt + start_arg, e - start_arg, call))
+ dereference_flags &= ~(1ULL << arg);
+ }
+
+@@ -476,7 +547,10 @@ static void test_event_printk(struct tra
+ }
+
+ if (dereference_flags & (1ULL << arg)) {
+- if (process_pointer(fmt + start_arg, i - start_arg, call))
++ if (string_flags & (1ULL << arg)) {
++ if (process_string(fmt + start_arg, i - start_arg, call))
++ dereference_flags &= ~(1ULL << arg);
++ } else if (process_pointer(fmt + start_arg, i - start_arg, call))
+ dereference_flags &= ~(1ULL << arg);
+ }
+
--- /dev/null
+From afd2627f727b89496d79a6b934a025fc916d4ded Mon Sep 17 00:00:00 2001
+From: Steven Rostedt <rostedt@goodmis.org>
+Date: Mon, 16 Dec 2024 21:41:22 -0500
+Subject: tracing: Check "%s" dereference via the field and not the TP_printk format
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+commit afd2627f727b89496d79a6b934a025fc916d4ded upstream.
+
+The TP_printk() portion of a trace event is executed at the time an event
+is read from the trace. This can happen seconds, minutes, hours, days,
+months, possibly even years after the event was recorded. If the print
+format contains a dereference to a string via "%s", and that string was
+allocated, there's a chance that string could be freed before it is read
+by the trace file.
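+
+A hypothetical event that records only the pointer illustrates the
+hazard:
+
+  TP_printk("name=%s", __entry->name)
+
+If __entry->name was kmalloc()ed and is freed after the event was
+recorded, reading the trace later dereferences freed memory.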
+
+To protect against such bugs, there are two functions that verify the
+event. The first one is test_event_printk(), which is called when the
+event is created. It reads the TP_printk() format as well as its arguments
+to make sure nothing dereferences a pointer that was not copied into the
+ring buffer along with the event. If something does, it will trigger a
+WARN_ON().
+
+For strings that use "%s", it is not so easy. The string may not reside in
+the ring buffer but may still be valid. Strings that are static and part
+of the kernel proper, which will not be freed for the life of the running
+system, are safe to dereference. But whether a pointer refers to a static
+string or to something on the heap cannot be determined until the event
+is triggered.
+
+This brings us to the second function that tests for the bad dereferencing
+of strings, trace_check_vprintf(). It would walk through the printf format
+looking for "%s", and when it finds it, it would validate that the pointer
+is safe to read. If not, it would produce a WARN_ON() as well and write
+"[UNSAFE-MEMORY]" into the trace output.
+
+The problem with this is how it used va_list to have vsnprintf() handle
+all the cases that it didn't need to check. Instead of re-implementing
+vsnprintf(), it would make a copy of the format up to the %s part, and
+call vsnprintf() with the current va_list ap variable, where the ap would
+then be ready to point at the string in question.
+
+For architectures that pass va_list by reference this was possible. For
+architectures that pass it by copy it was not. A test_can_verify()
+function was used to differentiate between the two, and on architectures
+where the trick was not possible the verifier was disabled.
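+
+The removed test_can_verify_check(), visible in the diff below, detected
+the difference at boot roughly like this:
+
+  va_start(ap, fmt);
+  vsnprintf(buf, 16, "%d", ap); /* consumes one int only if ap is by-reference */
+  ret = va_arg(ap, int);        /* reads the second argument iff ap advanced */
+  va_end(ap);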
+
+Even for architectures where this was feasible, it was a stretch to rely
+on such an undocumented method, one that could break later with new
+compiler optimizations.
+
+Instead, the first function test_event_printk() was updated to look at
+"%s" as well. If the "%s" argument is a pointer outside the event in the
+ring buffer, it would find the field type of the event that is the problem
+and mark the structure with a new flag called "needs_test". The event
+itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that
+this event has a field that needs to be verified before the event can be
+printed using the printf format.
+
+When the event fields are created from the field type structure, the
+fields would copy the field type's "needs_test" value.
+
+Finally, before being printed, a new function ignore_event() is called
+which will check if the event has the TEST_STR flag set (if not, it
+returns false). If the flag is set, it then iterates through the event's
+fields looking for the ones that have the "needs_test" flag set.
+
+Then it uses the offset field from the field structure to find the pointer
+in the ring buffer event. It runs the tests to make sure that pointer is
+safe to print and if not, it triggers the WARN_ON() and also adds to the
+trace output that the event in question has an unsafe memory access.
+
+ignore_event() makes trace_check_vprintf() obsolete, so the latter is
+removed.
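+
+Condensed from the diff below, the runtime check in ignore_event()
+amounts to:
+
+  if (event->flags & TRACE_EVENT_FL_TEST_STR) {
+          list_for_each_entry(field, head, link) {
+                  if (!field->needs_test)
+                          continue;
+                  str = *(const char **)((void *)iter->ent + field->offset);
+                  if (!trace_safe_str(iter, str))
+                          return true;  /* warn and skip printing the event */
+          }
+  }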
+
+Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/
+
+Cc: stable@vger.kernel.org
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Al Viro <viro@ZenIV.linux.org.uk>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org
+Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/trace_events.h | 6 -
+ kernel/trace/trace.c | 255 ++++++++-----------------------------------
+ kernel/trace/trace.h | 6 -
+ kernel/trace/trace_events.c | 32 +++--
+ kernel/trace/trace_output.c | 6 -
+ 5 files changed, 88 insertions(+), 217 deletions(-)
+
+--- a/include/linux/trace_events.h
++++ b/include/linux/trace_events.h
+@@ -285,7 +285,8 @@ struct trace_event_fields {
+ const char *name;
+ const int size;
+ const int align;
+- const int is_signed;
++ const unsigned int is_signed:1;
++ unsigned int needs_test:1;
+ const int filter_type;
+ const int len;
+ };
+@@ -337,6 +338,7 @@ enum {
+ TRACE_EVENT_FL_EPROBE_BIT,
+ TRACE_EVENT_FL_FPROBE_BIT,
+ TRACE_EVENT_FL_CUSTOM_BIT,
++ TRACE_EVENT_FL_TEST_STR_BIT,
+ };
+
+ /*
+@@ -354,6 +356,7 @@ enum {
+ * CUSTOM - Event is a custom event (to be attached to an exsiting tracepoint)
+ * This is set when the custom event has not been attached
+ * to a tracepoint yet, then it is cleared when it is.
++ * TEST_STR - The event has a "%s" that points to a string outside the event
+ */
+ enum {
+ TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT),
+@@ -367,6 +370,7 @@ enum {
+ TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT),
+ TRACE_EVENT_FL_FPROBE = (1 << TRACE_EVENT_FL_FPROBE_BIT),
+ TRACE_EVENT_FL_CUSTOM = (1 << TRACE_EVENT_FL_CUSTOM_BIT),
++ TRACE_EVENT_FL_TEST_STR = (1 << TRACE_EVENT_FL_TEST_STR_BIT),
+ };
+
+ #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE)
+--- a/kernel/trace/trace.c
++++ b/kernel/trace/trace.c
+@@ -3635,17 +3635,12 @@ char *trace_iter_expand_format(struct tr
+ }
+
+ /* Returns true if the string is safe to dereference from an event */
+-static bool trace_safe_str(struct trace_iterator *iter, const char *str,
+- bool star, int len)
++static bool trace_safe_str(struct trace_iterator *iter, const char *str)
+ {
+ unsigned long addr = (unsigned long)str;
+ struct trace_event *trace_event;
+ struct trace_event_call *event;
+
+- /* Ignore strings with no length */
+- if (star && !len)
+- return true;
+-
+ /* OK if part of the event data */
+ if ((addr >= (unsigned long)iter->ent) &&
+ (addr < (unsigned long)iter->ent + iter->ent_size))
+@@ -3685,181 +3680,69 @@ static bool trace_safe_str(struct trace_
+ return false;
+ }
+
+-static DEFINE_STATIC_KEY_FALSE(trace_no_verify);
+-
+-static int test_can_verify_check(const char *fmt, ...)
+-{
+- char buf[16];
+- va_list ap;
+- int ret;
+-
+- /*
+- * The verifier is dependent on vsnprintf() modifies the va_list
+- * passed to it, where it is sent as a reference. Some architectures
+- * (like x86_32) passes it by value, which means that vsnprintf()
+- * does not modify the va_list passed to it, and the verifier
+- * would then need to be able to understand all the values that
+- * vsnprintf can use. If it is passed by value, then the verifier
+- * is disabled.
+- */
+- va_start(ap, fmt);
+- vsnprintf(buf, 16, "%d", ap);
+- ret = va_arg(ap, int);
+- va_end(ap);
+-
+- return ret;
+-}
+-
+-static void test_can_verify(void)
+-{
+- if (!test_can_verify_check("%d %d", 0, 1)) {
+- pr_info("trace event string verifier disabled\n");
+- static_branch_inc(&trace_no_verify);
+- }
+-}
+-
+ /**
+- * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer
++ * ignore_event - Check dereferenced fields while writing to the seq buffer
+ * @iter: The iterator that holds the seq buffer and the event being printed
+- * @fmt: The format used to print the event
+- * @ap: The va_list holding the data to print from @fmt.
+ *
+- * This writes the data into the @iter->seq buffer using the data from
+- * @fmt and @ap. If the format has a %s, then the source of the string
+- * is examined to make sure it is safe to print, otherwise it will
+- * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string
+- * pointer.
++ * At boot up, test_event_printk() will flag any event that dereferences
++ * a string with "%s" that does not exist in the ring buffer. It may still
++ * be valid, as the string may point to a static string in the kernel
++ * rodata that never gets freed. But if the string pointer is pointing
++ * to something that was allocated, there's a chance that it can be freed
++ * by the time the user reads the trace. This would cause a bad memory
++ * access by the kernel and possibly crash the system.
++ *
++ * This function will check if the event has any fields flagged as needing
++ * to be checked at runtime and perform those checks.
++ *
++ * If it is found that a field is unsafe, it will write into the @iter->seq
++ * a message stating what was found to be unsafe.
++ *
++ * @return: true if the event is unsafe and should be ignored,
++ * false otherwise.
+ */
+-void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
+- va_list ap)
++bool ignore_event(struct trace_iterator *iter)
+ {
+- long text_delta = 0;
+- long data_delta = 0;
+- const char *p = fmt;
+- const char *str;
+- bool good;
+- int i, j;
++ struct ftrace_event_field *field;
++ struct trace_event *trace_event;
++ struct trace_event_call *event;
++ struct list_head *head;
++ struct trace_seq *seq;
++ const void *ptr;
+
+- if (WARN_ON_ONCE(!fmt))
+- return;
++ trace_event = ftrace_find_event(iter->ent->type);
+
+- if (static_branch_unlikely(&trace_no_verify))
+- goto print;
++ seq = &iter->seq;
+
+- /*
+- * When the kernel is booted with the tp_printk command line
+- * parameter, trace events go directly through to printk().
+- * It also is checked by this function, but it does not
+- * have an associated trace_array (tr) for it.
+- */
+- if (iter->tr) {
+- text_delta = iter->tr->text_delta;
+- data_delta = iter->tr->data_delta;
++ if (!trace_event) {
++ trace_seq_printf(seq, "EVENT ID %d NOT FOUND?\n", iter->ent->type);
++ return true;
+ }
+
+- /* Don't bother checking when doing a ftrace_dump() */
+- if (iter->fmt == static_fmt_buf)
+- goto print;
+-
+- while (*p) {
+- bool star = false;
+- int len = 0;
+-
+- j = 0;
+-
+- /*
+- * We only care about %s and variants
+- * as well as %p[sS] if delta is non-zero
+- */
+- for (i = 0; p[i]; i++) {
+- if (i + 1 >= iter->fmt_size) {
+- /*
+- * If we can't expand the copy buffer,
+- * just print it.
+- */
+- if (!trace_iter_expand_format(iter))
+- goto print;
+- }
+-
+- if (p[i] == '\\' && p[i+1]) {
+- i++;
+- continue;
+- }
+- if (p[i] == '%') {
+- /* Need to test cases like %08.*s */
+- for (j = 1; p[i+j]; j++) {
+- if (isdigit(p[i+j]) ||
+- p[i+j] == '.')
+- continue;
+- if (p[i+j] == '*') {
+- star = true;
+- continue;
+- }
+- break;
+- }
+- if (p[i+j] == 's')
+- break;
++ event = container_of(trace_event, struct trace_event_call, event);
++ if (!(event->flags & TRACE_EVENT_FL_TEST_STR))
++ return false;
+
+- if (text_delta && p[i+1] == 'p' &&
+- ((p[i+2] == 's' || p[i+2] == 'S')))
+- break;
++ head = trace_get_fields(event);
++ if (!head) {
++ trace_seq_printf(seq, "FIELDS FOR EVENT '%s' NOT FOUND?\n",
++ trace_event_name(event));
++ return true;
++ }
+
+- star = false;
+- }
+- j = 0;
+- }
+- /* If no %s found then just print normally */
+- if (!p[i])
+- break;
++ /* Offsets are from the iter->ent that points to the raw event */
++ ptr = iter->ent;
+
+- /* Copy up to the %s, and print that */
+- strncpy(iter->fmt, p, i);
+- iter->fmt[i] = '\0';
+- trace_seq_vprintf(&iter->seq, iter->fmt, ap);
+-
+- /* Add delta to %pS pointers */
+- if (p[i+1] == 'p') {
+- unsigned long addr;
+- char fmt[4];
+-
+- fmt[0] = '%';
+- fmt[1] = 'p';
+- fmt[2] = p[i+2]; /* Either %ps or %pS */
+- fmt[3] = '\0';
+-
+- addr = va_arg(ap, unsigned long);
+- addr += text_delta;
+- trace_seq_printf(&iter->seq, fmt, (void *)addr);
++ list_for_each_entry(field, head, link) {
++ const char *str;
++ bool good;
+
+- p += i + 3;
++ if (!field->needs_test)
+ continue;
+- }
+
+- /*
+- * If iter->seq is full, the above call no longer guarantees
+- * that ap is in sync with fmt processing, and further calls
+- * to va_arg() can return wrong positional arguments.
+- *
+- * Ensure that ap is no longer used in this case.
+- */
+- if (iter->seq.full) {
+- p = "";
+- break;
+- }
+-
+- if (star)
+- len = va_arg(ap, int);
+-
+- /* The ap now points to the string data of the %s */
+- str = va_arg(ap, const char *);
+-
+- good = trace_safe_str(iter, str, star, len);
++ str = *(const char **)(ptr + field->offset);
+
+- /* Could be from the last boot */
+- if (data_delta && !good) {
+- str += data_delta;
+- good = trace_safe_str(iter, str, star, len);
+- }
++ good = trace_safe_str(iter, str);
+
+ /*
+ * If you hit this warning, it is likely that the
+@@ -3870,44 +3753,14 @@ void trace_check_vprintf(struct trace_it
+ * instead. See samples/trace_events/trace-events-sample.h
+ * for reference.
+ */
+- if (WARN_ONCE(!good, "fmt: '%s' current_buffer: '%s'",
+- fmt, seq_buf_str(&iter->seq.seq))) {
+- int ret;
+-
+- /* Try to safely read the string */
+- if (star) {
+- if (len + 1 > iter->fmt_size)
+- len = iter->fmt_size - 1;
+- if (len < 0)
+- len = 0;
+- ret = copy_from_kernel_nofault(iter->fmt, str, len);
+- iter->fmt[len] = 0;
+- star = false;
+- } else {
+- ret = strncpy_from_kernel_nofault(iter->fmt, str,
+- iter->fmt_size);
+- }
+- if (ret < 0)
+- trace_seq_printf(&iter->seq, "(0x%px)", str);
+- else
+- trace_seq_printf(&iter->seq, "(0x%px:%s)",
+- str, iter->fmt);
+- str = "[UNSAFE-MEMORY]";
+- strcpy(iter->fmt, "%s");
+- } else {
+- strncpy(iter->fmt, p + i, j + 1);
+- iter->fmt[j+1] = '\0';
++ if (WARN_ONCE(!good, "event '%s' has unsafe pointer field '%s'",
++ trace_event_name(event), field->name)) {
++ trace_seq_printf(seq, "EVENT %s: HAS UNSAFE POINTER FIELD '%s'\n",
++ trace_event_name(event), field->name);
++ return true;
+ }
+- if (star)
+- trace_seq_printf(&iter->seq, iter->fmt, len, str);
+- else
+- trace_seq_printf(&iter->seq, iter->fmt, str);
+-
+- p += i + j + 1;
+ }
+- print:
+- if (*p)
+- trace_seq_vprintf(&iter->seq, p, ap);
++ return false;
+ }
+
+ const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
+@@ -10803,8 +10656,6 @@ __init static int tracer_alloc_buffers(v
+
+ register_snapshot_cmd();
+
+- test_can_verify();
+-
+ return 0;
+
+ out_free_pipe_cpumask:
+--- a/kernel/trace/trace.h
++++ b/kernel/trace/trace.h
+@@ -664,9 +664,8 @@ void trace_buffer_unlock_commit_nostack(
+
+ bool trace_is_tracepoint_string(const char *str);
+ const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
+-void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
+- va_list ap) __printf(2, 0);
+ char *trace_iter_expand_format(struct trace_iterator *iter);
++bool ignore_event(struct trace_iterator *iter);
+
+ int trace_empty(struct trace_iterator *iter);
+
+@@ -1402,7 +1401,8 @@ struct ftrace_event_field {
+ int filter_type;
+ int offset;
+ int size;
+- int is_signed;
++ unsigned int is_signed:1;
++ unsigned int needs_test:1;
+ int len;
+ };
+
+--- a/kernel/trace/trace_events.c
++++ b/kernel/trace/trace_events.c
+@@ -82,7 +82,7 @@ static int system_refcount_dec(struct ev
+ }
+
+ static struct ftrace_event_field *
+-__find_event_field(struct list_head *head, char *name)
++__find_event_field(struct list_head *head, const char *name)
+ {
+ struct ftrace_event_field *field;
+
+@@ -114,7 +114,8 @@ trace_find_event_field(struct trace_even
+
+ static int __trace_define_field(struct list_head *head, const char *type,
+ const char *name, int offset, int size,
+- int is_signed, int filter_type, int len)
++ int is_signed, int filter_type, int len,
++ int need_test)
+ {
+ struct ftrace_event_field *field;
+
+@@ -133,6 +134,7 @@ static int __trace_define_field(struct l
+ field->offset = offset;
+ field->size = size;
+ field->is_signed = is_signed;
++ field->needs_test = need_test;
+ field->len = len;
+
+ list_add(&field->link, head);
+@@ -151,13 +153,13 @@ int trace_define_field(struct trace_even
+
+ head = trace_get_fields(call);
+ return __trace_define_field(head, type, name, offset, size,
+- is_signed, filter_type, 0);
++ is_signed, filter_type, 0, 0);
+ }
+ EXPORT_SYMBOL_GPL(trace_define_field);
+
+ static int trace_define_field_ext(struct trace_event_call *call, const char *type,
+ const char *name, int offset, int size, int is_signed,
+- int filter_type, int len)
++ int filter_type, int len, int need_test)
+ {
+ struct list_head *head;
+
+@@ -166,13 +168,13 @@ static int trace_define_field_ext(struct
+
+ head = trace_get_fields(call);
+ return __trace_define_field(head, type, name, offset, size,
+- is_signed, filter_type, len);
++ is_signed, filter_type, len, need_test);
+ }
+
+ #define __generic_field(type, item, filter_type) \
+ ret = __trace_define_field(&ftrace_generic_fields, #type, \
+ #item, 0, 0, is_signed_type(type), \
+- filter_type, 0); \
++ filter_type, 0, 0); \
+ if (ret) \
+ return ret;
+
+@@ -181,7 +183,8 @@ static int trace_define_field_ext(struct
+ "common_" #item, \
+ offsetof(typeof(ent), item), \
+ sizeof(ent.item), \
+- is_signed_type(type), FILTER_OTHER, 0); \
++ is_signed_type(type), FILTER_OTHER, \
++ 0, 0); \
+ if (ret) \
+ return ret;
+
+@@ -332,6 +335,7 @@ static bool process_pointer(const char *
+ /* Return true if the string is safe */
+ static bool process_string(const char *fmt, int len, struct trace_event_call *call)
+ {
++ struct trace_event_fields *field;
+ const char *r, *e, *s;
+
+ e = fmt + len;
+@@ -372,8 +376,16 @@ static bool process_string(const char *f
+ if (process_pointer(fmt, len, call))
+ return true;
+
+- /* Make sure the field is found, and consider it OK for now if it is */
+- return find_event_field(fmt, call) != NULL;
++ /* Make sure the field is found */
++ field = find_event_field(fmt, call);
++ if (!field)
++ return false;
++
++ /* Test this field's string before printing the event */
++ call->flags |= TRACE_EVENT_FL_TEST_STR;
++ field->needs_test = 1;
++
++ return true;
+ }
+
+ /*
+@@ -2586,7 +2598,7 @@ event_define_fields(struct trace_event_c
+ ret = trace_define_field_ext(call, field->type, field->name,
+ offset, field->size,
+ field->is_signed, field->filter_type,
+- field->len);
++ field->len, field->needs_test);
+ if (WARN_ON_ONCE(ret)) {
+ pr_err("error code is %d\n", ret);
+ break;
+--- a/kernel/trace/trace_output.c
++++ b/kernel/trace/trace_output.c
+@@ -317,10 +317,14 @@ EXPORT_SYMBOL(trace_raw_output_prep);
+
+ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...)
+ {
++ struct trace_seq *s = &iter->seq;
+ va_list ap;
+
++ if (ignore_event(iter))
++ return;
++
+ va_start(ap, fmt);
+- trace_check_vprintf(iter, trace_event_format(iter, fmt), ap);
++ trace_seq_vprintf(s, trace_event_format(iter, fmt), ap);
+ va_end(ap);
+ }
+ EXPORT_SYMBOL(trace_event_printf);
--- /dev/null
+From a6629626c584200daf495cc9a740048b455addcd Mon Sep 17 00:00:00 2001
+From: Steven Rostedt <rostedt@goodmis.org>
+Date: Mon, 16 Dec 2024 21:41:19 -0500
+Subject: tracing: Fix test_event_printk() to process entire print argument
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+commit a6629626c584200daf495cc9a740048b455addcd upstream.
+
+The test_event_printk() function analyzes the print formats of trace
+events, looking for cases where an event may dereference a pointer that
+is not in the ring buffer. That can become a bug when the trace event is
+read from the ring buffer and the content behind that pointer no longer
+exists.
+
+The function needs to accurately go from one print format argument to the
+next. It handles quotes and parentheses that may be included in an
+argument. But when it finds the start of the next argument, it uses a
+simple "c = strstr(fmt + i, ',')" to find the end of that argument!
+
+In order to include "%s" dereferencing, it needs to process the entire
+content of the print format argument, not just the content up to the
+first ',' it finds, as an argument may contain content like:
+
+ ({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char
+ *access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"
+ }; union kvm_mmu_page_role role; role.word = REC->role;
+ trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe
+ %sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level,
+ role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "",
+ access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? ""
+ : "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ?
+ "unsync" : "sync", 0); saved_ptr; })
+
+This is a full argument from an existing event. As the code already
+handles finding the next print format argument, process each argument
+once its end has been found rather than at its start. This way both the
+start and the end of the argument are known.
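+
+A smaller, hypothetical argument shows why stopping at the first ','
+is not enough:
+
+  ({ static const char *str[] = { "on", "off" }; str[REC->state]; })
+
+Here the first ',' sits inside the array initializer, so scanning only
+the text before it never reaches "REC->state".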
+
+Add a helper function "process_pointer()" that will do the processing during
+the loop as well as at the end. It also makes the code cleaner and easier
+to read.
+
+Cc: stable@vger.kernel.org
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Al Viro <viro@ZenIV.linux.org.uk>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org
+Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/trace_events.c | 82 ++++++++++++++++++++++++++++----------------
+ 1 file changed, 53 insertions(+), 29 deletions(-)
+
+--- a/kernel/trace/trace_events.c
++++ b/kernel/trace/trace_events.c
+@@ -265,8 +265,7 @@ static bool test_field(const char *fmt,
+ len = p - fmt;
+
+ for (; field->type; field++) {
+- if (strncmp(field->name, fmt, len) ||
+- field->name[len])
++ if (strncmp(field->name, fmt, len) || field->name[len])
+ continue;
+ array_descriptor = strchr(field->type, '[');
+ /* This is an array and is OK to dereference. */
+@@ -275,6 +274,32 @@ static bool test_field(const char *fmt,
+ return false;
+ }
+
++/* Return true if the argument pointer is safe */
++static bool process_pointer(const char *fmt, int len, struct trace_event_call *call)
++{
++ const char *r, *e, *a;
++
++ e = fmt + len;
++
++ /* Find the REC-> in the argument */
++ r = strstr(fmt, "REC->");
++ if (r && r < e) {
++ /*
++ * Addresses of events on the buffer, or an array on the buffer is
++ * OK to dereference. There's ways to fool this, but
++ * this is to catch common mistakes, not malicious code.
++ */
++ a = strchr(fmt, '&');
++ if ((a && (a < r)) || test_field(r, call))
++ return true;
++ } else if ((r = strstr(fmt, "__get_dynamic_array(")) && r < e) {
++ return true;
++ } else if ((r = strstr(fmt, "__get_sockaddr(")) && r < e) {
++ return true;
++ }
++ return false;
++}
++
+ /*
+ * Examine the print fmt of the event looking for unsafe dereference
+ * pointers using %p* that could be recorded in the trace event and
+@@ -285,12 +310,12 @@ static void test_event_printk(struct tra
+ {
+ u64 dereference_flags = 0;
+ bool first = true;
+- const char *fmt, *c, *r, *a;
++ const char *fmt;
+ int parens = 0;
+ char in_quote = 0;
+ int start_arg = 0;
+ int arg = 0;
+- int i;
++ int i, e;
+
+ fmt = call->print_fmt;
+
+@@ -403,42 +428,41 @@ static void test_event_printk(struct tra
+ case ',':
+ if (in_quote || parens)
+ continue;
++ e = i;
+ i++;
+ while (isspace(fmt[i]))
+ i++;
+- start_arg = i;
+- if (!(dereference_flags & (1ULL << arg)))
+- goto next_arg;
+
+- /* Find the REC-> in the argument */
+- c = strchr(fmt + i, ',');
+- r = strstr(fmt + i, "REC->");
+- if (r && (!c || r < c)) {
+- /*
+- * Addresses of events on the buffer,
+- * or an array on the buffer is
+- * OK to dereference.
+- * There's ways to fool this, but
+- * this is to catch common mistakes,
+- * not malicious code.
+- */
+- a = strchr(fmt + i, '&');
+- if ((a && (a < r)) || test_field(r, call))
++ /*
++ * If start_arg is zero, then this is the start of the
++ * first argument. The processing of the argument happens
++ * when the end of the argument is found, as it needs to
++ * handle parentheses and such.
++ */
++ if (!start_arg) {
++ start_arg = i;
++ /* Balance out the i++ in the for loop */
++ i--;
++ continue;
++ }
++
++ if (dereference_flags & (1ULL << arg)) {
++ if (process_pointer(fmt + start_arg, e - start_arg, call))
+ dereference_flags &= ~(1ULL << arg);
+- } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) &&
+- (!c || r < c)) {
+- dereference_flags &= ~(1ULL << arg);
+- } else if ((r = strstr(fmt + i, "__get_sockaddr(")) &&
+- (!c || r < c)) {
+- dereference_flags &= ~(1ULL << arg);
+ }
+
+- next_arg:
+- i--;
++ start_arg = i;
+ arg++;
++ /* Balance out the i++ in the for loop */
++ i--;
+ }
+ }
+
++ if (dereference_flags & (1ULL << arg)) {
++ if (process_pointer(fmt + start_arg, i - start_arg, call))
++ dereference_flags &= ~(1ULL << arg);
++ }
++
+ /*
+ * If you triggered the below warning, the trace event reported
+ * uses an unsafe dereference pointer %p*. As the data stored
--- /dev/null
+From bcc80dec91ee745b3d66f3e48f0ec2efdea97149 Mon Sep 17 00:00:00 2001
+From: Naman Jain <namjain@linux.microsoft.com>
+Date: Tue, 17 Sep 2024 11:09:17 +0530
+Subject: x86/hyperv: Fix hv tsc page based sched_clock for hibernation
+
+From: Naman Jain <namjain@linux.microsoft.com>
+
+commit bcc80dec91ee745b3d66f3e48f0ec2efdea97149 upstream.
+
+read_hv_sched_clock_tsc() assumes that the Hyper-V clock counter is
+bigger than the variable hv_sched_clock_offset, which is cached during
+early boot. Depending on the timing, this assumption may be false when a
+hibernated VM starts again (the clock counter starts from 0 again) and
+resumes (note: hv_init_tsc_clocksource() is not called during
+hibernation/resume). Consequently, read_hv_sched_clock_tsc() may return
+a negative integer (which is interpreted as a huge positive integer
+since the return type is u64), and new kernel messages are prefixed with
+huge timestamps until the clock counter grows bigger than the cached
+offset again (which typically takes several seconds).
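+
+read_hv_sched_clock_tsc() essentially computes (the reference counter
+ticks in 100ns units):
+
+  /* after hibernation the counter restarts near 0, so that        */
+  /* counter < hv_sched_clock_offset and the u64 subtraction wraps */
+  (hv_read_reference_counter() - hv_sched_clock_offset) * 100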
+
+Fix the issue by saving the Hyper-V clock counter just before suspend,
+and using it to correct hv_sched_clock_offset on resume. This makes the
+hv tsc page based sched_clock continuous and ensures that, post resume,
+it starts from where it left off at suspend. Override the
+x86_platform.save_sched_clock_state and
+x86_platform.restore_sched_clock_state routines to apply the correction
+as early as possible.
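+
+The correction implemented below folds the counter reset into the
+cached offset:
+
+  /* at suspend */
+  hv_ref_counter_at_suspend = hv_read_reference_counter();
+
+  /* at resume */
+  hv_sched_clock_offset -= hv_ref_counter_at_suspend -
+                           hv_read_reference_counter();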
+
+Note: if Invariant TSC is available, the issue doesn't happen because
+1) we don't register read_hv_sched_clock_tsc() for sched clock:
+See commit e5313f1c5404 ("clocksource/drivers/hyper-v: Rework
+clocksource and sched clock setup");
+2) the common x86 code adjusts TSC similarly: see
+__restore_processor_state() -> tsc_verify_tsc_adjust(true) and
+x86_platform.restore_sched_clock_state().
+
+Cc: stable@vger.kernel.org
+Fixes: 1349401ff1aa ("clocksource/drivers/hyper-v: Suspend/resume Hyper-V clocksource for hibernation")
+Co-developed-by: Dexuan Cui <decui@microsoft.com>
+Signed-off-by: Dexuan Cui <decui@microsoft.com>
+Signed-off-by: Naman Jain <namjain@linux.microsoft.com>
+Reviewed-by: Michael Kelley <mhklinux@outlook.com>
+Link: https://lore.kernel.org/r/20240917053917.76787-1-namjain@linux.microsoft.com
+Signed-off-by: Wei Liu <wei.liu@kernel.org>
+Message-ID: <20240917053917.76787-1-namjain@linux.microsoft.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/cpu/mshyperv.c | 58 +++++++++++++++++++++++++++++++++++++
+ drivers/clocksource/hyperv_timer.c | 14 ++++++++
+ include/clocksource/hyperv_timer.h | 2 +
+ 3 files changed, 73 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kernel/cpu/mshyperv.c
++++ b/arch/x86/kernel/cpu/mshyperv.c
+@@ -223,6 +223,63 @@ static void hv_machine_crash_shutdown(st
+ hyperv_cleanup();
+ }
+ #endif /* CONFIG_CRASH_DUMP */
++
++static u64 hv_ref_counter_at_suspend;
++static void (*old_save_sched_clock_state)(void);
++static void (*old_restore_sched_clock_state)(void);
++
++/*
++ * Hyper-V clock counter resets during hibernation. Save and restore clock
++ * offset during suspend/resume, while also considering the time passed
++ * before suspend. This is to make sure that sched_clock using hv tsc page
++ * based clocksource, proceeds from where it left off during suspend and
++ * it shows correct time for the timestamps of kernel messages after resume.
++ */
++static void save_hv_clock_tsc_state(void)
++{
++ hv_ref_counter_at_suspend = hv_read_reference_counter();
++}
++
++static void restore_hv_clock_tsc_state(void)
++{
++ /*
++ * Adjust the offsets used by hv tsc clocksource to
++ * account for the time spent before hibernation.
++ * adjusted value = reference counter (time) at suspend
++ * - reference counter (time) now.
++ */
++ hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter());
++}
++
++/*
++ * Functions to override save_sched_clock_state and restore_sched_clock_state
++ * functions of x86_platform. The Hyper-V clock counter is reset during
++ * suspend-resume and the offset used to measure time needs to be
++ * corrected, post resume.
++ */
++static void hv_save_sched_clock_state(void)
++{
++ old_save_sched_clock_state();
++ save_hv_clock_tsc_state();
++}
++
++static void hv_restore_sched_clock_state(void)
++{
++ restore_hv_clock_tsc_state();
++ old_restore_sched_clock_state();
++}
++
++static void __init x86_setup_ops_for_tsc_pg_clock(void)
++{
++ if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
++ return;
++
++ old_save_sched_clock_state = x86_platform.save_sched_clock_state;
++ x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
++
++ old_restore_sched_clock_state = x86_platform.restore_sched_clock_state;
++ x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
++}
+ #endif /* CONFIG_HYPERV */
+
+ static uint32_t __init ms_hyperv_platform(void)
+@@ -579,6 +636,7 @@ static void __init ms_hyperv_init_platfo
+
+ /* Register Hyper-V specific clocksource */
+ hv_init_clocksource();
++ x86_setup_ops_for_tsc_pg_clock();
+ hv_vtl_init_platform();
+ #endif
+ /*
+--- a/drivers/clocksource/hyperv_timer.c
++++ b/drivers/clocksource/hyperv_timer.c
+@@ -27,7 +27,8 @@
+ #include <asm/mshyperv.h>
+
+ static struct clock_event_device __percpu *hv_clock_event;
+-static u64 hv_sched_clock_offset __ro_after_init;
++/* Note: offset can hold negative values after hibernation. */
++static u64 hv_sched_clock_offset __read_mostly;
+
+ /*
+ * If false, we're using the old mechanism for stimer0 interrupts
+@@ -470,6 +471,17 @@ static void resume_hv_clock_tsc(struct c
+ hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
+ }
+
++/*
++ * Called during resume from hibernation, from overridden
++ * x86_platform.restore_sched_clock_state routine. This is to adjust offsets
++ * used to calculate time for hv tsc page based sched_clock, to account for
++ * time spent before hibernation.
++ */
++void hv_adj_sched_clock_offset(u64 offset)
++{
++ hv_sched_clock_offset -= offset;
++}
++
+ #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK
+ static int hv_cs_enable(struct clocksource *cs)
+ {
+--- a/include/clocksource/hyperv_timer.h
++++ b/include/clocksource/hyperv_timer.h
+@@ -38,6 +38,8 @@ extern void hv_remap_tsc_clocksource(voi
+ extern unsigned long hv_get_tsc_pfn(void);
+ extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
+
++extern void hv_adj_sched_clock_offset(u64 offset);
++
+ static __always_inline bool
+ hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
+ u64 *cur_tsc, u64 *time)