git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.12-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 23 Dec 2024 11:50:45 +0000 (12:50 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 23 Dec 2024 11:50:45 +0000 (12:50 +0100)
added patches:
accel-ivpu-fix-general-protection-fault-in-ivpu_bo_list.patch
accel-ivpu-fix-warn-in-ivpu_ipc_send_receive_internal.patch
btrfs-fix-improper-generation-check-in-snapshot-delete.patch
btrfs-split-bios-to-the-fs-sector-size-boundary.patch
btrfs-tree-checker-reject-inline-extent-items-with-0-ref-count.patch
drivers-hv-util-avoid-accessing-a-ringbuffer-not-initialized-yet.patch
drm-amdgpu-gfx12-fix-ip-version-check.patch
drm-amdgpu-mmhub4.1-fix-ip-version-check.patch
drm-amdgpu-nbio7.0-fix-ip-version-check.patch
fgraph-still-initialize-idle-shadow-stacks-when-starting.patch
io_uring-check-if-iowq-is-killed-before-queuing.patch
io_uring-fix-registered-ring-file-refcount-leak.patch
kvm-x86-play-nice-with-protected-guests-in-complete_hypercall_exit.patch
s390-mm-fix-directmap-accounting.patch
selftests-bpf-use-asm-constraint-m-for-loongarch.patch
selftests-memfd-run-sysctl-tests-when-pid-namespace-support-is-enabled.patch
smb-client-fix-tcp-timers-deadlock-after-rmmod.patch
tools-hv-fix-a-complier-warning-in-the-fcopy-uio-daemon.patch
tracing-add-missing-helper-functions-in-event-pointer-dereference-check.patch
tracing-add-s-check-in-test_event_printk.patch
tracing-check-s-dereference-via-the-field-and-not-the-tp_printk-format.patch
tracing-fix-test_event_printk-to-process-entire-print-argument.patch
x86-hyperv-fix-hv-tsc-page-based-sched_clock-for-hibernation.patch

25 files changed:
queue-6.12/accel-ivpu-fix-general-protection-fault-in-ivpu_bo_list.patch [new file with mode: 0644]
queue-6.12/accel-ivpu-fix-warn-in-ivpu_ipc_send_receive_internal.patch [new file with mode: 0644]
queue-6.12/btrfs-fix-improper-generation-check-in-snapshot-delete.patch [new file with mode: 0644]
queue-6.12/btrfs-split-bios-to-the-fs-sector-size-boundary.patch [new file with mode: 0644]
queue-6.12/btrfs-tree-checker-reject-inline-extent-items-with-0-ref-count.patch [new file with mode: 0644]
queue-6.12/cxl-pci-fix-potential-bogus-return-value-upon-successful-probing.patch [deleted file]
queue-6.12/drivers-hv-util-avoid-accessing-a-ringbuffer-not-initialized-yet.patch [new file with mode: 0644]
queue-6.12/drm-amdgpu-gfx12-fix-ip-version-check.patch [new file with mode: 0644]
queue-6.12/drm-amdgpu-mmhub4.1-fix-ip-version-check.patch [new file with mode: 0644]
queue-6.12/drm-amdgpu-nbio7.0-fix-ip-version-check.patch [new file with mode: 0644]
queue-6.12/fgraph-still-initialize-idle-shadow-stacks-when-starting.patch [new file with mode: 0644]
queue-6.12/io_uring-check-if-iowq-is-killed-before-queuing.patch [new file with mode: 0644]
queue-6.12/io_uring-fix-registered-ring-file-refcount-leak.patch [new file with mode: 0644]
queue-6.12/kvm-x86-play-nice-with-protected-guests-in-complete_hypercall_exit.patch [new file with mode: 0644]
queue-6.12/s390-mm-fix-directmap-accounting.patch [new file with mode: 0644]
queue-6.12/selftests-bpf-use-asm-constraint-m-for-loongarch.patch [new file with mode: 0644]
queue-6.12/selftests-memfd-run-sysctl-tests-when-pid-namespace-support-is-enabled.patch [new file with mode: 0644]
queue-6.12/series
queue-6.12/smb-client-fix-tcp-timers-deadlock-after-rmmod.patch [new file with mode: 0644]
queue-6.12/tools-hv-fix-a-complier-warning-in-the-fcopy-uio-daemon.patch [new file with mode: 0644]
queue-6.12/tracing-add-missing-helper-functions-in-event-pointer-dereference-check.patch [new file with mode: 0644]
queue-6.12/tracing-add-s-check-in-test_event_printk.patch [new file with mode: 0644]
queue-6.12/tracing-check-s-dereference-via-the-field-and-not-the-tp_printk-format.patch [new file with mode: 0644]
queue-6.12/tracing-fix-test_event_printk-to-process-entire-print-argument.patch [new file with mode: 0644]
queue-6.12/x86-hyperv-fix-hv-tsc-page-based-sched_clock-for-hibernation.patch [new file with mode: 0644]

diff --git a/queue-6.12/accel-ivpu-fix-general-protection-fault-in-ivpu_bo_list.patch b/queue-6.12/accel-ivpu-fix-general-protection-fault-in-ivpu_bo_list.patch
new file mode 100644 (file)
index 0000000..4fa6ea7
--- /dev/null
@@ -0,0 +1,33 @@
+From 4b2efb9db0c22a130bbd1275e489b42c02d08050 Mon Sep 17 00:00:00 2001
+From: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+Date: Tue, 10 Dec 2024 14:09:37 +0100
+Subject: accel/ivpu: Fix general protection fault in ivpu_bo_list()
+
+From: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+
+commit 4b2efb9db0c22a130bbd1275e489b42c02d08050 upstream.
+
+Check if ctx is not NULL before accessing its fields.
+
+Fixes: 37dee2a2f433 ("accel/ivpu: Improve buffer object debug logs")
+Cc: stable@vger.kernel.org # v6.8
+Reviewed-by: Karol Wachowski <karol.wachowski@intel.com>
+Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
+Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20241210130939.1575610-2-jacek.lawrynowicz@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/accel/ivpu/ivpu_gem.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/accel/ivpu/ivpu_gem.c
++++ b/drivers/accel/ivpu/ivpu_gem.c
+@@ -406,7 +406,7 @@ static void ivpu_bo_print_info(struct iv
+       mutex_lock(&bo->lock);
+       drm_printf(p, "%-9p %-3u 0x%-12llx %-10lu 0x%-8x %-4u",
+-                 bo, bo->ctx->id, bo->vpu_addr, bo->base.base.size,
++                 bo, bo->ctx ? bo->ctx->id : 0, bo->vpu_addr, bo->base.base.size,
+                  bo->flags, kref_read(&bo->base.base.refcount));
+       if (bo->base.pages)
diff --git a/queue-6.12/accel-ivpu-fix-warn-in-ivpu_ipc_send_receive_internal.patch b/queue-6.12/accel-ivpu-fix-warn-in-ivpu_ipc_send_receive_internal.patch
new file mode 100644 (file)
index 0000000..96e4ea0
--- /dev/null
@@ -0,0 +1,43 @@
+From 0f6482caa6acdfdfc744db7430771fe7e6c4e787 Mon Sep 17 00:00:00 2001
+From: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+Date: Tue, 10 Dec 2024 14:09:39 +0100
+Subject: accel/ivpu: Fix WARN in ivpu_ipc_send_receive_internal()
+
+From: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+
+commit 0f6482caa6acdfdfc744db7430771fe7e6c4e787 upstream.
+
+Move pm_runtime_set_active() to ivpu_pm_init() so when
+ivpu_ipc_send_receive_internal() is executed before ivpu_pm_enable()
+it already has correct runtime state, even if last resume was
+not successful.
+
+Fixes: 8ed520ff4682 ("accel/ivpu: Move set autosuspend delay to HW specific code")
+Cc: stable@vger.kernel.org # v6.7+
+Reviewed-by: Karol Wachowski <karol.wachowski@intel.com>
+Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
+Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20241210130939.1575610-4-jacek.lawrynowicz@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/accel/ivpu/ivpu_pm.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/accel/ivpu/ivpu_pm.c
++++ b/drivers/accel/ivpu/ivpu_pm.c
+@@ -364,6 +364,7 @@ void ivpu_pm_init(struct ivpu_device *vd
+       pm_runtime_use_autosuspend(dev);
+       pm_runtime_set_autosuspend_delay(dev, delay);
++      pm_runtime_set_active(dev);
+       ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay);
+ }
+@@ -378,7 +379,6 @@ void ivpu_pm_enable(struct ivpu_device *
+ {
+       struct device *dev = vdev->drm.dev;
+-      pm_runtime_set_active(dev);
+       pm_runtime_allow(dev);
+       pm_runtime_mark_last_busy(dev);
+       pm_runtime_put_autosuspend(dev);
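
Putting the two hunks above side by side, the resulting runtime-PM ordering is easier to see in one place. The sketch below is illustrative only: the pm_runtime_* calls are the ones from the patch, while the wrapper names and the delay parameter are placeholders, not the ivpu driver code.

/* Sketch: consolidated ordering after the patch. */
static void example_pm_init(struct device *dev, int delay)
{
	pm_runtime_use_autosuspend(dev);
	pm_runtime_set_autosuspend_delay(dev, delay);
	pm_runtime_set_active(dev);		/* now marked active at init time */
}

static void example_pm_enable(struct device *dev)
{
	pm_runtime_allow(dev);			/* set_active() no longer done here */
	pm_runtime_mark_last_busy(dev);
	pm_runtime_put_autosuspend(dev);
}
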
diff --git a/queue-6.12/btrfs-fix-improper-generation-check-in-snapshot-delete.patch b/queue-6.12/btrfs-fix-improper-generation-check-in-snapshot-delete.patch
new file mode 100644 (file)
index 0000000..1ba7fa3
--- /dev/null
@@ -0,0 +1,127 @@
+From d75d72a858f0c00ca8ae161b48cdb403807be4de Mon Sep 17 00:00:00 2001
+From: Josef Bacik <josef@toxicpanda.com>
+Date: Wed, 13 Nov 2024 11:11:55 -0500
+Subject: btrfs: fix improper generation check in snapshot delete
+
+From: Josef Bacik <josef@toxicpanda.com>
+
+commit d75d72a858f0c00ca8ae161b48cdb403807be4de upstream.
+
+We have been using the following check
+
+   if (generation <= root->root_key.offset)
+
+to make decisions about whether or not to visit a node during snapshot
+delete.  This is because for normal subvolumes this is set to 0, and for
+snapshots it's set to the creation generation.  The idea being that if
+the generation of the node is less than or equal to our creation
+generation then we don't need to visit that node, because it doesn't
+belong to us, we can simply drop our reference and move on.
+
+However reloc roots don't have their generation stored in
+root->root_key.offset, instead that is the objectid of their
+corresponding fs root.  This means we can incorrectly not walk into
+nodes that need to be dropped when deleting a reloc root.
+
+There are a variety of consequences to making the wrong choice in two
+distinct areas.
+
+visit_node_for_delete()
+
+1. False positive.  We think we are newer than the block when we really
+   aren't.  We don't visit the node and drop our reference to the node
+   and carry on.  This would result in leaked space.
+2. False negative.  We do decide to walk down into a block that we
+   should have just dropped our reference to.  However this means that
+   the child node will have refs > 1, so we will switch to
+   UPDATE_BACKREF, and then the subsequent walk_down_proc() will notice
+   that btrfs_header_owner(node) != root->root_key.objectid and it'll
+   break out of the loop, and then walk_up_proc() will drop our reference,
+   so this appears to be ok.
+
+do_walk_down()
+
+1. False positive.  We are in UPDATE_BACKREF and incorrectly decide that
+   we are done and don't need to update the backref for our lower nodes.
+   This is another case that simply won't happen with relocation, as we
+   only have to do UPDATE_BACKREF if the node below us was shared and
+   didn't have FULL_BACKREF set, and since we don't own that node
+   because we're a reloc root we actually won't end up in this case.
+2. False negative.  Again this is tricky because as described above, we
+   simply wouldn't be here from relocation, because we don't own any of
+   the nodes because we never set btrfs_header_owner() to the reloc root
+   objectid, and we always use FULL_BACKREF, we never actually need to
+   set FULL_BACKREF on any children.
+
+Having spent a lot of time stressing relocation/snapshot delete recently
+I've not seen this pop in practice.  But this is objectively incorrect,
+so fix this to get the correct starting generation based on the root
+we're dropping to keep me from thinking there's a problem here.
+
+CC: stable@vger.kernel.org
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/ctree.h       |   19 +++++++++++++++++++
+ fs/btrfs/extent-tree.c |    6 +++---
+ 2 files changed, 22 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -371,6 +371,25 @@ static inline void btrfs_set_root_last_t
+ }
+ /*
++ * Return the generation this root started with.
++ *
++ * Every normal root that is created with root->root_key.offset set to it's
++ * originating generation.  If it is a snapshot it is the generation when the
++ * snapshot was created.
++ *
++ * However for TREE_RELOC roots root_key.offset is the objectid of the owning
++ * tree root.  Thankfully we copy the root item of the owning tree root, which
++ * has it's last_snapshot set to what we would have root_key.offset set to, so
++ * return that if this is a TREE_RELOC root.
++ */
++static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root)
++{
++      if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID)
++              return btrfs_root_last_snapshot(&root->root_item);
++      return root->root_key.offset;
++}
++
++/*
+  * Structure that conveys information about an extent that is going to replace
+  * all the extents in a file range.
+  */
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -5308,7 +5308,7 @@ static bool visit_node_for_delete(struct
+        * reference to it.
+        */
+       generation = btrfs_node_ptr_generation(eb, slot);
+-      if (!wc->update_ref || generation <= root->root_key.offset)
++      if (!wc->update_ref || generation <= btrfs_root_origin_generation(root))
+               return false;
+       /*
+@@ -5363,7 +5363,7 @@ static noinline void reada_walk_down(str
+                       goto reada;
+               if (wc->stage == UPDATE_BACKREF &&
+-                  generation <= root->root_key.offset)
++                  generation <= btrfs_root_origin_generation(root))
+                       continue;
+               /* We don't lock the tree block, it's OK to be racy here */
+@@ -5706,7 +5706,7 @@ static noinline int do_walk_down(struct
+        * for the subtree
+        */
+       if (wc->stage == UPDATE_BACKREF &&
+-          generation <= root->root_key.offset) {
++          generation <= btrfs_root_origin_generation(root)) {
+               wc->lookup_info = 1;
+               return 1;
+       }
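
As an aside, the reloc-root mismatch described in the changelog above can be shown with a tiny user-space sketch. Everything below is illustrative: the struct, the numeric values and origin_generation() are made up for demonstration (the helper simply mirrors the btrfs_root_origin_generation() added by the hunk above), and only BTRFS_TREE_RELOC_OBJECTID is taken from the kernel headers.

#include <stdio.h>

#define BTRFS_TREE_RELOC_OBJECTID (-8ULL)	/* as defined in the kernel */

struct fake_root {
	unsigned long long objectid;		/* root_key.objectid */
	unsigned long long offset;		/* root_key.offset */
	unsigned long long last_snapshot;	/* root_item.last_snapshot */
};

/* Mirrors the btrfs_root_origin_generation() helper added above. */
static unsigned long long origin_generation(const struct fake_root *r)
{
	return r->objectid == BTRFS_TREE_RELOC_OBJECTID ?
	       r->last_snapshot : r->offset;
}

int main(void)
{
	/* Hypothetical reloc root: root_key.offset holds the owning fs
	 * root's objectid (257 here), not a generation; the usable
	 * baseline lives in last_snapshot. */
	struct fake_root reloc = { BTRFS_TREE_RELOC_OBJECTID, 257, 84000 };
	unsigned long long node_gen = 80000;

	/* Old check compares a generation against an objectid... */
	printf("old check skips node: %d\n", node_gen <= reloc.offset);
	/* ...new check compares against the real creation generation. */
	printf("new check skips node: %d\n",
	       node_gen <= origin_generation(&reloc));
	return 0;
}
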
diff --git a/queue-6.12/btrfs-split-bios-to-the-fs-sector-size-boundary.patch b/queue-6.12/btrfs-split-bios-to-the-fs-sector-size-boundary.patch
new file mode 100644 (file)
index 0000000..04da461
--- /dev/null
@@ -0,0 +1,47 @@
+From be691b5e593f2cc8cef67bbc59c1fb91b74a86a9 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Mon, 4 Nov 2024 07:26:33 +0100
+Subject: btrfs: split bios to the fs sector size boundary
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit be691b5e593f2cc8cef67bbc59c1fb91b74a86a9 upstream.
+
+Btrfs like other file systems can't really deal with I/O not aligned to
+it's internal block size (which strangely is called sector size in
+btrfs, for historical reasons), but the block layer split helper doesn't
+even know about that.
+
+Round down the split boundary so that all I/Os are aligned.
+
+Fixes: d5e4377d5051 ("btrfs: split zone append bios in btrfs_submit_bio")
+CC: stable@vger.kernel.org # 6.12
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Damien Le Moal <dlemoal@kernel.org>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/bio.c |   10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/bio.c
++++ b/fs/btrfs/bio.c
+@@ -649,8 +649,14 @@ static u64 btrfs_append_map_length(struc
+       map_length = min(map_length, bbio->fs_info->max_zone_append_size);
+       sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits,
+                                       &nr_segs, map_length);
+-      if (sector_offset)
+-              return sector_offset << SECTOR_SHIFT;
++      if (sector_offset) {
++              /*
++               * bio_split_rw_at() could split at a size smaller than our
++               * sectorsize and thus cause unaligned I/Os.  Fix that by
++               * always rounding down to the nearest boundary.
++               */
++              return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize);
++      }
+       return map_length;
+ }
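
To put a number on the rounding above: with a hypothetical 4 KiB fs sector size, a split point of 9 disk sectors (4608 bytes) is pulled back to 4096 bytes so the split never lands inside an fs sector. A standalone sketch (the two macros are re-declared here only so the example compiles on its own; the kernel versions live in existing headers):

#include <stdio.h>

#define SECTOR_SHIFT	9	/* 512-byte disk sectors */
/* Power-of-two alignment only, which is all the example needs. */
#define ALIGN_DOWN(x, a)	((x) & ~((unsigned long long)(a) - 1))

int main(void)
{
	unsigned long long sector_offset = 9;	/* hypothetical split point */
	unsigned int sectorsize = 4096;		/* hypothetical fs sector size */
	unsigned long long bytes = sector_offset << SECTOR_SHIFT;

	printf("unaligned split: %llu bytes\n", bytes);				/* 4608 */
	printf("aligned split:   %llu bytes\n", ALIGN_DOWN(bytes, sectorsize));	/* 4096 */
	return 0;
}
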
diff --git a/queue-6.12/btrfs-tree-checker-reject-inline-extent-items-with-0-ref-count.patch b/queue-6.12/btrfs-tree-checker-reject-inline-extent-items-with-0-ref-count.patch
new file mode 100644 (file)
index 0000000..1b40c9b
--- /dev/null
@@ -0,0 +1,104 @@
+From dfb92681a19e1d5172420baa242806414b3eff6f Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Wed, 4 Dec 2024 13:30:46 +1030
+Subject: btrfs: tree-checker: reject inline extent items with 0 ref count
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit dfb92681a19e1d5172420baa242806414b3eff6f upstream.
+
+[BUG]
+There is a bug report in the mailing list where btrfs_run_delayed_refs()
+failed to drop the ref count for logical 25870311358464 num_bytes
+2113536.
+
+The involved leaf dump looks like this:
+
+  item 166 key (25870311358464 168 2113536) itemoff 10091 itemsize 50
+    extent refs 1 gen 84178 flags 1
+    ref#0: shared data backref parent 32399126528000 count 0 <<<
+    ref#1: shared data backref parent 31808973717504 count 1
+
+Notice the count number is 0.
+
+[CAUSE]
+There is no concrete evidence yet, but considering 0 -> 1 is also a
+single bit flipped, it's possible that hardware memory bitflip is
+involved, causing the on-disk extent tree to be corrupted.
+
+[FIX]
+To prevent us reading such corrupted extent item, or writing such
+damaged extent item back to disk, enhance the handling of
+BTRFS_EXTENT_DATA_REF_KEY and BTRFS_SHARED_DATA_REF_KEY keys for both
+inlined and key items, to detect such 0 ref count and reject them.
+
+CC: stable@vger.kernel.org # 5.4+
+Link: https://lore.kernel.org/linux-btrfs/7c69dd49-c346-4806-86e7-e6f863a66f48@app.fastmail.com/
+Reported-by: Frankie Fisher <frankie@terrorise.me.uk>
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-checker.c |   27 ++++++++++++++++++++++++++-
+ 1 file changed, 26 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/tree-checker.c
++++ b/fs/btrfs/tree-checker.c
+@@ -1527,6 +1527,11 @@ static int check_extent_item(struct exte
+                                          dref_offset, fs_info->sectorsize);
+                               return -EUCLEAN;
+                       }
++                      if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) {
++                              extent_err(leaf, slot,
++                      "invalid data ref count, should have non-zero value");
++                              return -EUCLEAN;
++                      }
+                       inline_refs += btrfs_extent_data_ref_count(leaf, dref);
+                       break;
+               /* Contains parent bytenr and ref count */
+@@ -1539,6 +1544,11 @@ static int check_extent_item(struct exte
+                                          inline_offset, fs_info->sectorsize);
+                               return -EUCLEAN;
+                       }
++                      if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) {
++                              extent_err(leaf, slot,
++                      "invalid shared data ref count, should have non-zero value");
++                              return -EUCLEAN;
++                      }
+                       inline_refs += btrfs_shared_data_ref_count(leaf, sref);
+                       break;
+               case BTRFS_EXTENT_OWNER_REF_KEY:
+@@ -1611,8 +1621,18 @@ static int check_simple_keyed_refs(struc
+ {
+       u32 expect_item_size = 0;
+-      if (key->type == BTRFS_SHARED_DATA_REF_KEY)
++      if (key->type == BTRFS_SHARED_DATA_REF_KEY) {
++              struct btrfs_shared_data_ref *sref;
++
++              sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref);
++              if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) {
++                      extent_err(leaf, slot,
++              "invalid shared data backref count, should have non-zero value");
++                      return -EUCLEAN;
++              }
++
+               expect_item_size = sizeof(struct btrfs_shared_data_ref);
++      }
+       if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) {
+               generic_err(leaf, slot,
+@@ -1689,6 +1709,11 @@ static int check_extent_data_ref(struct
+                                  offset, leaf->fs_info->sectorsize);
+                       return -EUCLEAN;
+               }
++              if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) {
++                      extent_err(leaf, slot,
++      "invalid extent data backref count, should have non-zero value");
++                      return -EUCLEAN;
++              }
+       }
+       return 0;
+ }
diff --git a/queue-6.12/cxl-pci-fix-potential-bogus-return-value-upon-successful-probing.patch b/queue-6.12/cxl-pci-fix-potential-bogus-return-value-upon-successful-probing.patch
deleted file mode 100644 (file)
index 4e51fb1..0000000
+++ /dev/null
@@ -1,40 +0,0 @@
-From da4d8c83358163df9a4addaeba0ef8bcb03b22e8 Mon Sep 17 00:00:00 2001
-From: Davidlohr Bueso <dave@stgolabs.net>
-Date: Fri, 15 Nov 2024 09:00:32 -0800
-Subject: cxl/pci: Fix potential bogus return value upon successful probing
-
-From: Davidlohr Bueso <dave@stgolabs.net>
-
-commit da4d8c83358163df9a4addaeba0ef8bcb03b22e8 upstream.
-
-If cxl_pci_ras_unmask() returns non-zero, cxl_pci_probe() will end up
-returning that value, instead of zero.
-
-Fixes: 248529edc86f ("cxl: add RAS status unmasking for CXL")
-Reviewed-by: Fan Ni <fan.ni@samsung.com>
-Signed-off-by: Davidlohr Bueso <dave@stgolabs.net>
-Reviewed-by: Ira Weiny <ira.weiny@intel.com>
-Link: https://patch.msgid.link/20241115170032.108445-1-dave@stgolabs.net
-Signed-off-by: Dave Jiang <dave.jiang@intel.com>
-Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
----
- drivers/cxl/pci.c | 3 +--
- 1 file changed, 1 insertion(+), 2 deletions(-)
-
-diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c
-index 0241d1d7133a..26ab06c9deff 100644
---- a/drivers/cxl/pci.c
-+++ b/drivers/cxl/pci.c
-@@ -1032,8 +1032,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
-       if (rc)
-               return rc;
--      rc = cxl_pci_ras_unmask(pdev);
--      if (rc)
-+      if (cxl_pci_ras_unmask(pdev))
-               dev_dbg(&pdev->dev, "No RAS reporting unmasked\n");
-       pci_save_state(pdev);
--- 
-2.47.1
-
diff --git a/queue-6.12/drivers-hv-util-avoid-accessing-a-ringbuffer-not-initialized-yet.patch b/queue-6.12/drivers-hv-util-avoid-accessing-a-ringbuffer-not-initialized-yet.patch
new file mode 100644 (file)
index 0000000..fe11703
--- /dev/null
@@ -0,0 +1,169 @@
+From 07a756a49f4b4290b49ea46e089cbe6f79ff8d26 Mon Sep 17 00:00:00 2001
+From: Michael Kelley <mhklinux@outlook.com>
+Date: Wed, 6 Nov 2024 07:42:47 -0800
+Subject: Drivers: hv: util: Avoid accessing a ringbuffer not initialized yet
+
+From: Michael Kelley <mhklinux@outlook.com>
+
+commit 07a756a49f4b4290b49ea46e089cbe6f79ff8d26 upstream.
+
+If the KVP (or VSS) daemon starts before the VMBus channel's ringbuffer is
+fully initialized, we can hit the panic below:
+
+hv_utils: Registering HyperV Utility Driver
+hv_vmbus: registering driver hv_utils
+...
+BUG: kernel NULL pointer dereference, address: 0000000000000000
+CPU: 44 UID: 0 PID: 2552 Comm: hv_kvp_daemon Tainted: G E 6.11.0-rc3+ #1
+RIP: 0010:hv_pkt_iter_first+0x12/0xd0
+Call Trace:
+...
+ vmbus_recvpacket
+ hv_kvp_onchannelcallback
+ vmbus_on_event
+ tasklet_action_common
+ tasklet_action
+ handle_softirqs
+ irq_exit_rcu
+ sysvec_hyperv_stimer0
+ </IRQ>
+ <TASK>
+ asm_sysvec_hyperv_stimer0
+...
+ kvp_register_done
+ hvt_op_read
+ vfs_read
+ ksys_read
+ __x64_sys_read
+
+This can happen because the KVP/VSS channel callback can be invoked
+even before the channel is fully opened:
+1) as soon as hv_kvp_init() -> hvutil_transport_init() creates
+/dev/vmbus/hv_kvp, the kvp daemon can open the device file immediately and
+register itself to the driver by writing a message KVP_OP_REGISTER1 to the
+file (which is handled by kvp_on_msg() ->kvp_handle_handshake()) and
+reading the file for the driver's response, which is handled by
+hvt_op_read(), which calls hvt->on_read(), i.e. kvp_register_done().
+
+2) the problem with kvp_register_done() is that it can cause the
+channel callback to be called even before the channel is fully opened,
+and when the channel callback is starting to run, util_probe()->
+vmbus_open() may have not initialized the ringbuffer yet, so the
+callback can hit the panic of NULL pointer dereference.
+
+To reproduce the panic consistently, we can add a "ssleep(10)" for KVP in
+__vmbus_open(), just before the first hv_ringbuffer_init(), and then we
+unload and reload the driver hv_utils, and run the daemon manually within
+the 10 seconds.
+
+Fix the panic by reordering the steps in util_probe() so the char dev
+entry used by the KVP or VSS daemon is not created until after
+vmbus_open() has completed. This reordering prevents the race condition
+from happening.
+
+Reported-by: Dexuan Cui <decui@microsoft.com>
+Fixes: e0fa3e5e7df6 ("Drivers: hv: utils: fix a race on userspace daemons registration")
+Cc: stable@vger.kernel.org
+Signed-off-by: Michael Kelley <mhklinux@outlook.com>
+Acked-by: Wei Liu <wei.liu@kernel.org>
+Link: https://lore.kernel.org/r/20241106154247.2271-3-mhklinux@outlook.com
+Signed-off-by: Wei Liu <wei.liu@kernel.org>
+Message-ID: <20241106154247.2271-3-mhklinux@outlook.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/hv/hv_kvp.c       |    6 ++++++
+ drivers/hv/hv_snapshot.c  |    6 ++++++
+ drivers/hv/hv_util.c      |    9 +++++++++
+ drivers/hv/hyperv_vmbus.h |    2 ++
+ include/linux/hyperv.h    |    1 +
+ 5 files changed, 24 insertions(+)
+
+--- a/drivers/hv/hv_kvp.c
++++ b/drivers/hv/hv_kvp.c
+@@ -767,6 +767,12 @@ hv_kvp_init(struct hv_util_service *srv)
+        */
+       kvp_transaction.state = HVUTIL_DEVICE_INIT;
++      return 0;
++}
++
++int
++hv_kvp_init_transport(void)
++{
+       hvt = hvutil_transport_init(kvp_devname, CN_KVP_IDX, CN_KVP_VAL,
+                                   kvp_on_msg, kvp_on_reset);
+       if (!hvt)
+--- a/drivers/hv/hv_snapshot.c
++++ b/drivers/hv/hv_snapshot.c
+@@ -388,6 +388,12 @@ hv_vss_init(struct hv_util_service *srv)
+        */
+       vss_transaction.state = HVUTIL_DEVICE_INIT;
++      return 0;
++}
++
++int
++hv_vss_init_transport(void)
++{
+       hvt = hvutil_transport_init(vss_devname, CN_VSS_IDX, CN_VSS_VAL,
+                                   vss_on_msg, vss_on_reset);
+       if (!hvt) {
+--- a/drivers/hv/hv_util.c
++++ b/drivers/hv/hv_util.c
+@@ -141,6 +141,7 @@ static struct hv_util_service util_heart
+ static struct hv_util_service util_kvp = {
+       .util_cb = hv_kvp_onchannelcallback,
+       .util_init = hv_kvp_init,
++      .util_init_transport = hv_kvp_init_transport,
+       .util_pre_suspend = hv_kvp_pre_suspend,
+       .util_pre_resume = hv_kvp_pre_resume,
+       .util_deinit = hv_kvp_deinit,
+@@ -149,6 +150,7 @@ static struct hv_util_service util_kvp =
+ static struct hv_util_service util_vss = {
+       .util_cb = hv_vss_onchannelcallback,
+       .util_init = hv_vss_init,
++      .util_init_transport = hv_vss_init_transport,
+       .util_pre_suspend = hv_vss_pre_suspend,
+       .util_pre_resume = hv_vss_pre_resume,
+       .util_deinit = hv_vss_deinit,
+@@ -613,6 +615,13 @@ static int util_probe(struct hv_device *
+       if (ret)
+               goto error;
++      if (srv->util_init_transport) {
++              ret = srv->util_init_transport();
++              if (ret) {
++                      vmbus_close(dev->channel);
++                      goto error;
++              }
++      }
+       return 0;
+ error:
+--- a/drivers/hv/hyperv_vmbus.h
++++ b/drivers/hv/hyperv_vmbus.h
+@@ -370,12 +370,14 @@ void vmbus_on_event(unsigned long data);
+ void vmbus_on_msg_dpc(unsigned long data);
+ int hv_kvp_init(struct hv_util_service *srv);
++int hv_kvp_init_transport(void);
+ void hv_kvp_deinit(void);
+ int hv_kvp_pre_suspend(void);
+ int hv_kvp_pre_resume(void);
+ void hv_kvp_onchannelcallback(void *context);
+ int hv_vss_init(struct hv_util_service *srv);
++int hv_vss_init_transport(void);
+ void hv_vss_deinit(void);
+ int hv_vss_pre_suspend(void);
+ int hv_vss_pre_resume(void);
+--- a/include/linux/hyperv.h
++++ b/include/linux/hyperv.h
+@@ -1559,6 +1559,7 @@ struct hv_util_service {
+       void *channel;
+       void (*util_cb)(void *);
+       int (*util_init)(struct hv_util_service *);
++      int (*util_init_transport)(void);
+       void (*util_deinit)(void);
+       int (*util_pre_suspend)(void);
+       int (*util_pre_resume)(void);
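
The essential point of the reordering is easier to see stripped of the driver details. The sketch below is not the real util_probe(): the ring size, function name and error handling are placeholders, and only vmbus_open()/vmbus_close() and the util_init/util_init_transport callbacks come from the code above.

/* Sketch: the transport (and with it the /dev node the daemon opens) is
 * only created once vmbus_open() has initialized the ring buffer. */
static int util_probe_sketch(struct hv_device *dev,
			     struct hv_util_service *srv, u32 ring_size)
{
	int ret;

	ret = srv->util_init(srv);		/* state setup, no char device yet */
	if (ret)
		return ret;

	ret = vmbus_open(dev->channel, ring_size, ring_size, NULL, 0,
			 srv->util_cb, dev->channel);
	if (ret)
		return ret;			/* ring buffer now valid */

	if (srv->util_init_transport) {
		ret = srv->util_init_transport();	/* creates /dev/vmbus/hv_kvp etc. */
		if (ret) {
			vmbus_close(dev->channel);
			return ret;
		}
	}
	return 0;
}
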
diff --git a/queue-6.12/drm-amdgpu-gfx12-fix-ip-version-check.patch b/queue-6.12/drm-amdgpu-gfx12-fix-ip-version-check.patch
new file mode 100644 (file)
index 0000000..d6f4999
--- /dev/null
@@ -0,0 +1,31 @@
+From 41be00f839e9ee7753892a73a36ce4c14c6f5cbf Mon Sep 17 00:00:00 2001
+From: Alex Deucher <alexander.deucher@amd.com>
+Date: Thu, 12 Dec 2024 17:04:58 -0500
+Subject: drm/amdgpu/gfx12: fix IP version check
+
+From: Alex Deucher <alexander.deucher@amd.com>
+
+commit 41be00f839e9ee7753892a73a36ce4c14c6f5cbf upstream.
+
+Use the helper function rather than reading it directly.
+
+Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+(cherry picked from commit f1fd1d0f40272948aa6ab82a3a82ecbbc76dff53)
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c
+@@ -4105,7 +4105,7 @@ static int gfx_v12_0_set_clockgating_sta
+       if (amdgpu_sriov_vf(adev))
+               return 0;
+-      switch (adev->ip_versions[GC_HWIP][0]) {
++      switch (amdgpu_ip_version(adev, GC_HWIP, 0)) {
+       case IP_VERSION(12, 0, 0):
+       case IP_VERSION(12, 0, 1):
+               gfx_v12_0_update_gfx_clock_gating(adev,
diff --git a/queue-6.12/drm-amdgpu-mmhub4.1-fix-ip-version-check.patch b/queue-6.12/drm-amdgpu-mmhub4.1-fix-ip-version-check.patch
new file mode 100644 (file)
index 0000000..4e4348a
--- /dev/null
@@ -0,0 +1,36 @@
+From 6ebc5b92190e01dd48313b68cbf752c9adcfefa8 Mon Sep 17 00:00:00 2001
+From: Alex Deucher <alexander.deucher@amd.com>
+Date: Thu, 12 Dec 2024 17:03:20 -0500
+Subject: drm/amdgpu/mmhub4.1: fix IP version check
+
+From: Alex Deucher <alexander.deucher@amd.com>
+
+commit 6ebc5b92190e01dd48313b68cbf752c9adcfefa8 upstream.
+
+Use the helper function rather than reading it directly.
+
+Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+(cherry picked from commit 63bfd24088b42c6f55c2096bfc41b50213d419b2)
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c
+index 0fbc3be81f14..f2ab5001b492 100644
+--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c
+@@ -108,7 +108,7 @@ mmhub_v4_1_0_print_l2_protection_fault_status(struct amdgpu_device *adev,
+       dev_err(adev->dev,
+               "MMVM_L2_PROTECTION_FAULT_STATUS_LO32:0x%08X\n",
+               status);
+-      switch (adev->ip_versions[MMHUB_HWIP][0]) {
++      switch (amdgpu_ip_version(adev, MMHUB_HWIP, 0)) {
+       case IP_VERSION(4, 1, 0):
+               mmhub_cid = mmhub_client_ids_v4_1_0[cid][rw];
+               break;
+-- 
+2.47.1
+
diff --git a/queue-6.12/drm-amdgpu-nbio7.0-fix-ip-version-check.patch b/queue-6.12/drm-amdgpu-nbio7.0-fix-ip-version-check.patch
new file mode 100644 (file)
index 0000000..ec4fa6e
--- /dev/null
@@ -0,0 +1,36 @@
+From 3abb660f9e18925468685591a3702bda05faba4f Mon Sep 17 00:00:00 2001
+From: Alex Deucher <alexander.deucher@amd.com>
+Date: Thu, 12 Dec 2024 16:49:20 -0500
+Subject: drm/amdgpu/nbio7.0: fix IP version check
+
+From: Alex Deucher <alexander.deucher@amd.com>
+
+commit 3abb660f9e18925468685591a3702bda05faba4f upstream.
+
+Use the helper function rather than reading it directly.
+
+Reviewed-by: Yang Wang <kevinyang.wang@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+(cherry picked from commit 0ec43fbece784215d3c4469973e4556d70bce915)
+Cc: stable@vger.kernel.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
+index 49e953f86ced..d1032e9992b4 100644
+--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c
+@@ -278,7 +278,7 @@ static void nbio_v7_0_init_registers(struct amdgpu_device *adev)
+ {
+       uint32_t data;
+-      switch (adev->ip_versions[NBIO_HWIP][0]) {
++      switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) {
+       case IP_VERSION(2, 5, 0):
+               data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4) & ~BIT(23);
+               WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4, data);
+-- 
+2.47.1
+
diff --git a/queue-6.12/fgraph-still-initialize-idle-shadow-stacks-when-starting.patch b/queue-6.12/fgraph-still-initialize-idle-shadow-stacks-when-starting.patch
new file mode 100644 (file)
index 0000000..99c3845
--- /dev/null
@@ -0,0 +1,62 @@
+From cc252bb592638e0f7aea40d580186c36d89526b8 Mon Sep 17 00:00:00 2001
+From: Steven Rostedt <rostedt@goodmis.org>
+Date: Wed, 11 Dec 2024 13:53:35 -0500
+Subject: fgraph: Still initialize idle shadow stacks when starting
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+commit cc252bb592638e0f7aea40d580186c36d89526b8 upstream.
+
+A bug was discovered where the idle shadow stacks were not initialized
+for offline CPUs when starting function graph tracer, and when they came
+online they were not traced due to the missing shadow stack. To fix
+this, the idle task shadow stack initialization was moved to using the
+CPU hotplug callbacks. But it removed the initialization when the
+function graph was enabled. The problem here is that the hotplug
+callbacks are called when the CPUs come online, but the idle shadow
+stack initialization only happens if function graph is currently
+active. This caused the online CPUs to not get their shadow stack
+initialized.
+
+The idle shadow stack initialization still needs to be done when the
+function graph is registered, as they will not be allocated if function
+graph is not registered.
+
+Cc: stable@vger.kernel.org
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Link: https://lore.kernel.org/20241211135335.094ba282@batman.local.home
+Fixes: 2c02f7375e65 ("fgraph: Use CPU hotplug mechanism to initialize idle shadow stacks")
+Reported-by: Linus Walleij <linus.walleij@linaro.org>
+Tested-by: Linus Walleij <linus.walleij@linaro.org>
+Closes: https://lore.kernel.org/all/CACRpkdaTBrHwRbbrphVy-=SeDz6MSsXhTKypOtLrTQ+DgGAOcQ@mail.gmail.com/
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/fgraph.c |    8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/kernel/trace/fgraph.c
++++ b/kernel/trace/fgraph.c
+@@ -1160,7 +1160,7 @@ void fgraph_update_pid_func(void)
+ static int start_graph_tracing(void)
+ {
+       unsigned long **ret_stack_list;
+-      int ret;
++      int ret, cpu;
+       ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE,
+                                sizeof(*ret_stack_list), GFP_KERNEL);
+@@ -1168,6 +1168,12 @@ static int start_graph_tracing(void)
+       if (!ret_stack_list)
+               return -ENOMEM;
++      /* The cpu_boot init_task->ret_stack will never be freed */
++      for_each_online_cpu(cpu) {
++              if (!idle_task(cpu)->ret_stack)
++                      ftrace_graph_init_idle_task(idle_task(cpu), cpu);
++      }
++
+       do {
+               ret = alloc_retstack_tasklist(ret_stack_list);
+       } while (ret == -EAGAIN);
diff --git a/queue-6.12/io_uring-check-if-iowq-is-killed-before-queuing.patch b/queue-6.12/io_uring-check-if-iowq-is-killed-before-queuing.patch
new file mode 100644 (file)
index 0000000..83e79fb
--- /dev/null
@@ -0,0 +1,46 @@
+From dbd2ca9367eb19bc5e269b8c58b0b1514ada9156 Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Thu, 19 Dec 2024 19:52:58 +0000
+Subject: io_uring: check if iowq is killed before queuing
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit dbd2ca9367eb19bc5e269b8c58b0b1514ada9156 upstream.
+
+task work can be executed after the task has gone through io_uring
+termination, whether it's the final task_work run or the fallback path.
+In this case, task work will find ->io_wq being already killed and
+null'ed, which is a problem if it then tries to forward the request to
+io_queue_iowq(). Make io_queue_iowq() fail requests in this case.
+
+Note that it also checks PF_KTHREAD, because the user can first close
+a DEFER_TASKRUN ring and shortly after kill the task, in which case
+->iowq check would race.
+
+Cc: stable@vger.kernel.org
+Fixes: 50c52250e2d74 ("block: implement async io_uring discard cmd")
+Fixes: 773af69121ecc ("io_uring: always reissue from task_work context")
+Reported-by: Will <willsroot@protonmail.com>
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/63312b4a2c2bb67ad67b857d17a300e1d3b078e8.1734637909.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/io_uring.c |    6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -515,7 +515,11 @@ static void io_queue_iowq(struct io_kioc
+       struct io_uring_task *tctx = req->task->io_uring;
+       BUG_ON(!tctx);
+-      BUG_ON(!tctx->io_wq);
++
++      if ((current->flags & PF_KTHREAD) || !tctx->io_wq) {
++              io_req_task_queue_fail(req, -ECANCELED);
++              return;
++      }
+       /* init ->work of the whole link before punting */
+       io_prep_async_link(req);
diff --git a/queue-6.12/io_uring-fix-registered-ring-file-refcount-leak.patch b/queue-6.12/io_uring-fix-registered-ring-file-refcount-leak.patch
new file mode 100644 (file)
index 0000000..243e6ed
--- /dev/null
@@ -0,0 +1,64 @@
+From 12d908116f7efd34f255a482b9afc729d7a5fb78 Mon Sep 17 00:00:00 2001
+From: Jann Horn <jannh@google.com>
+Date: Wed, 18 Dec 2024 17:56:25 +0100
+Subject: io_uring: Fix registered ring file refcount leak
+
+From: Jann Horn <jannh@google.com>
+
+commit 12d908116f7efd34f255a482b9afc729d7a5fb78 upstream.
+
+Currently, io_uring_unreg_ringfd() (which cleans up registered rings) is
+only called on exit, but __io_uring_free (which frees the tctx in which the
+registered ring pointers are stored) is also called on execve (via
+begin_new_exec -> io_uring_task_cancel -> __io_uring_cancel ->
+io_uring_cancel_generic -> __io_uring_free).
+
+This means: A process going through execve while having registered rings
+will leak references to the rings' `struct file`.
+
+Fix it by zapping registered rings on execve(). This is implemented by
+moving the io_uring_unreg_ringfd() from io_uring_files_cancel() into its
+callee __io_uring_cancel(), which is called from io_uring_task_cancel() on
+execve.
+
+This could probably be exploited *on 32-bit kernels* by leaking 2^32
+references to the same ring, because the file refcount is stored in a
+pointer-sized field and get_file() doesn't have protection against
+refcount overflow, just a WARN_ONCE(); but on 64-bit it should have no
+impact beyond a memory leak.
+
+Cc: stable@vger.kernel.org
+Fixes: e7a6c00dc77a ("io_uring: add support for registering ring file descriptors")
+Signed-off-by: Jann Horn <jannh@google.com>
+Link: https://lore.kernel.org/r/20241218-uring-reg-ring-cleanup-v1-1-8f63e999045b@google.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/io_uring.h |    4 +---
+ io_uring/io_uring.c      |    1 +
+ 2 files changed, 2 insertions(+), 3 deletions(-)
+
+--- a/include/linux/io_uring.h
++++ b/include/linux/io_uring.h
+@@ -15,10 +15,8 @@ bool io_is_uring_fops(struct file *file)
+ static inline void io_uring_files_cancel(void)
+ {
+-      if (current->io_uring) {
+-              io_uring_unreg_ringfd();
++      if (current->io_uring)
+               __io_uring_cancel(false);
+-      }
+ }
+ static inline void io_uring_task_cancel(void)
+ {
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -3230,6 +3230,7 @@ end_wait:
+ void __io_uring_cancel(bool cancel_all)
+ {
++      io_uring_unreg_ringfd();
+       io_uring_cancel_generic(cancel_all, NULL);
+ }
diff --git a/queue-6.12/kvm-x86-play-nice-with-protected-guests-in-complete_hypercall_exit.patch b/queue-6.12/kvm-x86-play-nice-with-protected-guests-in-complete_hypercall_exit.patch
new file mode 100644 (file)
index 0000000..0be2d35
--- /dev/null
@@ -0,0 +1,59 @@
+From 9b42d1e8e4fe9dc631162c04caa69b0d1860b0f0 Mon Sep 17 00:00:00 2001
+From: Sean Christopherson <seanjc@google.com>
+Date: Wed, 27 Nov 2024 16:43:39 -0800
+Subject: KVM: x86: Play nice with protected guests in complete_hypercall_exit()
+
+From: Sean Christopherson <seanjc@google.com>
+
+commit 9b42d1e8e4fe9dc631162c04caa69b0d1860b0f0 upstream.
+
+Use is_64_bit_hypercall() instead of is_64_bit_mode() to detect a 64-bit
+hypercall when completing said hypercall.  For guests with protected state,
+e.g. SEV-ES and SEV-SNP, KVM must assume the hypercall was made in 64-bit
+mode as the vCPU state needed to detect 64-bit mode is unavailable.
+
+Hacking the sev_smoke_test selftest to generate a KVM_HC_MAP_GPA_RANGE
+hypercall via VMGEXIT trips the WARN:
+
+  ------------[ cut here ]------------
+  WARNING: CPU: 273 PID: 326626 at arch/x86/kvm/x86.h:180 complete_hypercall_exit+0x44/0xe0 [kvm]
+  Modules linked in: kvm_amd kvm ... [last unloaded: kvm]
+  CPU: 273 UID: 0 PID: 326626 Comm: sev_smoke_test Not tainted 6.12.0-smp--392e932fa0f3-feat #470
+  Hardware name: Google Astoria/astoria, BIOS 0.20240617.0-0 06/17/2024
+  RIP: 0010:complete_hypercall_exit+0x44/0xe0 [kvm]
+  Call Trace:
+   <TASK>
+   kvm_arch_vcpu_ioctl_run+0x2400/0x2720 [kvm]
+   kvm_vcpu_ioctl+0x54f/0x630 [kvm]
+   __se_sys_ioctl+0x6b/0xc0
+   do_syscall_64+0x83/0x160
+   entry_SYSCALL_64_after_hwframe+0x76/0x7e
+   </TASK>
+  ---[ end trace 0000000000000000 ]---
+
+Fixes: b5aead0064f3 ("KVM: x86: Assume a 64-bit hypercall for guests with protected state")
+Cc: stable@vger.kernel.org
+Cc: Tom Lendacky <thomas.lendacky@amd.com>
+Reviewed-by: Xiaoyao Li <xiaoyao.li@intel.com>
+Reviewed-by: Nikunj A Dadhania <nikunj@amd.com>
+Reviewed-by: Tom Lendacky <thomas.lendacky@amd.com>
+Reviewed-by: Binbin Wu <binbin.wu@linux.intel.com>
+Reviewed-by: Kai Huang <kai.huang@intel.com>
+Link: https://lore.kernel.org/r/20241128004344.4072099-2-seanjc@google.com
+Signed-off-by: Sean Christopherson <seanjc@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kvm/x86.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kvm/x86.c
++++ b/arch/x86/kvm/x86.c
+@@ -9991,7 +9991,7 @@ static int complete_hypercall_exit(struc
+ {
+       u64 ret = vcpu->run->hypercall.ret;
+-      if (!is_64_bit_mode(vcpu))
++      if (!is_64_bit_hypercall(vcpu))
+               ret = (u32)ret;
+       kvm_rax_write(vcpu, ret);
+       ++vcpu->stat.hypercalls;
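
For context, the helper the fix switches to treats any vCPU with protected guest state as 64-bit, because the register state that is_64_bit_mode() needs cannot be read for such guests. Roughly (paraphrased from arch/x86/kvm/x86.h around the commit named in the Fixes: tag; not part of the patch above):

static inline bool is_64_bit_hypercall(struct kvm_vcpu *vcpu)
{
	/* Protected-state guests (e.g. SEV-ES/SNP) are assumed to be in
	 * 64-bit mode, since their segment state is not visible to KVM. */
	return vcpu->arch.guest_state_protected || is_64_bit_mode(vcpu);
}
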
diff --git a/queue-6.12/s390-mm-fix-directmap-accounting.patch b/queue-6.12/s390-mm-fix-directmap-accounting.patch
new file mode 100644 (file)
index 0000000..da8332a
--- /dev/null
@@ -0,0 +1,64 @@
+From 41856638e6c4ed51d8aa9e54f70059d1e357b46e Mon Sep 17 00:00:00 2001
+From: Heiko Carstens <hca@linux.ibm.com>
+Date: Fri, 29 Nov 2024 17:39:27 +0100
+Subject: s390/mm: Fix DirectMap accounting
+
+From: Heiko Carstens <hca@linux.ibm.com>
+
+commit 41856638e6c4ed51d8aa9e54f70059d1e357b46e upstream.
+
+With uncoupling of physical and virtual address spaces population of
+the identity mapping was changed to use the type POPULATE_IDENTITY
+instead of POPULATE_DIRECT. This breaks DirectMap accounting:
+
+> cat /proc/meminfo
+DirectMap4k:       55296 kB
+DirectMap1M:    18446744073709496320 kB
+
+Adjust all locations of update_page_count() in vmem.c to use
+POPULATE_IDENTITY instead of POPULATE_DIRECT as well. With this
+accounting is correct again:
+
+> cat /proc/meminfo
+DirectMap4k:       54264 kB
+DirectMap1M:     8334336 kB
+
+Fixes: c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces")
+Cc: stable@vger.kernel.org
+Reviewed-by: Alexander Gordeev <agordeev@linux.ibm.com>
+Signed-off-by: Heiko Carstens <hca@linux.ibm.com>
+Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/s390/boot/vmem.c |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/arch/s390/boot/vmem.c
++++ b/arch/s390/boot/vmem.c
+@@ -306,7 +306,7 @@ static void pgtable_pte_populate(pmd_t *
+                       pages++;
+               }
+       }
+-      if (mode == POPULATE_DIRECT)
++      if (mode == POPULATE_IDENTITY)
+               update_page_count(PG_DIRECT_MAP_4K, pages);
+ }
+@@ -339,7 +339,7 @@ static void pgtable_pmd_populate(pud_t *
+               }
+               pgtable_pte_populate(pmd, addr, next, mode);
+       }
+-      if (mode == POPULATE_DIRECT)
++      if (mode == POPULATE_IDENTITY)
+               update_page_count(PG_DIRECT_MAP_1M, pages);
+ }
+@@ -372,7 +372,7 @@ static void pgtable_pud_populate(p4d_t *
+               }
+               pgtable_pmd_populate(pud, addr, next, mode);
+       }
+-      if (mode == POPULATE_DIRECT)
++      if (mode == POPULATE_IDENTITY)
+               update_page_count(PG_DIRECT_MAP_2G, pages);
+ }
diff --git a/queue-6.12/selftests-bpf-use-asm-constraint-m-for-loongarch.patch b/queue-6.12/selftests-bpf-use-asm-constraint-m-for-loongarch.patch
new file mode 100644 (file)
index 0000000..a491f29
--- /dev/null
@@ -0,0 +1,40 @@
+From 29d44cce324dab2bd86c447071a596262e7109b6 Mon Sep 17 00:00:00 2001
+From: Tiezhu Yang <yangtiezhu@loongson.cn>
+Date: Thu, 19 Dec 2024 19:15:06 +0800
+Subject: selftests/bpf: Use asm constraint "m" for LoongArch
+
+From: Tiezhu Yang <yangtiezhu@loongson.cn>
+
+commit 29d44cce324dab2bd86c447071a596262e7109b6 upstream.
+
+Currently, LoongArch LLVM does not support the constraint "o" and no plan
+to support it, it only supports the similar constraint "m", so change the
+constraints from "nor" in the "else" case to arch-specific "nmr" to avoid
+the build error such as "unexpected asm memory constraint" for LoongArch.
+
+Fixes: 630301b0d59d ("selftests/bpf: Add basic USDT selftests")
+Suggested-by: Weining Lu <luweining@loongson.cn>
+Suggested-by: Li Chen <chenli@loongson.cn>
+Signed-off-by: Tiezhu Yang <yangtiezhu@loongson.cn>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Reviewed-by: Huacai Chen <chenhuacai@loongson.cn>
+Cc: stable@vger.kernel.org
+Link: https://llvm.org/docs/LangRef.html#supported-constraint-code-list
+Link: https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp#L172
+Link: https://lore.kernel.org/bpf/20241219111506.20643-1-yangtiezhu@loongson.cn
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/bpf/sdt.h |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/tools/testing/selftests/bpf/sdt.h
++++ b/tools/testing/selftests/bpf/sdt.h
+@@ -102,6 +102,8 @@
+ # define STAP_SDT_ARG_CONSTRAINT        nZr
+ # elif defined __arm__
+ # define STAP_SDT_ARG_CONSTRAINT        g
++# elif defined __loongarch__
++# define STAP_SDT_ARG_CONSTRAINT        nmr
+ # else
+ # define STAP_SDT_ARG_CONSTRAINT        nor
+ # endif
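
The constraint string here is a list of alternatives for a single inline-asm operand: 'n' (immediate), 'm' (any memory operand), 'r' (register), with 'o' (offsettable memory) being the alternative LoongArch LLVM rejects. A minimal standalone illustration, not the sdt.h macros themselves (which build the constraint string via stringification):

/* Compiles with GCC and Clang; with LoongArch LLVM, changing "nmr" back
 * to "nor" reproduces the "unexpected asm memory constraint" error. */
static inline void probe_arg(long val)
{
	/* Empty template; only the operand constraint matters here. */
	asm volatile ("" : : "nmr" (val));
}

int main(void)
{
	probe_arg(42);
	return 0;
}
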
diff --git a/queue-6.12/selftests-memfd-run-sysctl-tests-when-pid-namespace-support-is-enabled.patch b/queue-6.12/selftests-memfd-run-sysctl-tests-when-pid-namespace-support-is-enabled.patch
new file mode 100644 (file)
index 0000000..1bc0f11
--- /dev/null
@@ -0,0 +1,71 @@
+From 6a75f19af16ff482cfd6085c77123aa0f464f8dd Mon Sep 17 00:00:00 2001
+From: "Isaac J. Manjarres" <isaacmanjarres@google.com>
+Date: Thu, 5 Dec 2024 11:29:41 -0800
+Subject: selftests/memfd: run sysctl tests when PID namespace support is enabled
+
+From: Isaac J. Manjarres <isaacmanjarres@google.com>
+
+commit 6a75f19af16ff482cfd6085c77123aa0f464f8dd upstream.
+
+The sysctl tests for vm.memfd_noexec rely on the kernel to support PID
+namespaces (i.e.  the kernel is built with CONFIG_PID_NS=y).  If the
+kernel the test runs on does not support PID namespaces, the first sysctl
+test will fail when attempting to spawn a new thread in a new PID
+namespace, abort the test, preventing the remaining tests from being run.
+
+This is not desirable, as not all kernels need PID namespaces, but can
+still use the other features provided by memfd.  Therefore, only run the
+sysctl tests if the kernel supports PID namespaces.  Otherwise, skip those
+tests and emit an informative message to let the user know why the sysctl
+tests are not being run.
+
+Link: https://lkml.kernel.org/r/20241205192943.3228757-1-isaacmanjarres@google.com
+Fixes: 11f75a01448f ("selftests/memfd: add tests for MFD_NOEXEC_SEAL MFD_EXEC")
+Signed-off-by: Isaac J. Manjarres <isaacmanjarres@google.com>
+Reviewed-by: Jeff Xu <jeffxu@google.com>
+Cc: Suren Baghdasaryan <surenb@google.com>
+Cc: Kalesh Singh <kaleshsingh@google.com>
+Cc: <stable@vger.kernel.org>   [6.6+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/memfd/memfd_test.c |   14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/tools/testing/selftests/memfd/memfd_test.c
++++ b/tools/testing/selftests/memfd/memfd_test.c
+@@ -9,6 +9,7 @@
+ #include <fcntl.h>
+ #include <linux/memfd.h>
+ #include <sched.h>
++#include <stdbool.h>
+ #include <stdio.h>
+ #include <stdlib.h>
+ #include <signal.h>
+@@ -1557,6 +1558,11 @@ static void test_share_fork(char *banner
+       close(fd);
+ }
++static bool pid_ns_supported(void)
++{
++      return access("/proc/self/ns/pid", F_OK) == 0;
++}
++
+ int main(int argc, char **argv)
+ {
+       pid_t pid;
+@@ -1591,8 +1597,12 @@ int main(int argc, char **argv)
+       test_seal_grow();
+       test_seal_resize();
+-      test_sysctl_simple();
+-      test_sysctl_nested();
++      if (pid_ns_supported()) {
++              test_sysctl_simple();
++              test_sysctl_nested();
++      } else {
++              printf("PID namespaces are not supported; skipping sysctl tests\n");
++      }
+       test_share_dup("SHARE-DUP", "");
+       test_share_mmap("SHARE-MMAP", "");
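
The probe above generalizes to any namespace type: a kernel exposes /proc/self/ns/<name> only when the corresponding namespace support is built in. A tiny standalone version of the same check (the function name and the printed message are made up):

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Same idea as pid_ns_supported() in the patch above. */
static bool ns_supported(const char *name)	/* e.g. "pid", "net", "user" */
{
	char path[64];

	snprintf(path, sizeof(path), "/proc/self/ns/%s", name);
	return access(path, F_OK) == 0;
}

int main(void)
{
	printf("pid namespaces: %s\n", ns_supported("pid") ? "yes" : "no");
	return 0;
}
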
diff --git a/queue-6.12/series b/queue-6.12/series
index 0c067e42fb61a8c21dc167e5c7ba25e277c6792a..7e7eb47683ab7241b748ec7353c6b46bfd2ab441 100644 (file)
@@ -112,4 +112,26 @@ vmalloc-fix-accounting-with-i915.patch
 mm-page_alloc-don-t-call-pfn_to_page-on-possibly-non-existent-pfn-in-split_large_buddy.patch
 ring-buffer-fix-overflow-in-__rb_map_vma.patch
 alloc_tag-fix-set_codetag_empty-when-config_mem_alloc_profiling_debug.patch
-cxl-pci-fix-potential-bogus-return-value-upon-successful-probing.patch
+btrfs-split-bios-to-the-fs-sector-size-boundary.patch
+btrfs-fix-improper-generation-check-in-snapshot-delete.patch
+btrfs-tree-checker-reject-inline-extent-items-with-0-ref-count.patch
+s390-mm-fix-directmap-accounting.patch
+drm-amdgpu-nbio7.0-fix-ip-version-check.patch
+drm-amdgpu-gfx12-fix-ip-version-check.patch
+drm-amdgpu-mmhub4.1-fix-ip-version-check.patch
+fgraph-still-initialize-idle-shadow-stacks-when-starting.patch
+drivers-hv-util-avoid-accessing-a-ringbuffer-not-initialized-yet.patch
+tools-hv-fix-a-complier-warning-in-the-fcopy-uio-daemon.patch
+x86-hyperv-fix-hv-tsc-page-based-sched_clock-for-hibernation.patch
+kvm-x86-play-nice-with-protected-guests-in-complete_hypercall_exit.patch
+smb-client-fix-tcp-timers-deadlock-after-rmmod.patch
+accel-ivpu-fix-general-protection-fault-in-ivpu_bo_list.patch
+accel-ivpu-fix-warn-in-ivpu_ipc_send_receive_internal.patch
+tracing-fix-test_event_printk-to-process-entire-print-argument.patch
+tracing-add-missing-helper-functions-in-event-pointer-dereference-check.patch
+tracing-add-s-check-in-test_event_printk.patch
+tracing-check-s-dereference-via-the-field-and-not-the-tp_printk-format.patch
+selftests-memfd-run-sysctl-tests-when-pid-namespace-support-is-enabled.patch
+selftests-bpf-use-asm-constraint-m-for-loongarch.patch
+io_uring-fix-registered-ring-file-refcount-leak.patch
+io_uring-check-if-iowq-is-killed-before-queuing.patch
diff --git a/queue-6.12/smb-client-fix-tcp-timers-deadlock-after-rmmod.patch b/queue-6.12/smb-client-fix-tcp-timers-deadlock-after-rmmod.patch
new file mode 100644 (file)
index 0000000..1b305a5
--- /dev/null
@@ -0,0 +1,182 @@
+From e9f2517a3e18a54a3943c098d2226b245d488801 Mon Sep 17 00:00:00 2001
+From: Enzo Matsumiya <ematsumiya@suse.de>
+Date: Tue, 10 Dec 2024 18:15:12 -0300
+Subject: smb: client: fix TCP timers deadlock after rmmod
+
+From: Enzo Matsumiya <ematsumiya@suse.de>
+
+commit e9f2517a3e18a54a3943c098d2226b245d488801 upstream.
+
+Commit ef7134c7fc48 ("smb: client: Fix use-after-free of network namespace.")
+fixed a netns UAF by manually enabled socket refcounting
+(sk->sk_net_refcnt=1 and sock_inuse_add(net, 1)).
+
+The reason the patch worked for that bug was because we now hold
+references to the netns (get_net_track() gets a ref internally)
+and they're properly released (internally, on __sk_destruct()),
+but only because sk->sk_net_refcnt was set.
+
+Problem:
+(this happens regardless of CONFIG_NET_NS_REFCNT_TRACKER and regardless
+if init_net or other)
+
+Setting sk->sk_net_refcnt=1 *manually* and *after* socket creation is not
+only out of cifs scope, but also technically wrong -- it's set conditionally
+based on user (=1) vs kernel (=0) sockets.  And net/ implementations
+seem to base their user vs kernel space operations on it.
+
+e.g. upon TCP socket close, the TCP timers are not cleared because
+sk->sk_net_refcnt=1:
+(cf. commit 151c9c724d05 ("tcp: properly terminate timers for kernel sockets"))
+
+net/ipv4/tcp.c:
+    void tcp_close(struct sock *sk, long timeout)
+    {
+       lock_sock(sk);
+       __tcp_close(sk, timeout);
+       release_sock(sk);
+       if (!sk->sk_net_refcnt)
+               inet_csk_clear_xmit_timers_sync(sk);
+       sock_put(sk);
+    }
+
+Which will throw a lockdep warning and then, as expected, deadlock on
+tcp_write_timer().
+
+A way to reproduce this is by running the reproducer from ef7134c7fc48
+and then 'rmmod cifs'.  A few seconds later, the deadlock/lockdep
+warning shows up.
+
+Fix:
+We shouldn't mess with socket internals ourselves, so do not set
+sk_net_refcnt manually.
+
+Also change __sock_create() to sock_create_kern() for explicitness.
+
+As for non-init_net network namespaces, we deal with it the best way
+we can -- hold an extra netns reference for server->ssocket and drop it
+when it's released.  This ensures that the netns still exists whenever
+we need to create/destroy server->ssocket, but is not directly tied to
+it.
+
+Fixes: ef7134c7fc48 ("smb: client: Fix use-after-free of network namespace.")
+Cc: stable@vger.kernel.org
+Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/smb/client/connect.c |   36 ++++++++++++++++++++++++++----------
+ 1 file changed, 26 insertions(+), 10 deletions(-)
+
+--- a/fs/smb/client/connect.c
++++ b/fs/smb/client/connect.c
+@@ -987,9 +987,13 @@ clean_demultiplex_info(struct TCP_Server
+       msleep(125);
+       if (cifs_rdma_enabled(server))
+               smbd_destroy(server);
++
+       if (server->ssocket) {
+               sock_release(server->ssocket);
+               server->ssocket = NULL;
++
++              /* Release netns reference for the socket. */
++              put_net(cifs_net_ns(server));
+       }
+       if (!list_empty(&server->pending_mid_q)) {
+@@ -1037,6 +1041,7 @@ clean_demultiplex_info(struct TCP_Server
+                */
+       }
++      /* Release netns reference for this server. */
+       put_net(cifs_net_ns(server));
+       kfree(server->leaf_fullpath);
+       kfree(server);
+@@ -1713,6 +1718,8 @@ cifs_get_tcp_session(struct smb3_fs_cont
+       tcp_ses->ops = ctx->ops;
+       tcp_ses->vals = ctx->vals;
++
++      /* Grab netns reference for this server. */
+       cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns));
+       tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId);
+@@ -1844,6 +1851,7 @@ smbd_connected:
+ out_err_crypto_release:
+       cifs_crypto_secmech_release(tcp_ses);
++      /* Release netns reference for this server. */
+       put_net(cifs_net_ns(tcp_ses));
+ out_err:
+@@ -1852,8 +1860,10 @@ out_err:
+                       cifs_put_tcp_session(tcp_ses->primary_server, false);
+               kfree(tcp_ses->hostname);
+               kfree(tcp_ses->leaf_fullpath);
+-              if (tcp_ses->ssocket)
++              if (tcp_ses->ssocket) {
+                       sock_release(tcp_ses->ssocket);
++                      put_net(cifs_net_ns(tcp_ses));
++              }
+               kfree(tcp_ses);
+       }
+       return ERR_PTR(rc);
+@@ -3111,20 +3121,20 @@ generic_ip_connect(struct TCP_Server_Inf
+               socket = server->ssocket;
+       } else {
+               struct net *net = cifs_net_ns(server);
+-              struct sock *sk;
+-              rc = __sock_create(net, sfamily, SOCK_STREAM,
+-                                 IPPROTO_TCP, &server->ssocket, 1);
++              rc = sock_create_kern(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket);
+               if (rc < 0) {
+                       cifs_server_dbg(VFS, "Error %d creating socket\n", rc);
+                       return rc;
+               }
+-              sk = server->ssocket->sk;
+-              __netns_tracker_free(net, &sk->ns_tracker, false);
+-              sk->sk_net_refcnt = 1;
+-              get_net_track(net, &sk->ns_tracker, GFP_KERNEL);
+-              sock_inuse_add(net, 1);
++              /*
++               * Grab netns reference for the socket.
++               *
++               * It'll be released here, on error, or in clean_demultiplex_info() upon server
++               * teardown.
++               */
++              get_net(net);
+               /* BB other socket options to set KEEPALIVE, NODELAY? */
+               cifs_dbg(FYI, "Socket created\n");
+@@ -3138,8 +3148,10 @@ generic_ip_connect(struct TCP_Server_Inf
+       }
+       rc = bind_socket(server);
+-      if (rc < 0)
++      if (rc < 0) {
++              put_net(cifs_net_ns(server));
+               return rc;
++      }
+       /*
+        * Eventually check for other socket options to change from
+@@ -3176,6 +3188,7 @@ generic_ip_connect(struct TCP_Server_Inf
+       if (rc < 0) {
+               cifs_dbg(FYI, "Error %d connecting to server\n", rc);
+               trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc);
++              put_net(cifs_net_ns(server));
+               sock_release(socket);
+               server->ssocket = NULL;
+               return rc;
+@@ -3184,6 +3197,9 @@ generic_ip_connect(struct TCP_Server_Inf
+       if (sport == htons(RFC1001_PORT))
+               rc = ip_rfc1001_connect(server);
++      if (rc < 0)
++              put_net(cifs_net_ns(server));
++
+       return rc;
+ }
diff --git a/queue-6.12/tools-hv-fix-a-complier-warning-in-the-fcopy-uio-daemon.patch b/queue-6.12/tools-hv-fix-a-complier-warning-in-the-fcopy-uio-daemon.patch
new file mode 100644 (file)
index 0000000..386a901
--- /dev/null
@@ -0,0 +1,63 @@
+From cb1b78f1c726c938bd47497c1ab16b01ce967f37 Mon Sep 17 00:00:00 2001
+From: Dexuan Cui <decui@microsoft.com>
+Date: Tue, 10 Sep 2024 00:44:32 +0000
+Subject: tools: hv: Fix a complier warning in the fcopy uio daemon
+
+From: Dexuan Cui <decui@microsoft.com>
+
+commit cb1b78f1c726c938bd47497c1ab16b01ce967f37 upstream.
+
+hv_fcopy_uio_daemon.c:436:53: warning: '%s' directive output may be truncated
+writing up to 14 bytes into a region of size 10 [-Wformat-truncation=]
+  436 |  snprintf(uio_dev_path, sizeof(uio_dev_path), "/dev/%s", uio_name);
+
+Also added 'static' for the array 'desc[]'.
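+
+A minimal userspace sketch of the buffer-sizing fix (illustrative only,
+not part of this patch):
+
+    #include <limits.h>   /* NAME_MAX, PATH_MAX on Linux */
+    #include <stdio.h>
+
+    int main(void)
+    {
+            char uio_name[NAME_MAX] = "uio0";
+            char uio_dev_path[PATH_MAX];
+
+            /* "/dev/" plus any name up to NAME_MAX fits in PATH_MAX,
+             * so gcc's -Wformat-truncation has nothing to warn about. */
+            snprintf(uio_dev_path, sizeof(uio_dev_path), "/dev/%s", uio_name);
+            puts(uio_dev_path);
+            return 0;
+    }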
+
+Fixes: 82b0945ce2c2 ("tools: hv: Add new fcopy application based on uio driver")
+Cc: stable@vger.kernel.org # 6.10+
+Signed-off-by: Dexuan Cui <decui@microsoft.com>
+Reviewed-by: Saurabh Sengar <ssengar@linux.microsoft.com>
+Link: https://lore.kernel.org/r/20240910004433.50254-1-decui@microsoft.com
+Signed-off-by: Wei Liu <wei.liu@kernel.org>
+Message-ID: <20240910004433.50254-1-decui@microsoft.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/hv/hv_fcopy_uio_daemon.c | 8 +++-----
+ 1 file changed, 3 insertions(+), 5 deletions(-)
+
+diff --git a/tools/hv/hv_fcopy_uio_daemon.c b/tools/hv/hv_fcopy_uio_daemon.c
+index 7a00f3066a98..12743d7f164f 100644
+--- a/tools/hv/hv_fcopy_uio_daemon.c
++++ b/tools/hv/hv_fcopy_uio_daemon.c
+@@ -35,8 +35,6 @@
+ #define WIN8_SRV_MINOR                1
+ #define WIN8_SRV_VERSION      (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR)
+-#define MAX_FOLDER_NAME               15
+-#define MAX_PATH_LEN          15
+ #define FCOPY_UIO             "/sys/bus/vmbus/devices/eb765408-105f-49b6-b4aa-c123b64d17d4/uio"
+ #define FCOPY_VER_COUNT               1
+@@ -51,7 +49,7 @@ static const int fw_versions[] = {
+ #define HV_RING_SIZE          0x4000 /* 16KB ring buffer size */
+-unsigned char desc[HV_RING_SIZE];
++static unsigned char desc[HV_RING_SIZE];
+ static int target_fd;
+ static char target_fname[PATH_MAX];
+@@ -409,8 +407,8 @@ int main(int argc, char *argv[])
+       struct vmbus_br txbr, rxbr;
+       void *ring;
+       uint32_t len = HV_RING_SIZE;
+-      char uio_name[MAX_FOLDER_NAME] = {0};
+-      char uio_dev_path[MAX_PATH_LEN] = {0};
++      char uio_name[NAME_MAX] = {0};
++      char uio_dev_path[PATH_MAX] = {0};
+       static struct option long_options[] = {
+               {"help",        no_argument,       0,  'h' },
+-- 
+2.47.1
+
diff --git a/queue-6.12/tracing-add-missing-helper-functions-in-event-pointer-dereference-check.patch b/queue-6.12/tracing-add-missing-helper-functions-in-event-pointer-dereference-check.patch
new file mode 100644 (file)
index 0000000..5ea5104
--- /dev/null
@@ -0,0 +1,78 @@
+From 917110481f6bc1c96b1e54b62bb114137fbc6d17 Mon Sep 17 00:00:00 2001
+From: Steven Rostedt <rostedt@goodmis.org>
+Date: Mon, 16 Dec 2024 21:41:20 -0500
+Subject: tracing: Add missing helper functions in event pointer dereference check
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+commit 917110481f6bc1c96b1e54b62bb114137fbc6d17 upstream.
+
+The process_pointer() helper function looks to see if various trace event
+macros are used. These macros are for storing data in the event. This
+makes it safe to dereference as the dereference will then point into the
+event on the ring buffer where the content of the data stays with the
+event itself.
+
+A few helper functions were missing. Those were:
+
+  __get_rel_dynamic_array()
+  __get_dynamic_array_len()
+  __get_rel_dynamic_array_len()
+  __get_rel_sockaddr()
+
+Also add a helper function find_print_string() so that an intermediate
+variable is not needed to test whether the string exists.
+
+Cc: stable@vger.kernel.org
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Al Viro <viro@ZenIV.linux.org.uk>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Link: https://lore.kernel.org/20241217024720.521836792@goodmis.org
+Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/trace_events.c |   21 +++++++++++++++++++--
+ 1 file changed, 19 insertions(+), 2 deletions(-)
+
+--- a/kernel/trace/trace_events.c
++++ b/kernel/trace/trace_events.c
+@@ -274,6 +274,15 @@ static bool test_field(const char *fmt,
+       return false;
+ }
++/* Look for a string within an argument */
++static bool find_print_string(const char *arg, const char *str, const char *end)
++{
++      const char *r;
++
++      r = strstr(arg, str);
++      return r && r < end;
++}
++
+ /* Return true if the argument pointer is safe */
+ static bool process_pointer(const char *fmt, int len, struct trace_event_call *call)
+ {
+@@ -292,9 +301,17 @@ static bool process_pointer(const char *
+               a = strchr(fmt, '&');
+               if ((a && (a < r)) || test_field(r, call))
+                       return true;
+-      } else if ((r = strstr(fmt, "__get_dynamic_array(")) && r < e) {
++      } else if (find_print_string(fmt, "__get_dynamic_array(", e)) {
++              return true;
++      } else if (find_print_string(fmt, "__get_rel_dynamic_array(", e)) {
++              return true;
++      } else if (find_print_string(fmt, "__get_dynamic_array_len(", e)) {
++              return true;
++      } else if (find_print_string(fmt, "__get_rel_dynamic_array_len(", e)) {
++              return true;
++      } else if (find_print_string(fmt, "__get_sockaddr(", e)) {
+               return true;
+-      } else if ((r = strstr(fmt, "__get_sockaddr(")) && r < e) {
++      } else if (find_print_string(fmt, "__get_rel_sockaddr(", e)) {
+               return true;
+       }
+       return false;
diff --git a/queue-6.12/tracing-add-s-check-in-test_event_printk.patch b/queue-6.12/tracing-add-s-check-in-test_event_printk.patch
new file mode 100644 (file)
index 0000000..5c30d99
--- /dev/null
@@ -0,0 +1,206 @@
+From 65a25d9f7ac02e0cf361356e834d1c71d36acca9 Mon Sep 17 00:00:00 2001
+From: Steven Rostedt <rostedt@goodmis.org>
+Date: Mon, 16 Dec 2024 21:41:21 -0500
+Subject: tracing: Add "%s" check in test_event_printk()
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+commit 65a25d9f7ac02e0cf361356e834d1c71d36acca9 upstream.
+
+The test_event_printk() code makes sure that when a trace event is
+registered, any dereferenced pointers from the event's TP_printk() are
+pointing to content in the ring buffer. But currently it does not handle
+"%s", as there's cases where the string pointer saved in the ring buffer
+points to a static string in the kernel that will never be freed. As that
+is a valid case, the pointer needs to be checked at runtime.
+
+Currently the runtime check is done via trace_check_vprintf(), but to not
+have to replicate everything in vsnprintf() it does some logic with the
+va_list that may not be reliable across architectures. In order to get rid
+of that logic, more work in the test_event_printk() needs to be done. Some
+of the strings can be validated at this time when it is obvious the string
+is valid because the string will be saved in the ring buffer content.
+
+Do all the validation of strings in the ring buffer at boot in
+test_event_printk(), and make sure that the fields of the strings that
+point into the kernel are accessible. This will allow adding checks at
+runtime that will validate the fields themselves and not rely on parsing
+the TP_printk() format at runtime.
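+
+For illustration, the kinds of "%s" arguments this boot-time check
+distinguishes look roughly like the following (hypothetical fields; see
+process_string() in the diff below):
+
+    "%s", __get_str(name)              <- helper function: treated as safe
+    "%s", REC->flag ? "on" : "off"     <- contains literal strings: safe
+    "%s", REC->name                    <- bare field: must map to an event field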
+
+Cc: stable@vger.kernel.org
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Al Viro <viro@ZenIV.linux.org.uk>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Link: https://lore.kernel.org/20241217024720.685917008@goodmis.org
+Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/trace_events.c |  104 +++++++++++++++++++++++++++++++++++++-------
+ 1 file changed, 89 insertions(+), 15 deletions(-)
+
+--- a/kernel/trace/trace_events.c
++++ b/kernel/trace/trace_events.c
+@@ -244,19 +244,16 @@ int trace_event_get_offsets(struct trace
+       return tail->offset + tail->size;
+ }
+-/*
+- * Check if the referenced field is an array and return true,
+- * as arrays are OK to dereference.
+- */
+-static bool test_field(const char *fmt, struct trace_event_call *call)
++
++static struct trace_event_fields *find_event_field(const char *fmt,
++                                                 struct trace_event_call *call)
+ {
+       struct trace_event_fields *field = call->class->fields_array;
+-      const char *array_descriptor;
+       const char *p = fmt;
+       int len;
+       if (!(len = str_has_prefix(fmt, "REC->")))
+-              return false;
++              return NULL;
+       fmt += len;
+       for (p = fmt; *p; p++) {
+               if (!isalnum(*p) && *p != '_')
+@@ -267,11 +264,26 @@ static bool test_field(const char *fmt,
+       for (; field->type; field++) {
+               if (strncmp(field->name, fmt, len) || field->name[len])
+                       continue;
+-              array_descriptor = strchr(field->type, '[');
+-              /* This is an array and is OK to dereference. */
+-              return array_descriptor != NULL;
++
++              return field;
+       }
+-      return false;
++      return NULL;
++}
++
++/*
++ * Check if the referenced field is an array and return true,
++ * as arrays are OK to dereference.
++ */
++static bool test_field(const char *fmt, struct trace_event_call *call)
++{
++      struct trace_event_fields *field;
++
++      field = find_event_field(fmt, call);
++      if (!field)
++              return false;
++
++      /* This is an array and is OK to dereference. */
++      return strchr(field->type, '[') != NULL;
+ }
+ /* Look for a string within an argument */
+@@ -317,6 +329,53 @@ static bool process_pointer(const char *
+       return false;
+ }
++/* Return true if the string is safe */
++static bool process_string(const char *fmt, int len, struct trace_event_call *call)
++{
++      const char *r, *e, *s;
++
++      e = fmt + len;
++
++      /*
++       * There are several helper functions that return strings.
++       * If the argument contains a function, then assume its field is valid.
++       * It is considered that the argument has a function if it has:
++       *   alphanumeric or '_' before a parenthesis.
++       */
++      s = fmt;
++      do {
++              r = strstr(s, "(");
++              if (!r || r >= e)
++                      break;
++              for (int i = 1; r - i >= s; i++) {
++                      char ch = *(r - i);
++                      if (isspace(ch))
++                              continue;
++                      if (isalnum(ch) || ch == '_')
++                              return true;
++                      /* Anything else, this isn't a function */
++                      break;
++              }
++              /* A function could be wrapped in parethesis, try the next one */
++              s = r + 1;
++      } while (s < e);
++
++      /*
++       * If there's any strings in the argument consider this arg OK as it
++       * could be: REC->field ? "foo" : "bar" and we don't want to get into
++       * verifying that logic here.
++       */
++      if (find_print_string(fmt, "\"", e))
++              return true;
++
++      /* Dereferenced strings are also valid like any other pointer */
++      if (process_pointer(fmt, len, call))
++              return true;
++
++      /* Make sure the field is found, and consider it OK for now if it is */
++      return find_event_field(fmt, call) != NULL;
++}
++
+ /*
+  * Examine the print fmt of the event looking for unsafe dereference
+  * pointers using %p* that could be recorded in the trace event and
+@@ -326,6 +385,7 @@ static bool process_pointer(const char *
+ static void test_event_printk(struct trace_event_call *call)
+ {
+       u64 dereference_flags = 0;
++      u64 string_flags = 0;
+       bool first = true;
+       const char *fmt;
+       int parens = 0;
+@@ -416,8 +476,16 @@ static void test_event_printk(struct tra
+                                               star = true;
+                                               continue;
+                                       }
+-                                      if ((fmt[i + j] == 's') && star)
+-                                              arg++;
++                                      if ((fmt[i + j] == 's')) {
++                                              if (star)
++                                                      arg++;
++                                              if (WARN_ONCE(arg == 63,
++                                                            "Too many args for event: %s",
++                                                            trace_event_name(call)))
++                                                      return;
++                                              dereference_flags |= 1ULL << arg;
++                                              string_flags |= 1ULL << arg;
++                                      }
+                                       break;
+                               }
+                               break;
+@@ -464,7 +532,10 @@ static void test_event_printk(struct tra
+                       }
+                       if (dereference_flags & (1ULL << arg)) {
+-                              if (process_pointer(fmt + start_arg, e - start_arg, call))
++                              if (string_flags & (1ULL << arg)) {
++                                      if (process_string(fmt + start_arg, e - start_arg, call))
++                                              dereference_flags &= ~(1ULL << arg);
++                              } else if (process_pointer(fmt + start_arg, e - start_arg, call))
+                                       dereference_flags &= ~(1ULL << arg);
+                       }
+@@ -476,7 +547,10 @@ static void test_event_printk(struct tra
+       }
+       if (dereference_flags & (1ULL << arg)) {
+-              if (process_pointer(fmt + start_arg, i - start_arg, call))
++              if (string_flags & (1ULL << arg)) {
++                      if (process_string(fmt + start_arg, i - start_arg, call))
++                              dereference_flags &= ~(1ULL << arg);
++              } else if (process_pointer(fmt + start_arg, i - start_arg, call))
+                       dereference_flags &= ~(1ULL << arg);
+       }
diff --git a/queue-6.12/tracing-check-s-dereference-via-the-field-and-not-the-tp_printk-format.patch b/queue-6.12/tracing-check-s-dereference-via-the-field-and-not-the-tp_printk-format.patch
new file mode 100644 (file)
index 0000000..699c526
--- /dev/null
@@ -0,0 +1,589 @@
+From afd2627f727b89496d79a6b934a025fc916d4ded Mon Sep 17 00:00:00 2001
+From: Steven Rostedt <rostedt@goodmis.org>
+Date: Mon, 16 Dec 2024 21:41:22 -0500
+Subject: tracing: Check "%s" dereference via the field and not the TP_printk format
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+commit afd2627f727b89496d79a6b934a025fc916d4ded upstream.
+
+The TP_printk() portion of a trace event is executed at the time an event
+is read from the trace. This can happen seconds, minutes, hours, days,
+months, or even years after the event was recorded. If the print
+format contains a dereference to a string via "%s", and that string was
+allocated, there's a chance that string could be freed before it is read
+by the trace file.
+
+To protect against such bugs, there are two functions that verify the
+event. The first one is test_event_printk(), which is called when the
+event is created. It reads the TP_printk() format as well as its arguments
+to make sure nothing may be dereferencing a pointer that was not copied
+into the ring buffer along with the event. If it is, it will trigger a
+WARN_ON().
+
+For strings that use "%s", it is not so easy. The string may not reside in
+the ring buffer but may still be valid. Strings that are static and part
+of the kernel proper which will not be freed for the life of the running
+system, are safe to dereference. But whether a pointer refers to a
+static string or to something on the heap cannot be determined until the
+event is triggered.
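+
+As a hedged illustration (a made-up event, not taken from this patch), the
+safe pattern copies the string into the event, while recording only a raw
+pointer is exactly what these checks are after:
+
+    /* Safe: the string contents live in the ring buffer event. */
+    TRACE_EVENT(sample_open,
+            TP_PROTO(const char *filename),
+            TP_ARGS(filename),
+            TP_STRUCT__entry(
+                    __string(filename, filename)
+            ),
+            TP_fast_assign(
+                    __assign_str(filename);
+            ),
+            TP_printk("file=%s", __get_str(filename))
+    );
+
+    /* Flagged: only the pointer is stored; its target may already be
+     * freed by the time the trace is read. */
+            __field(const char *, filename)    /* in TP_STRUCT__entry() */
+            __entry->filename = filename;      /* in TP_fast_assign()   */
+            TP_printk("file=%s", __entry->filename)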
+
+This brings us to the second function that tests for the bad dereferencing
+of strings, trace_check_vprintf(). It would walk through the printf format
+looking for "%s", and when it finds it, it would validate that the pointer
+is safe to read. If not, it would produce a WARN_ON() as well and write
+into the ring buffer "[UNSAFE-MEMORY]".
+
+The problem with this is how it used va_list to have vsnprintf() handle
+all the cases that it didn't need to check. Instead of re-implementing
+vsnprintf(), it would make a copy of the format up to the %s part, and
+call vsnprintf() with the current va_list ap variable, where the ap would
+then be ready to point at the string in question.
+
+For architectures that passed va_list by reference this was possible. For
+architectures that passed it by copy it was not. A test_can_verify()
+function was used to differentiate between the two, and if it wasn't
+possible, it would disable it.
+
+Even for architectures where this was feasible, it was a stretch to rely
+on such a method that is undocumented, and could cause issues later on
+with new optimizations of the compiler.
+
+Instead, the first function test_event_printk() was updated to look at
+"%s" as well. If the "%s" argument is a pointer outside the event in the
+ring buffer, it would find the field type of the event that is the problem
+and mark the structure with a new flag called "needs_test". The event
+itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that
+this event has a field that needs to be verified before the event can be
+printed using the printf format.
+
+When the event fields are created from the field type structure, the
+fields would copy the field type's "needs_test" value.
+
+Finally, before being printed, a new function ignore_event() is called
+which will check if the event has the TEST_STR flag set (if not, it
+returns false). If the flag is set, it then iterates through the event's
+fields looking for the ones that have the "needs_test" flag set.
+
+Then it uses the offset field from the field structure to find the pointer
+in the ring buffer event. It runs the tests to make sure that pointer is
+safe to print and if not, it triggers the WARN_ON() and also adds to the
+trace output that the event in question has an unsafe memory access.
+
+The ignore_event() makes the trace_check_vprintf() obsolete so it is
+removed.
+
+Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/
+
+Cc: stable@vger.kernel.org
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Al Viro <viro@ZenIV.linux.org.uk>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org
+Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/trace_events.h |    6 -
+ kernel/trace/trace.c         |  255 ++++++++-----------------------------------
+ kernel/trace/trace.h         |    6 -
+ kernel/trace/trace_events.c  |   32 +++--
+ kernel/trace/trace_output.c  |    6 -
+ 5 files changed, 88 insertions(+), 217 deletions(-)
+
+--- a/include/linux/trace_events.h
++++ b/include/linux/trace_events.h
+@@ -285,7 +285,8 @@ struct trace_event_fields {
+                       const char *name;
+                       const int  size;
+                       const int  align;
+-                      const int  is_signed;
++                      const unsigned int is_signed:1;
++                      unsigned int needs_test:1;
+                       const int  filter_type;
+                       const int  len;
+               };
+@@ -337,6 +338,7 @@ enum {
+       TRACE_EVENT_FL_EPROBE_BIT,
+       TRACE_EVENT_FL_FPROBE_BIT,
+       TRACE_EVENT_FL_CUSTOM_BIT,
++      TRACE_EVENT_FL_TEST_STR_BIT,
+ };
+ /*
+@@ -354,6 +356,7 @@ enum {
+  *  CUSTOM        - Event is a custom event (to be attached to an exsiting tracepoint)
+  *                   This is set when the custom event has not been attached
+  *                   to a tracepoint yet, then it is cleared when it is.
++ *  TEST_STR      - The event has a "%s" that points to a string outside the event
+  */
+ enum {
+       TRACE_EVENT_FL_FILTERED         = (1 << TRACE_EVENT_FL_FILTERED_BIT),
+@@ -367,6 +370,7 @@ enum {
+       TRACE_EVENT_FL_EPROBE           = (1 << TRACE_EVENT_FL_EPROBE_BIT),
+       TRACE_EVENT_FL_FPROBE           = (1 << TRACE_EVENT_FL_FPROBE_BIT),
+       TRACE_EVENT_FL_CUSTOM           = (1 << TRACE_EVENT_FL_CUSTOM_BIT),
++      TRACE_EVENT_FL_TEST_STR         = (1 << TRACE_EVENT_FL_TEST_STR_BIT),
+ };
+ #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE)
+--- a/kernel/trace/trace.c
++++ b/kernel/trace/trace.c
+@@ -3635,17 +3635,12 @@ char *trace_iter_expand_format(struct tr
+ }
+ /* Returns true if the string is safe to dereference from an event */
+-static bool trace_safe_str(struct trace_iterator *iter, const char *str,
+-                         bool star, int len)
++static bool trace_safe_str(struct trace_iterator *iter, const char *str)
+ {
+       unsigned long addr = (unsigned long)str;
+       struct trace_event *trace_event;
+       struct trace_event_call *event;
+-      /* Ignore strings with no length */
+-      if (star && !len)
+-              return true;
+-
+       /* OK if part of the event data */
+       if ((addr >= (unsigned long)iter->ent) &&
+           (addr < (unsigned long)iter->ent + iter->ent_size))
+@@ -3685,181 +3680,69 @@ static bool trace_safe_str(struct trace_
+       return false;
+ }
+-static DEFINE_STATIC_KEY_FALSE(trace_no_verify);
+-
+-static int test_can_verify_check(const char *fmt, ...)
+-{
+-      char buf[16];
+-      va_list ap;
+-      int ret;
+-
+-      /*
+-       * The verifier is dependent on vsnprintf() modifies the va_list
+-       * passed to it, where it is sent as a reference. Some architectures
+-       * (like x86_32) passes it by value, which means that vsnprintf()
+-       * does not modify the va_list passed to it, and the verifier
+-       * would then need to be able to understand all the values that
+-       * vsnprintf can use. If it is passed by value, then the verifier
+-       * is disabled.
+-       */
+-      va_start(ap, fmt);
+-      vsnprintf(buf, 16, "%d", ap);
+-      ret = va_arg(ap, int);
+-      va_end(ap);
+-
+-      return ret;
+-}
+-
+-static void test_can_verify(void)
+-{
+-      if (!test_can_verify_check("%d %d", 0, 1)) {
+-              pr_info("trace event string verifier disabled\n");
+-              static_branch_inc(&trace_no_verify);
+-      }
+-}
+-
+ /**
+- * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer
++ * ignore_event - Check dereferenced fields while writing to the seq buffer
+  * @iter: The iterator that holds the seq buffer and the event being printed
+- * @fmt: The format used to print the event
+- * @ap: The va_list holding the data to print from @fmt.
+  *
+- * This writes the data into the @iter->seq buffer using the data from
+- * @fmt and @ap. If the format has a %s, then the source of the string
+- * is examined to make sure it is safe to print, otherwise it will
+- * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string
+- * pointer.
++ * At boot up, test_event_printk() will flag any event that dereferences
++ * a string with "%s" that does exist in the ring buffer. It may still
++ * be valid, as the string may point to a static string in the kernel
++ * rodata that never gets freed. But if the string pointer is pointing
++ * to something that was allocated, there's a chance that it can be freed
++ * by the time the user reads the trace. This would cause a bad memory
++ * access by the kernel and possibly crash the system.
++ *
++ * This function will check if the event has any fields flagged as needing
++ * to be checked at runtime and perform those checks.
++ *
++ * If it is found that a field is unsafe, it will write into the @iter->seq
++ * a message stating what was found to be unsafe.
++ *
++ * @return: true if the event is unsafe and should be ignored,
++ *          false otherwise.
+  */
+-void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
+-                       va_list ap)
++bool ignore_event(struct trace_iterator *iter)
+ {
+-      long text_delta = 0;
+-      long data_delta = 0;
+-      const char *p = fmt;
+-      const char *str;
+-      bool good;
+-      int i, j;
++      struct ftrace_event_field *field;
++      struct trace_event *trace_event;
++      struct trace_event_call *event;
++      struct list_head *head;
++      struct trace_seq *seq;
++      const void *ptr;
+-      if (WARN_ON_ONCE(!fmt))
+-              return;
++      trace_event = ftrace_find_event(iter->ent->type);
+-      if (static_branch_unlikely(&trace_no_verify))
+-              goto print;
++      seq = &iter->seq;
+-      /*
+-       * When the kernel is booted with the tp_printk command line
+-       * parameter, trace events go directly through to printk().
+-       * It also is checked by this function, but it does not
+-       * have an associated trace_array (tr) for it.
+-       */
+-      if (iter->tr) {
+-              text_delta = iter->tr->text_delta;
+-              data_delta = iter->tr->data_delta;
++      if (!trace_event) {
++              trace_seq_printf(seq, "EVENT ID %d NOT FOUND?\n", iter->ent->type);
++              return true;
+       }
+-      /* Don't bother checking when doing a ftrace_dump() */
+-      if (iter->fmt == static_fmt_buf)
+-              goto print;
+-
+-      while (*p) {
+-              bool star = false;
+-              int len = 0;
+-
+-              j = 0;
+-
+-              /*
+-               * We only care about %s and variants
+-               * as well as %p[sS] if delta is non-zero
+-               */
+-              for (i = 0; p[i]; i++) {
+-                      if (i + 1 >= iter->fmt_size) {
+-                              /*
+-                               * If we can't expand the copy buffer,
+-                               * just print it.
+-                               */
+-                              if (!trace_iter_expand_format(iter))
+-                                      goto print;
+-                      }
+-
+-                      if (p[i] == '\\' && p[i+1]) {
+-                              i++;
+-                              continue;
+-                      }
+-                      if (p[i] == '%') {
+-                              /* Need to test cases like %08.*s */
+-                              for (j = 1; p[i+j]; j++) {
+-                                      if (isdigit(p[i+j]) ||
+-                                          p[i+j] == '.')
+-                                              continue;
+-                                      if (p[i+j] == '*') {
+-                                              star = true;
+-                                              continue;
+-                                      }
+-                                      break;
+-                              }
+-                              if (p[i+j] == 's')
+-                                      break;
++      event = container_of(trace_event, struct trace_event_call, event);
++      if (!(event->flags & TRACE_EVENT_FL_TEST_STR))
++              return false;
+-                              if (text_delta && p[i+1] == 'p' &&
+-                                  ((p[i+2] == 's' || p[i+2] == 'S')))
+-                                      break;
++      head = trace_get_fields(event);
++      if (!head) {
++              trace_seq_printf(seq, "FIELDS FOR EVENT '%s' NOT FOUND?\n",
++                               trace_event_name(event));
++              return true;
++      }
+-                              star = false;
+-                      }
+-                      j = 0;
+-              }
+-              /* If no %s found then just print normally */
+-              if (!p[i])
+-                      break;
++      /* Offsets are from the iter->ent that points to the raw event */
++      ptr = iter->ent;
+-              /* Copy up to the %s, and print that */
+-              strncpy(iter->fmt, p, i);
+-              iter->fmt[i] = '\0';
+-              trace_seq_vprintf(&iter->seq, iter->fmt, ap);
+-
+-              /* Add delta to %pS pointers */
+-              if (p[i+1] == 'p') {
+-                      unsigned long addr;
+-                      char fmt[4];
+-
+-                      fmt[0] = '%';
+-                      fmt[1] = 'p';
+-                      fmt[2] = p[i+2]; /* Either %ps or %pS */
+-                      fmt[3] = '\0';
+-
+-                      addr = va_arg(ap, unsigned long);
+-                      addr += text_delta;
+-                      trace_seq_printf(&iter->seq, fmt, (void *)addr);
++      list_for_each_entry(field, head, link) {
++              const char *str;
++              bool good;
+-                      p += i + 3;
++              if (!field->needs_test)
+                       continue;
+-              }
+-              /*
+-               * If iter->seq is full, the above call no longer guarantees
+-               * that ap is in sync with fmt processing, and further calls
+-               * to va_arg() can return wrong positional arguments.
+-               *
+-               * Ensure that ap is no longer used in this case.
+-               */
+-              if (iter->seq.full) {
+-                      p = "";
+-                      break;
+-              }
+-
+-              if (star)
+-                      len = va_arg(ap, int);
+-
+-              /* The ap now points to the string data of the %s */
+-              str = va_arg(ap, const char *);
+-
+-              good = trace_safe_str(iter, str, star, len);
++              str = *(const char **)(ptr + field->offset);
+-              /* Could be from the last boot */
+-              if (data_delta && !good) {
+-                      str += data_delta;
+-                      good = trace_safe_str(iter, str, star, len);
+-              }
++              good = trace_safe_str(iter, str);
+               /*
+                * If you hit this warning, it is likely that the
+@@ -3870,44 +3753,14 @@ void trace_check_vprintf(struct trace_it
+                * instead. See samples/trace_events/trace-events-sample.h
+                * for reference.
+                */
+-              if (WARN_ONCE(!good, "fmt: '%s' current_buffer: '%s'",
+-                            fmt, seq_buf_str(&iter->seq.seq))) {
+-                      int ret;
+-
+-                      /* Try to safely read the string */
+-                      if (star) {
+-                              if (len + 1 > iter->fmt_size)
+-                                      len = iter->fmt_size - 1;
+-                              if (len < 0)
+-                                      len = 0;
+-                              ret = copy_from_kernel_nofault(iter->fmt, str, len);
+-                              iter->fmt[len] = 0;
+-                              star = false;
+-                      } else {
+-                              ret = strncpy_from_kernel_nofault(iter->fmt, str,
+-                                                                iter->fmt_size);
+-                      }
+-                      if (ret < 0)
+-                              trace_seq_printf(&iter->seq, "(0x%px)", str);
+-                      else
+-                              trace_seq_printf(&iter->seq, "(0x%px:%s)",
+-                                               str, iter->fmt);
+-                      str = "[UNSAFE-MEMORY]";
+-                      strcpy(iter->fmt, "%s");
+-              } else {
+-                      strncpy(iter->fmt, p + i, j + 1);
+-                      iter->fmt[j+1] = '\0';
++              if (WARN_ONCE(!good, "event '%s' has unsafe pointer field '%s'",
++                            trace_event_name(event), field->name)) {
++                      trace_seq_printf(seq, "EVENT %s: HAS UNSAFE POINTER FIELD '%s'\n",
++                                       trace_event_name(event), field->name);
++                      return true;
+               }
+-              if (star)
+-                      trace_seq_printf(&iter->seq, iter->fmt, len, str);
+-              else
+-                      trace_seq_printf(&iter->seq, iter->fmt, str);
+-
+-              p += i + j + 1;
+       }
+- print:
+-      if (*p)
+-              trace_seq_vprintf(&iter->seq, p, ap);
++      return false;
+ }
+ const char *trace_event_format(struct trace_iterator *iter, const char *fmt)
+@@ -10803,8 +10656,6 @@ __init static int tracer_alloc_buffers(v
+       register_snapshot_cmd();
+-      test_can_verify();
+-
+       return 0;
+ out_free_pipe_cpumask:
+--- a/kernel/trace/trace.h
++++ b/kernel/trace/trace.h
+@@ -664,9 +664,8 @@ void trace_buffer_unlock_commit_nostack(
+ bool trace_is_tracepoint_string(const char *str);
+ const char *trace_event_format(struct trace_iterator *iter, const char *fmt);
+-void trace_check_vprintf(struct trace_iterator *iter, const char *fmt,
+-                       va_list ap) __printf(2, 0);
+ char *trace_iter_expand_format(struct trace_iterator *iter);
++bool ignore_event(struct trace_iterator *iter);
+ int trace_empty(struct trace_iterator *iter);
+@@ -1402,7 +1401,8 @@ struct ftrace_event_field {
+       int                     filter_type;
+       int                     offset;
+       int                     size;
+-      int                     is_signed;
++      unsigned int            is_signed:1;
++      unsigned int            needs_test:1;
+       int                     len;
+ };
+--- a/kernel/trace/trace_events.c
++++ b/kernel/trace/trace_events.c
+@@ -82,7 +82,7 @@ static int system_refcount_dec(struct ev
+       }
+ static struct ftrace_event_field *
+-__find_event_field(struct list_head *head, char *name)
++__find_event_field(struct list_head *head, const char *name)
+ {
+       struct ftrace_event_field *field;
+@@ -114,7 +114,8 @@ trace_find_event_field(struct trace_even
+ static int __trace_define_field(struct list_head *head, const char *type,
+                               const char *name, int offset, int size,
+-                              int is_signed, int filter_type, int len)
++                              int is_signed, int filter_type, int len,
++                              int need_test)
+ {
+       struct ftrace_event_field *field;
+@@ -133,6 +134,7 @@ static int __trace_define_field(struct l
+       field->offset = offset;
+       field->size = size;
+       field->is_signed = is_signed;
++      field->needs_test = need_test;
+       field->len = len;
+       list_add(&field->link, head);
+@@ -151,13 +153,13 @@ int trace_define_field(struct trace_even
+       head = trace_get_fields(call);
+       return __trace_define_field(head, type, name, offset, size,
+-                                  is_signed, filter_type, 0);
++                                  is_signed, filter_type, 0, 0);
+ }
+ EXPORT_SYMBOL_GPL(trace_define_field);
+ static int trace_define_field_ext(struct trace_event_call *call, const char *type,
+                      const char *name, int offset, int size, int is_signed,
+-                     int filter_type, int len)
++                     int filter_type, int len, int need_test)
+ {
+       struct list_head *head;
+@@ -166,13 +168,13 @@ static int trace_define_field_ext(struct
+       head = trace_get_fields(call);
+       return __trace_define_field(head, type, name, offset, size,
+-                                  is_signed, filter_type, len);
++                                  is_signed, filter_type, len, need_test);
+ }
+ #define __generic_field(type, item, filter_type)                      \
+       ret = __trace_define_field(&ftrace_generic_fields, #type,       \
+                                  #item, 0, 0, is_signed_type(type),   \
+-                                 filter_type, 0);                     \
++                                 filter_type, 0, 0);                  \
+       if (ret)                                                        \
+               return ret;
+@@ -181,7 +183,8 @@ static int trace_define_field_ext(struct
+                                  "common_" #item,                     \
+                                  offsetof(typeof(ent), item),         \
+                                  sizeof(ent.item),                    \
+-                                 is_signed_type(type), FILTER_OTHER, 0);      \
++                                 is_signed_type(type), FILTER_OTHER,  \
++                                 0, 0);                               \
+       if (ret)                                                        \
+               return ret;
+@@ -332,6 +335,7 @@ static bool process_pointer(const char *
+ /* Return true if the string is safe */
+ static bool process_string(const char *fmt, int len, struct trace_event_call *call)
+ {
++      struct trace_event_fields *field;
+       const char *r, *e, *s;
+       e = fmt + len;
+@@ -372,8 +376,16 @@ static bool process_string(const char *f
+       if (process_pointer(fmt, len, call))
+               return true;
+-      /* Make sure the field is found, and consider it OK for now if it is */
+-      return find_event_field(fmt, call) != NULL;
++      /* Make sure the field is found */
++      field = find_event_field(fmt, call);
++      if (!field)
++              return false;
++
++      /* Test this field's string before printing the event */
++      call->flags |= TRACE_EVENT_FL_TEST_STR;
++      field->needs_test = 1;
++
++      return true;
+ }
+ /*
+@@ -2586,7 +2598,7 @@ event_define_fields(struct trace_event_c
+                       ret = trace_define_field_ext(call, field->type, field->name,
+                                                offset, field->size,
+                                                field->is_signed, field->filter_type,
+-                                               field->len);
++                                               field->len, field->needs_test);
+                       if (WARN_ON_ONCE(ret)) {
+                               pr_err("error code is %d\n", ret);
+                               break;
+--- a/kernel/trace/trace_output.c
++++ b/kernel/trace/trace_output.c
+@@ -317,10 +317,14 @@ EXPORT_SYMBOL(trace_raw_output_prep);
+ void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...)
+ {
++      struct trace_seq *s = &iter->seq;
+       va_list ap;
++      if (ignore_event(iter))
++              return;
++
+       va_start(ap, fmt);
+-      trace_check_vprintf(iter, trace_event_format(iter, fmt), ap);
++      trace_seq_vprintf(s, trace_event_format(iter, fmt), ap);
+       va_end(ap);
+ }
+ EXPORT_SYMBOL(trace_event_printf);
diff --git a/queue-6.12/tracing-fix-test_event_printk-to-process-entire-print-argument.patch b/queue-6.12/tracing-fix-test_event_printk-to-process-entire-print-argument.patch
new file mode 100644 (file)
index 0000000..ebd2328
--- /dev/null
@@ -0,0 +1,184 @@
+From a6629626c584200daf495cc9a740048b455addcd Mon Sep 17 00:00:00 2001
+From: Steven Rostedt <rostedt@goodmis.org>
+Date: Mon, 16 Dec 2024 21:41:19 -0500
+Subject: tracing: Fix test_event_printk() to process entire print argument
+
+From: Steven Rostedt <rostedt@goodmis.org>
+
+commit a6629626c584200daf495cc9a740048b455addcd upstream.
+
+The test_event_printk() analyzes print formats of trace events looking for
+cases where it may dereference a pointer that is not in the ring buffer,
+which can be a bug when the trace event is read from the ring
+buffer and the content of that pointer no longer exists.
+
+The function needs to accurately go from one print format argument to the
+next. It handles quotes and parenthesis that may be included in an
+argument. When it finds the start of the next argument, it uses a simple
+"c = strstr(fmt + i, ',')" to find the end of that argument!
+
+In order to include "%s" dereferencing, it needs to process the entire
+content of the print format argument and not just the content up to the first
+',' it finds. As there may be content like:
+
+ ({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char
+   *access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux"
+   }; union kvm_mmu_page_role role; role.word = REC->role;
+   trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe
+   %sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level,
+   role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "",
+   access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? ""
+   : "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ?
+   "unsync" : "sync", 0); saved_ptr; })
+
+Which is an example of a full argument of an existing event. As the code
+already handles finding the next print format argument, process the
+argument at the end of it and not the start of it. This way it has both
+the start of the argument as well as the end of it.
+
+Add a helper function "process_pointer()" that will do the processing during
+the loop as well as at the end. It also makes the code cleaner and easier
+to read.
+
+Cc: stable@vger.kernel.org
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Mark Rutland <mark.rutland@arm.com>
+Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: Al Viro <viro@ZenIV.linux.org.uk>
+Cc: Linus Torvalds <torvalds@linux-foundation.org>
+Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org
+Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers")
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/trace_events.c |   82 ++++++++++++++++++++++++++++----------------
+ 1 file changed, 53 insertions(+), 29 deletions(-)
+
+--- a/kernel/trace/trace_events.c
++++ b/kernel/trace/trace_events.c
+@@ -265,8 +265,7 @@ static bool test_field(const char *fmt,
+       len = p - fmt;
+       for (; field->type; field++) {
+-              if (strncmp(field->name, fmt, len) ||
+-                  field->name[len])
++              if (strncmp(field->name, fmt, len) || field->name[len])
+                       continue;
+               array_descriptor = strchr(field->type, '[');
+               /* This is an array and is OK to dereference. */
+@@ -275,6 +274,32 @@ static bool test_field(const char *fmt,
+       return false;
+ }
++/* Return true if the argument pointer is safe */
++static bool process_pointer(const char *fmt, int len, struct trace_event_call *call)
++{
++      const char *r, *e, *a;
++
++      e = fmt + len;
++
++      /* Find the REC-> in the argument */
++      r = strstr(fmt, "REC->");
++      if (r && r < e) {
++              /*
++               * Addresses of events on the buffer, or an array on the buffer is
++               * OK to dereference. There's ways to fool this, but
++               * this is to catch common mistakes, not malicious code.
++               */
++              a = strchr(fmt, '&');
++              if ((a && (a < r)) || test_field(r, call))
++                      return true;
++      } else if ((r = strstr(fmt, "__get_dynamic_array(")) && r < e) {
++              return true;
++      } else if ((r = strstr(fmt, "__get_sockaddr(")) && r < e) {
++              return true;
++      }
++      return false;
++}
++
+ /*
+  * Examine the print fmt of the event looking for unsafe dereference
+  * pointers using %p* that could be recorded in the trace event and
+@@ -285,12 +310,12 @@ static void test_event_printk(struct tra
+ {
+       u64 dereference_flags = 0;
+       bool first = true;
+-      const char *fmt, *c, *r, *a;
++      const char *fmt;
+       int parens = 0;
+       char in_quote = 0;
+       int start_arg = 0;
+       int arg = 0;
+-      int i;
++      int i, e;
+       fmt = call->print_fmt;
+@@ -403,42 +428,41 @@ static void test_event_printk(struct tra
+               case ',':
+                       if (in_quote || parens)
+                               continue;
++                      e = i;
+                       i++;
+                       while (isspace(fmt[i]))
+                               i++;
+-                      start_arg = i;
+-                      if (!(dereference_flags & (1ULL << arg)))
+-                              goto next_arg;
+-                      /* Find the REC-> in the argument */
+-                      c = strchr(fmt + i, ',');
+-                      r = strstr(fmt + i, "REC->");
+-                      if (r && (!c || r < c)) {
+-                              /*
+-                               * Addresses of events on the buffer,
+-                               * or an array on the buffer is
+-                               * OK to dereference.
+-                               * There's ways to fool this, but
+-                               * this is to catch common mistakes,
+-                               * not malicious code.
+-                               */
+-                              a = strchr(fmt + i, '&');
+-                              if ((a && (a < r)) || test_field(r, call))
++                      /*
++                       * If start_arg is zero, then this is the start of the
++                       * first argument. The processing of the argument happens
++                       * when the end of the argument is found, as it needs to
++                       * handle paranthesis and such.
++                       */
++                      if (!start_arg) {
++                              start_arg = i;
++                              /* Balance out the i++ in the for loop */
++                              i--;
++                              continue;
++                      }
++
++                      if (dereference_flags & (1ULL << arg)) {
++                              if (process_pointer(fmt + start_arg, e - start_arg, call))
+                                       dereference_flags &= ~(1ULL << arg);
+-                      } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) &&
+-                                 (!c || r < c)) {
+-                              dereference_flags &= ~(1ULL << arg);
+-                      } else if ((r = strstr(fmt + i, "__get_sockaddr(")) &&
+-                                 (!c || r < c)) {
+-                              dereference_flags &= ~(1ULL << arg);
+                       }
+-              next_arg:
+-                      i--;
++                      start_arg = i;
+                       arg++;
++                      /* Balance out the i++ in the for loop */
++                      i--;
+               }
+       }
++      if (dereference_flags & (1ULL << arg)) {
++              if (process_pointer(fmt + start_arg, i - start_arg, call))
++                      dereference_flags &= ~(1ULL << arg);
++      }
++
+       /*
+        * If you triggered the below warning, the trace event reported
+        * uses an unsafe dereference pointer %p*. As the data stored
diff --git a/queue-6.12/x86-hyperv-fix-hv-tsc-page-based-sched_clock-for-hibernation.patch b/queue-6.12/x86-hyperv-fix-hv-tsc-page-based-sched_clock-for-hibernation.patch
new file mode 100644 (file)
index 0000000..501a4f0
--- /dev/null
@@ -0,0 +1,168 @@
+From bcc80dec91ee745b3d66f3e48f0ec2efdea97149 Mon Sep 17 00:00:00 2001
+From: Naman Jain <namjain@linux.microsoft.com>
+Date: Tue, 17 Sep 2024 11:09:17 +0530
+Subject: x86/hyperv: Fix hv tsc page based sched_clock for hibernation
+
+From: Naman Jain <namjain@linux.microsoft.com>
+
+commit bcc80dec91ee745b3d66f3e48f0ec2efdea97149 upstream.
+
+read_hv_sched_clock_tsc() assumes that the Hyper-V clock counter is
+bigger than the variable hv_sched_clock_offset, which is cached during
+early boot, but depending on the timing this assumption may be false
+when a hibernated VM starts again (the clock counter starts from 0
+again) and resumes (Note: hv_init_tsc_clocksource() is not
+called during hibernation/resume); consequently,
+read_hv_sched_clock_tsc() may return a negative integer (which is
+interpreted as a huge positive integer since the return type is u64)
+and new kernel messages are prefixed with huge timestamps before
+read_hv_sched_clock_tsc() grows big enough (which typically takes
+several seconds).
+
+Fix the issue by saving the Hyper-V clock counter just before the
+suspend, and using it to correct the hv_sched_clock_offset in
+resume. This makes hv tsc page based sched_clock continuous and ensures
+that post resume, it starts from where it left off during suspend.
+Override x86_platform.save_sched_clock_state and
+x86_platform.restore_sched_clock_state routines to correct this as soon
+as possible.
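+
+With illustrative numbers: if the reference counter read 10,000,000,000
+ticks at suspend and reads 50,000 right after resume, then
+hv_adj_sched_clock_offset(10,000,000,000 - 50,000) lowers
+hv_sched_clock_offset by 9,999,950,000, so the (counter - offset) value
+behind read_hv_sched_clock_tsc() continues from where it stood at suspend
+instead of jumping backwards and wrapping to a huge u64.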
+
+Note: if Invariant TSC is available, the issue doesn't happen because
+1) we don't register read_hv_sched_clock_tsc() for sched clock:
+See commit e5313f1c5404 ("clocksource/drivers/hyper-v: Rework
+clocksource and sched clock setup");
+2) the common x86 code adjusts TSC similarly: see
+__restore_processor_state() ->  tsc_verify_tsc_adjust(true) and
+x86_platform.restore_sched_clock_state().
+
+Cc: stable@vger.kernel.org
+Fixes: 1349401ff1aa ("clocksource/drivers/hyper-v: Suspend/resume Hyper-V clocksource for hibernation")
+Co-developed-by: Dexuan Cui <decui@microsoft.com>
+Signed-off-by: Dexuan Cui <decui@microsoft.com>
+Signed-off-by: Naman Jain <namjain@linux.microsoft.com>
+Reviewed-by: Michael Kelley <mhklinux@outlook.com>
+Link: https://lore.kernel.org/r/20240917053917.76787-1-namjain@linux.microsoft.com
+Signed-off-by: Wei Liu <wei.liu@kernel.org>
+Message-ID: <20240917053917.76787-1-namjain@linux.microsoft.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/cpu/mshyperv.c     |   58 +++++++++++++++++++++++++++++++++++++
+ drivers/clocksource/hyperv_timer.c |   14 ++++++++
+ include/clocksource/hyperv_timer.h |    2 +
+ 3 files changed, 73 insertions(+), 1 deletion(-)
+
+--- a/arch/x86/kernel/cpu/mshyperv.c
++++ b/arch/x86/kernel/cpu/mshyperv.c
+@@ -223,6 +223,63 @@ static void hv_machine_crash_shutdown(st
+       hyperv_cleanup();
+ }
+ #endif /* CONFIG_CRASH_DUMP */
++
++static u64 hv_ref_counter_at_suspend;
++static void (*old_save_sched_clock_state)(void);
++static void (*old_restore_sched_clock_state)(void);
++
++/*
++ * Hyper-V clock counter resets during hibernation. Save and restore clock
++ * offset during suspend/resume, while also considering the time passed
++ * before suspend. This is to make sure that sched_clock using hv tsc page
++ * based clocksource, proceeds from where it left off during suspend and
++ * it shows correct time for the timestamps of kernel messages after resume.
++ */
++static void save_hv_clock_tsc_state(void)
++{
++      hv_ref_counter_at_suspend = hv_read_reference_counter();
++}
++
++static void restore_hv_clock_tsc_state(void)
++{
++      /*
++       * Adjust the offsets used by hv tsc clocksource to
++       * account for the time spent before hibernation.
++       * adjusted value = reference counter (time) at suspend
++       *                - reference counter (time) now.
++       */
++      hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter());
++}
++
++/*
++ * Functions to override save_sched_clock_state and restore_sched_clock_state
++ * functions of x86_platform. The Hyper-V clock counter is reset during
++ * suspend-resume and the offset used to measure time needs to be
++ * corrected, post resume.
++ */
++static void hv_save_sched_clock_state(void)
++{
++      old_save_sched_clock_state();
++      save_hv_clock_tsc_state();
++}
++
++static void hv_restore_sched_clock_state(void)
++{
++      restore_hv_clock_tsc_state();
++      old_restore_sched_clock_state();
++}
++
++static void __init x86_setup_ops_for_tsc_pg_clock(void)
++{
++      if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE))
++              return;
++
++      old_save_sched_clock_state = x86_platform.save_sched_clock_state;
++      x86_platform.save_sched_clock_state = hv_save_sched_clock_state;
++
++      old_restore_sched_clock_state = x86_platform.restore_sched_clock_state;
++      x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state;
++}
+ #endif /* CONFIG_HYPERV */
+ static uint32_t  __init ms_hyperv_platform(void)
+@@ -579,6 +636,7 @@ static void __init ms_hyperv_init_platfo
+       /* Register Hyper-V specific clocksource */
+       hv_init_clocksource();
++      x86_setup_ops_for_tsc_pg_clock();
+       hv_vtl_init_platform();
+ #endif
+       /*
+--- a/drivers/clocksource/hyperv_timer.c
++++ b/drivers/clocksource/hyperv_timer.c
+@@ -27,7 +27,8 @@
+ #include <asm/mshyperv.h>
+ static struct clock_event_device __percpu *hv_clock_event;
+-static u64 hv_sched_clock_offset __ro_after_init;
++/* Note: offset can hold negative values after hibernation. */
++static u64 hv_sched_clock_offset __read_mostly;
+ /*
+  * If false, we're using the old mechanism for stimer0 interrupts
+@@ -470,6 +471,17 @@ static void resume_hv_clock_tsc(struct c
+       hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64);
+ }
++/*
++ * Called during resume from hibernation, from overridden
++ * x86_platform.restore_sched_clock_state routine. This is to adjust offsets
++ * used to calculate time for hv tsc page based sched_clock, to account for
++ * time spent before hibernation.
++ */
++void hv_adj_sched_clock_offset(u64 offset)
++{
++      hv_sched_clock_offset -= offset;
++}
++
+ #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK
+ static int hv_cs_enable(struct clocksource *cs)
+ {
+--- a/include/clocksource/hyperv_timer.h
++++ b/include/clocksource/hyperv_timer.h
+@@ -38,6 +38,8 @@ extern void hv_remap_tsc_clocksource(voi
+ extern unsigned long hv_get_tsc_pfn(void);
+ extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void);
++extern void hv_adj_sched_clock_offset(u64 offset);
++
+ static __always_inline bool
+ hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg,
+                    u64 *cur_tsc, u64 *time)