From: Greg Kroah-Hartman Date: Mon, 23 Dec 2024 11:50:45 +0000 (+0100) Subject: 6.12-stable patches X-Git-Tag: v6.1.122~21 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=573da444365091a9a84e270722fa5f02d6359b06;p=thirdparty%2Fkernel%2Fstable-queue.git 6.12-stable patches added patches: accel-ivpu-fix-general-protection-fault-in-ivpu_bo_list.patch accel-ivpu-fix-warn-in-ivpu_ipc_send_receive_internal.patch btrfs-fix-improper-generation-check-in-snapshot-delete.patch btrfs-split-bios-to-the-fs-sector-size-boundary.patch btrfs-tree-checker-reject-inline-extent-items-with-0-ref-count.patch drivers-hv-util-avoid-accessing-a-ringbuffer-not-initialized-yet.patch drm-amdgpu-gfx12-fix-ip-version-check.patch drm-amdgpu-mmhub4.1-fix-ip-version-check.patch drm-amdgpu-nbio7.0-fix-ip-version-check.patch fgraph-still-initialize-idle-shadow-stacks-when-starting.patch io_uring-check-if-iowq-is-killed-before-queuing.patch io_uring-fix-registered-ring-file-refcount-leak.patch kvm-x86-play-nice-with-protected-guests-in-complete_hypercall_exit.patch s390-mm-fix-directmap-accounting.patch selftests-bpf-use-asm-constraint-m-for-loongarch.patch selftests-memfd-run-sysctl-tests-when-pid-namespace-support-is-enabled.patch smb-client-fix-tcp-timers-deadlock-after-rmmod.patch tools-hv-fix-a-complier-warning-in-the-fcopy-uio-daemon.patch tracing-add-missing-helper-functions-in-event-pointer-dereference-check.patch tracing-add-s-check-in-test_event_printk.patch tracing-check-s-dereference-via-the-field-and-not-the-tp_printk-format.patch tracing-fix-test_event_printk-to-process-entire-print-argument.patch x86-hyperv-fix-hv-tsc-page-based-sched_clock-for-hibernation.patch --- diff --git a/queue-6.12/accel-ivpu-fix-general-protection-fault-in-ivpu_bo_list.patch b/queue-6.12/accel-ivpu-fix-general-protection-fault-in-ivpu_bo_list.patch new file mode 100644 index 00000000000..4fa6ea7bf03 --- /dev/null +++ b/queue-6.12/accel-ivpu-fix-general-protection-fault-in-ivpu_bo_list.patch @@ -0,0 
+1,33 @@ +From 4b2efb9db0c22a130bbd1275e489b42c02d08050 Mon Sep 17 00:00:00 2001 +From: Jacek Lawrynowicz +Date: Tue, 10 Dec 2024 14:09:37 +0100 +Subject: accel/ivpu: Fix general protection fault in ivpu_bo_list() + +From: Jacek Lawrynowicz + +commit 4b2efb9db0c22a130bbd1275e489b42c02d08050 upstream. + +Check if ctx is not NULL before accessing its fields. + +Fixes: 37dee2a2f433 ("accel/ivpu: Improve buffer object debug logs") +Cc: stable@vger.kernel.org # v6.8 +Reviewed-by: Karol Wachowski +Reviewed-by: Jeffrey Hugo +Signed-off-by: Jacek Lawrynowicz +Link: https://patchwork.freedesktop.org/patch/msgid/20241210130939.1575610-2-jacek.lawrynowicz@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/accel/ivpu/ivpu_gem.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/accel/ivpu/ivpu_gem.c ++++ b/drivers/accel/ivpu/ivpu_gem.c +@@ -406,7 +406,7 @@ static void ivpu_bo_print_info(struct iv + mutex_lock(&bo->lock); + + drm_printf(p, "%-9p %-3u 0x%-12llx %-10lu 0x%-8x %-4u", +- bo, bo->ctx->id, bo->vpu_addr, bo->base.base.size, ++ bo, bo->ctx ? bo->ctx->id : 0, bo->vpu_addr, bo->base.base.size, + bo->flags, kref_read(&bo->base.base.refcount)); + + if (bo->base.pages) diff --git a/queue-6.12/accel-ivpu-fix-warn-in-ivpu_ipc_send_receive_internal.patch b/queue-6.12/accel-ivpu-fix-warn-in-ivpu_ipc_send_receive_internal.patch new file mode 100644 index 00000000000..96e4ea01c63 --- /dev/null +++ b/queue-6.12/accel-ivpu-fix-warn-in-ivpu_ipc_send_receive_internal.patch @@ -0,0 +1,43 @@ +From 0f6482caa6acdfdfc744db7430771fe7e6c4e787 Mon Sep 17 00:00:00 2001 +From: Jacek Lawrynowicz +Date: Tue, 10 Dec 2024 14:09:39 +0100 +Subject: accel/ivpu: Fix WARN in ivpu_ipc_send_receive_internal() + +From: Jacek Lawrynowicz + +commit 0f6482caa6acdfdfc744db7430771fe7e6c4e787 upstream. 
+ +Move pm_runtime_set_active() to ivpu_pm_init() so when +ivpu_ipc_send_receive_internal() is executed before ivpu_pm_enable() +it already has correct runtime state, even if last resume was +not successful. + +Fixes: 8ed520ff4682 ("accel/ivpu: Move set autosuspend delay to HW specific code") +Cc: stable@vger.kernel.org # v6.7+ +Reviewed-by: Karol Wachowski +Reviewed-by: Jeffrey Hugo +Signed-off-by: Jacek Lawrynowicz +Link: https://patchwork.freedesktop.org/patch/msgid/20241210130939.1575610-4-jacek.lawrynowicz@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/accel/ivpu/ivpu_pm.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/accel/ivpu/ivpu_pm.c ++++ b/drivers/accel/ivpu/ivpu_pm.c +@@ -364,6 +364,7 @@ void ivpu_pm_init(struct ivpu_device *vd + + pm_runtime_use_autosuspend(dev); + pm_runtime_set_autosuspend_delay(dev, delay); ++ pm_runtime_set_active(dev); + + ivpu_dbg(vdev, PM, "Autosuspend delay = %d\n", delay); + } +@@ -378,7 +379,6 @@ void ivpu_pm_enable(struct ivpu_device * + { + struct device *dev = vdev->drm.dev; + +- pm_runtime_set_active(dev); + pm_runtime_allow(dev); + pm_runtime_mark_last_busy(dev); + pm_runtime_put_autosuspend(dev); diff --git a/queue-6.12/btrfs-fix-improper-generation-check-in-snapshot-delete.patch b/queue-6.12/btrfs-fix-improper-generation-check-in-snapshot-delete.patch new file mode 100644 index 00000000000..1ba7fa32ddf --- /dev/null +++ b/queue-6.12/btrfs-fix-improper-generation-check-in-snapshot-delete.patch @@ -0,0 +1,127 @@ +From d75d72a858f0c00ca8ae161b48cdb403807be4de Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Wed, 13 Nov 2024 11:11:55 -0500 +Subject: btrfs: fix improper generation check in snapshot delete + +From: Josef Bacik + +commit d75d72a858f0c00ca8ae161b48cdb403807be4de upstream. + +We have been using the following check + + if (generation <= root->root_key.offset) + +to make decisions about whether or not to visit a node during snapshot +delete. 
This is because for normal subvolumes this is set to 0, and for +snapshots it's set to the creation generation. The idea being that if +the generation of the node is less than or equal to our creation +generation then we don't need to visit that node, because it doesn't +belong to us, we can simply drop our reference and move on. + +However reloc roots don't have their generation stored in +root->root_key.offset, instead that is the objectid of their +corresponding fs root. This means we can incorrectly not walk into +nodes that need to be dropped when deleting a reloc root. + +There are a variety of consequences to making the wrong choice in two +distinct areas. + +visit_node_for_delete() + +1. False positive. We think we are newer than the block when we really + aren't. We don't visit the node and drop our reference to the node + and carry on. This would result in leaked space. +2. False negative. We do decide to walk down into a block that we + should have just dropped our reference to. However this means that + the child node will have refs > 1, so we will switch to + UPDATE_BACKREF, and then the subsequent walk_down_proc() will notice + that btrfs_header_owner(node) != root->root_key.objectid and it'll + break out of the loop, and then walk_up_proc() will drop our reference, + so this appears to be ok. + +do_walk_down() + +1. False positive. We are in UPDATE_BACKREF and incorrectly decide that + we are done and don't need to update the backref for our lower nodes. + This is another case that simply won't happen with relocation, as we + only have to do UPDATE_BACKREF if the node below us was shared and + didn't have FULL_BACKREF set, and since we don't own that node + because we're a reloc root we actually won't end up in this case. +2. False negative. 
Again this is tricky because as described above, we + simply wouldn't be here from relocation, because we don't own any of + the nodes because we never set btrfs_header_owner() to the reloc root + objectid, and we always use FULL_BACKREF, we never actually need to + set FULL_BACKREF on any children. + +Having spent a lot of time stressing relocation/snapshot delete recently +I've not seen this pop in practice. But this is objectively incorrect, +so fix this to get the correct starting generation based on the root +we're dropping to keep me from thinking there's a problem here. + +CC: stable@vger.kernel.org +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/ctree.h | 19 +++++++++++++++++++ + fs/btrfs/extent-tree.c | 6 +++--- + 2 files changed, 22 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -371,6 +371,25 @@ static inline void btrfs_set_root_last_t + } + + /* ++ * Return the generation this root started with. ++ * ++ * Every normal root is created with root->root_key.offset set to its ++ * originating generation. If it is a snapshot it is the generation when the ++ * snapshot was created. ++ * ++ * However for TREE_RELOC roots root_key.offset is the objectid of the owning ++ * tree root. Thankfully we copy the root item of the owning tree root, which ++ * has its last_snapshot set to what we would have root_key.offset set to, so ++ * return that if this is a TREE_RELOC root. ++ */ ++static inline u64 btrfs_root_origin_generation(const struct btrfs_root *root) ++{ ++ if (btrfs_root_id(root) == BTRFS_TREE_RELOC_OBJECTID) ++ return btrfs_root_last_snapshot(&root->root_item); ++ return root->root_key.offset; ++} ++ ++/* + * Structure that conveys information about an extent that is going to replace + * all the extents in a file range.
+ */ +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -5308,7 +5308,7 @@ static bool visit_node_for_delete(struct + * reference to it. + */ + generation = btrfs_node_ptr_generation(eb, slot); +- if (!wc->update_ref || generation <= root->root_key.offset) ++ if (!wc->update_ref || generation <= btrfs_root_origin_generation(root)) + return false; + + /* +@@ -5363,7 +5363,7 @@ static noinline void reada_walk_down(str + goto reada; + + if (wc->stage == UPDATE_BACKREF && +- generation <= root->root_key.offset) ++ generation <= btrfs_root_origin_generation(root)) + continue; + + /* We don't lock the tree block, it's OK to be racy here */ +@@ -5706,7 +5706,7 @@ static noinline int do_walk_down(struct + * for the subtree + */ + if (wc->stage == UPDATE_BACKREF && +- generation <= root->root_key.offset) { ++ generation <= btrfs_root_origin_generation(root)) { + wc->lookup_info = 1; + return 1; + } diff --git a/queue-6.12/btrfs-split-bios-to-the-fs-sector-size-boundary.patch b/queue-6.12/btrfs-split-bios-to-the-fs-sector-size-boundary.patch new file mode 100644 index 00000000000..04da4616c3a --- /dev/null +++ b/queue-6.12/btrfs-split-bios-to-the-fs-sector-size-boundary.patch @@ -0,0 +1,47 @@ +From be691b5e593f2cc8cef67bbc59c1fb91b74a86a9 Mon Sep 17 00:00:00 2001 +From: Christoph Hellwig +Date: Mon, 4 Nov 2024 07:26:33 +0100 +Subject: btrfs: split bios to the fs sector size boundary + +From: Christoph Hellwig + +commit be691b5e593f2cc8cef67bbc59c1fb91b74a86a9 upstream. + +Btrfs like other file systems can't really deal with I/O not aligned to +its internal block size (which strangely is called sector size in +btrfs, for historical reasons), but the block layer split helper doesn't +even know about that. + +Round down the split boundary so that all I/Os are aligned.
+ +Fixes: d5e4377d5051 ("btrfs: split zone append bios in btrfs_submit_bio") +CC: stable@vger.kernel.org # 6.12 +Reviewed-by: Johannes Thumshirn +Signed-off-by: Christoph Hellwig +Reviewed-by: Damien Le Moal +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/bio.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/bio.c ++++ b/fs/btrfs/bio.c +@@ -649,8 +649,14 @@ static u64 btrfs_append_map_length(struc + map_length = min(map_length, bbio->fs_info->max_zone_append_size); + sector_offset = bio_split_rw_at(&bbio->bio, &bbio->fs_info->limits, + &nr_segs, map_length); +- if (sector_offset) +- return sector_offset << SECTOR_SHIFT; ++ if (sector_offset) { ++ /* ++ * bio_split_rw_at() could split at a size smaller than our ++ * sectorsize and thus cause unaligned I/Os. Fix that by ++ * always rounding down to the nearest boundary. ++ */ ++ return ALIGN_DOWN(sector_offset << SECTOR_SHIFT, bbio->fs_info->sectorsize); ++ } + return map_length; + } + diff --git a/queue-6.12/btrfs-tree-checker-reject-inline-extent-items-with-0-ref-count.patch b/queue-6.12/btrfs-tree-checker-reject-inline-extent-items-with-0-ref-count.patch new file mode 100644 index 00000000000..1b40c9b9ff6 --- /dev/null +++ b/queue-6.12/btrfs-tree-checker-reject-inline-extent-items-with-0-ref-count.patch @@ -0,0 +1,104 @@ +From dfb92681a19e1d5172420baa242806414b3eff6f Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Wed, 4 Dec 2024 13:30:46 +1030 +Subject: btrfs: tree-checker: reject inline extent items with 0 ref count + +From: Qu Wenruo + +commit dfb92681a19e1d5172420baa242806414b3eff6f upstream. + +[BUG] +There is a bug report in the mailing list where btrfs_run_delayed_refs() +failed to drop the ref count for logical 25870311358464 num_bytes +2113536. 
+ +The involved leaf dump looks like this: + + item 166 key (25870311358464 168 2113536) itemoff 10091 itemsize 50 + extent refs 1 gen 84178 flags 1 + ref#0: shared data backref parent 32399126528000 count 0 <<< + ref#1: shared data backref parent 31808973717504 count 1 + +Notice the count number is 0. + +[CAUSE] +There is no concrete evidence yet, but considering 0 -> 1 is also a +single bit flipped, it's possible that hardware memory bitflip is +involved, causing the on-disk extent tree to be corrupted. + +[FIX] +To prevent us reading such corrupted extent item, or writing such +damaged extent item back to disk, enhance the handling of +BTRFS_EXTENT_DATA_REF_KEY and BTRFS_SHARED_DATA_REF_KEY keys for both +inlined and key items, to detect such 0 ref count and reject them. + +CC: stable@vger.kernel.org # 5.4+ +Link: https://lore.kernel.org/linux-btrfs/7c69dd49-c346-4806-86e7-e6f863a66f48@app.fastmail.com/ +Reported-by: Frankie Fisher +Reviewed-by: Filipe Manana +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-checker.c | 27 ++++++++++++++++++++++++++- + 1 file changed, 26 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/tree-checker.c ++++ b/fs/btrfs/tree-checker.c +@@ -1527,6 +1527,11 @@ static int check_extent_item(struct exte + dref_offset, fs_info->sectorsize); + return -EUCLEAN; + } ++ if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { ++ extent_err(leaf, slot, ++ "invalid data ref count, should have non-zero value"); ++ return -EUCLEAN; ++ } + inline_refs += btrfs_extent_data_ref_count(leaf, dref); + break; + /* Contains parent bytenr and ref count */ +@@ -1539,6 +1544,11 @@ static int check_extent_item(struct exte + inline_offset, fs_info->sectorsize); + return -EUCLEAN; + } ++ if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { ++ extent_err(leaf, slot, ++ "invalid shared data ref count, should have non-zero value"); ++ return -EUCLEAN; ++ } + 
inline_refs += btrfs_shared_data_ref_count(leaf, sref); + break; + case BTRFS_EXTENT_OWNER_REF_KEY: +@@ -1611,8 +1621,18 @@ static int check_simple_keyed_refs(struc + { + u32 expect_item_size = 0; + +- if (key->type == BTRFS_SHARED_DATA_REF_KEY) ++ if (key->type == BTRFS_SHARED_DATA_REF_KEY) { ++ struct btrfs_shared_data_ref *sref; ++ ++ sref = btrfs_item_ptr(leaf, slot, struct btrfs_shared_data_ref); ++ if (unlikely(btrfs_shared_data_ref_count(leaf, sref) == 0)) { ++ extent_err(leaf, slot, ++ "invalid shared data backref count, should have non-zero value"); ++ return -EUCLEAN; ++ } ++ + expect_item_size = sizeof(struct btrfs_shared_data_ref); ++ } + + if (unlikely(btrfs_item_size(leaf, slot) != expect_item_size)) { + generic_err(leaf, slot, +@@ -1689,6 +1709,11 @@ static int check_extent_data_ref(struct + offset, leaf->fs_info->sectorsize); + return -EUCLEAN; + } ++ if (unlikely(btrfs_extent_data_ref_count(leaf, dref) == 0)) { ++ extent_err(leaf, slot, ++ "invalid extent data backref count, should have non-zero value"); ++ return -EUCLEAN; ++ } + } + return 0; + } diff --git a/queue-6.12/cxl-pci-fix-potential-bogus-return-value-upon-successful-probing.patch b/queue-6.12/cxl-pci-fix-potential-bogus-return-value-upon-successful-probing.patch deleted file mode 100644 index 4e51fb1c01b..00000000000 --- a/queue-6.12/cxl-pci-fix-potential-bogus-return-value-upon-successful-probing.patch +++ /dev/null @@ -1,40 +0,0 @@ -From da4d8c83358163df9a4addaeba0ef8bcb03b22e8 Mon Sep 17 00:00:00 2001 -From: Davidlohr Bueso -Date: Fri, 15 Nov 2024 09:00:32 -0800 -Subject: cxl/pci: Fix potential bogus return value upon successful probing - -From: Davidlohr Bueso - -commit da4d8c83358163df9a4addaeba0ef8bcb03b22e8 upstream. - -If cxl_pci_ras_unmask() returns non-zero, cxl_pci_probe() will end up -returning that value, instead of zero. 
- -Fixes: 248529edc86f ("cxl: add RAS status unmasking for CXL") -Reviewed-by: Fan Ni -Signed-off-by: Davidlohr Bueso -Reviewed-by: Ira Weiny -Link: https://patch.msgid.link/20241115170032.108445-1-dave@stgolabs.net -Signed-off-by: Dave Jiang -Signed-off-by: Greg Kroah-Hartman ---- - drivers/cxl/pci.c | 3 +-- - 1 file changed, 1 insertion(+), 2 deletions(-) - -diff --git a/drivers/cxl/pci.c b/drivers/cxl/pci.c -index 0241d1d7133a..26ab06c9deff 100644 ---- a/drivers/cxl/pci.c -+++ b/drivers/cxl/pci.c -@@ -1032,8 +1032,7 @@ static int cxl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) - if (rc) - return rc; - -- rc = cxl_pci_ras_unmask(pdev); -- if (rc) -+ if (cxl_pci_ras_unmask(pdev)) - dev_dbg(&pdev->dev, "No RAS reporting unmasked\n"); - - pci_save_state(pdev); --- -2.47.1 - diff --git a/queue-6.12/drivers-hv-util-avoid-accessing-a-ringbuffer-not-initialized-yet.patch b/queue-6.12/drivers-hv-util-avoid-accessing-a-ringbuffer-not-initialized-yet.patch new file mode 100644 index 00000000000..fe11703c4cc --- /dev/null +++ b/queue-6.12/drivers-hv-util-avoid-accessing-a-ringbuffer-not-initialized-yet.patch @@ -0,0 +1,169 @@ +From 07a756a49f4b4290b49ea46e089cbe6f79ff8d26 Mon Sep 17 00:00:00 2001 +From: Michael Kelley +Date: Wed, 6 Nov 2024 07:42:47 -0800 +Subject: Drivers: hv: util: Avoid accessing a ringbuffer not initialized yet + +From: Michael Kelley + +commit 07a756a49f4b4290b49ea46e089cbe6f79ff8d26 upstream. + +If the KVP (or VSS) daemon starts before the VMBus channel's ringbuffer is +fully initialized, we can hit the panic below: + +hv_utils: Registering HyperV Utility Driver +hv_vmbus: registering driver hv_utils +... +BUG: kernel NULL pointer dereference, address: 0000000000000000 +CPU: 44 UID: 0 PID: 2552 Comm: hv_kvp_daemon Tainted: G E 6.11.0-rc3+ #1 +RIP: 0010:hv_pkt_iter_first+0x12/0xd0 +Call Trace: +... 
+ vmbus_recvpacket + hv_kvp_onchannelcallback + vmbus_on_event + tasklet_action_common + tasklet_action + handle_softirqs + irq_exit_rcu + sysvec_hyperv_stimer0 + + + asm_sysvec_hyperv_stimer0 +... + kvp_register_done + hvt_op_read + vfs_read + ksys_read + __x64_sys_read + +This can happen because the KVP/VSS channel callback can be invoked +even before the channel is fully opened: +1) as soon as hv_kvp_init() -> hvutil_transport_init() creates +/dev/vmbus/hv_kvp, the kvp daemon can open the device file immediately and +register itself to the driver by writing a message KVP_OP_REGISTER1 to the +file (which is handled by kvp_on_msg() ->kvp_handle_handshake()) and +reading the file for the driver's response, which is handled by +hvt_op_read(), which calls hvt->on_read(), i.e. kvp_register_done(). + +2) the problem with kvp_register_done() is that it can cause the +channel callback to be called even before the channel is fully opened, +and when the channel callback is starting to run, util_probe()-> +vmbus_open() may have not initialized the ringbuffer yet, so the +callback can hit the panic of NULL pointer dereference. + +To reproduce the panic consistently, we can add a "ssleep(10)" for KVP in +__vmbus_open(), just before the first hv_ringbuffer_init(), and then we +unload and reload the driver hv_utils, and run the daemon manually within +the 10 seconds. + +Fix the panic by reordering the steps in util_probe() so the char dev +entry used by the KVP or VSS daemon is not created until after +vmbus_open() has completed. This reordering prevents the race condition +from happening. 
+ +Reported-by: Dexuan Cui +Fixes: e0fa3e5e7df6 ("Drivers: hv: utils: fix a race on userspace daemons registration") +Cc: stable@vger.kernel.org +Signed-off-by: Michael Kelley +Acked-by: Wei Liu +Link: https://lore.kernel.org/r/20241106154247.2271-3-mhklinux@outlook.com +Signed-off-by: Wei Liu +Message-ID: <20241106154247.2271-3-mhklinux@outlook.com> +Signed-off-by: Greg Kroah-Hartman +--- + drivers/hv/hv_kvp.c | 6 ++++++ + drivers/hv/hv_snapshot.c | 6 ++++++ + drivers/hv/hv_util.c | 9 +++++++++ + drivers/hv/hyperv_vmbus.h | 2 ++ + include/linux/hyperv.h | 1 + + 5 files changed, 24 insertions(+) + +--- a/drivers/hv/hv_kvp.c ++++ b/drivers/hv/hv_kvp.c +@@ -767,6 +767,12 @@ hv_kvp_init(struct hv_util_service *srv) + */ + kvp_transaction.state = HVUTIL_DEVICE_INIT; + ++ return 0; ++} ++ ++int ++hv_kvp_init_transport(void) ++{ + hvt = hvutil_transport_init(kvp_devname, CN_KVP_IDX, CN_KVP_VAL, + kvp_on_msg, kvp_on_reset); + if (!hvt) +--- a/drivers/hv/hv_snapshot.c ++++ b/drivers/hv/hv_snapshot.c +@@ -388,6 +388,12 @@ hv_vss_init(struct hv_util_service *srv) + */ + vss_transaction.state = HVUTIL_DEVICE_INIT; + ++ return 0; ++} ++ ++int ++hv_vss_init_transport(void) ++{ + hvt = hvutil_transport_init(vss_devname, CN_VSS_IDX, CN_VSS_VAL, + vss_on_msg, vss_on_reset); + if (!hvt) { +--- a/drivers/hv/hv_util.c ++++ b/drivers/hv/hv_util.c +@@ -141,6 +141,7 @@ static struct hv_util_service util_heart + static struct hv_util_service util_kvp = { + .util_cb = hv_kvp_onchannelcallback, + .util_init = hv_kvp_init, ++ .util_init_transport = hv_kvp_init_transport, + .util_pre_suspend = hv_kvp_pre_suspend, + .util_pre_resume = hv_kvp_pre_resume, + .util_deinit = hv_kvp_deinit, +@@ -149,6 +150,7 @@ static struct hv_util_service util_kvp = + static struct hv_util_service util_vss = { + .util_cb = hv_vss_onchannelcallback, + .util_init = hv_vss_init, ++ .util_init_transport = hv_vss_init_transport, + .util_pre_suspend = hv_vss_pre_suspend, + .util_pre_resume = hv_vss_pre_resume, + 
.util_deinit = hv_vss_deinit, +@@ -613,6 +615,13 @@ static int util_probe(struct hv_device * + if (ret) + goto error; + ++ if (srv->util_init_transport) { ++ ret = srv->util_init_transport(); ++ if (ret) { ++ vmbus_close(dev->channel); ++ goto error; ++ } ++ } + return 0; + + error: +--- a/drivers/hv/hyperv_vmbus.h ++++ b/drivers/hv/hyperv_vmbus.h +@@ -370,12 +370,14 @@ void vmbus_on_event(unsigned long data); + void vmbus_on_msg_dpc(unsigned long data); + + int hv_kvp_init(struct hv_util_service *srv); ++int hv_kvp_init_transport(void); + void hv_kvp_deinit(void); + int hv_kvp_pre_suspend(void); + int hv_kvp_pre_resume(void); + void hv_kvp_onchannelcallback(void *context); + + int hv_vss_init(struct hv_util_service *srv); ++int hv_vss_init_transport(void); + void hv_vss_deinit(void); + int hv_vss_pre_suspend(void); + int hv_vss_pre_resume(void); +--- a/include/linux/hyperv.h ++++ b/include/linux/hyperv.h +@@ -1559,6 +1559,7 @@ struct hv_util_service { + void *channel; + void (*util_cb)(void *); + int (*util_init)(struct hv_util_service *); ++ int (*util_init_transport)(void); + void (*util_deinit)(void); + int (*util_pre_suspend)(void); + int (*util_pre_resume)(void); diff --git a/queue-6.12/drm-amdgpu-gfx12-fix-ip-version-check.patch b/queue-6.12/drm-amdgpu-gfx12-fix-ip-version-check.patch new file mode 100644 index 00000000000..d6f49994c3c --- /dev/null +++ b/queue-6.12/drm-amdgpu-gfx12-fix-ip-version-check.patch @@ -0,0 +1,31 @@ +From 41be00f839e9ee7753892a73a36ce4c14c6f5cbf Mon Sep 17 00:00:00 2001 +From: Alex Deucher +Date: Thu, 12 Dec 2024 17:04:58 -0500 +Subject: drm/amdgpu/gfx12: fix IP version check + +From: Alex Deucher + +commit 41be00f839e9ee7753892a73a36ce4c14c6f5cbf upstream. + +Use the helper function rather than reading it directly. 
+ +Reviewed-by: Yang Wang +Signed-off-by: Alex Deucher +(cherry picked from commit f1fd1d0f40272948aa6ab82a3a82ecbbc76dff53) +Cc: stable@vger.kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +@@ -4105,7 +4105,7 @@ static int gfx_v12_0_set_clockgating_sta + if (amdgpu_sriov_vf(adev)) + return 0; + +- switch (adev->ip_versions[GC_HWIP][0]) { ++ switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { + case IP_VERSION(12, 0, 0): + case IP_VERSION(12, 0, 1): + gfx_v12_0_update_gfx_clock_gating(adev, diff --git a/queue-6.12/drm-amdgpu-mmhub4.1-fix-ip-version-check.patch b/queue-6.12/drm-amdgpu-mmhub4.1-fix-ip-version-check.patch new file mode 100644 index 00000000000..4e4348a09b1 --- /dev/null +++ b/queue-6.12/drm-amdgpu-mmhub4.1-fix-ip-version-check.patch @@ -0,0 +1,36 @@ +From 6ebc5b92190e01dd48313b68cbf752c9adcfefa8 Mon Sep 17 00:00:00 2001 +From: Alex Deucher +Date: Thu, 12 Dec 2024 17:03:20 -0500 +Subject: drm/amdgpu/mmhub4.1: fix IP version check + +From: Alex Deucher + +commit 6ebc5b92190e01dd48313b68cbf752c9adcfefa8 upstream. + +Use the helper function rather than reading it directly. 
+ +Reviewed-by: Yang Wang +Signed-off-by: Alex Deucher +(cherry picked from commit 63bfd24088b42c6f55c2096bfc41b50213d419b2) +Cc: stable@vger.kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c +index 0fbc3be81f14..f2ab5001b492 100644 +--- a/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/mmhub_v4_1_0.c +@@ -108,7 +108,7 @@ mmhub_v4_1_0_print_l2_protection_fault_status(struct amdgpu_device *adev, + dev_err(adev->dev, + "MMVM_L2_PROTECTION_FAULT_STATUS_LO32:0x%08X\n", + status); +- switch (adev->ip_versions[MMHUB_HWIP][0]) { ++ switch (amdgpu_ip_version(adev, MMHUB_HWIP, 0)) { + case IP_VERSION(4, 1, 0): + mmhub_cid = mmhub_client_ids_v4_1_0[cid][rw]; + break; +-- +2.47.1 + diff --git a/queue-6.12/drm-amdgpu-nbio7.0-fix-ip-version-check.patch b/queue-6.12/drm-amdgpu-nbio7.0-fix-ip-version-check.patch new file mode 100644 index 00000000000..ec4fa6e057a --- /dev/null +++ b/queue-6.12/drm-amdgpu-nbio7.0-fix-ip-version-check.patch @@ -0,0 +1,36 @@ +From 3abb660f9e18925468685591a3702bda05faba4f Mon Sep 17 00:00:00 2001 +From: Alex Deucher +Date: Thu, 12 Dec 2024 16:49:20 -0500 +Subject: drm/amdgpu/nbio7.0: fix IP version check + +From: Alex Deucher + +commit 3abb660f9e18925468685591a3702bda05faba4f upstream. + +Use the helper function rather than reading it directly. 
+ +Reviewed-by: Yang Wang +Signed-off-by: Alex Deucher +(cherry picked from commit 0ec43fbece784215d3c4469973e4556d70bce915) +Cc: stable@vger.kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c +index 49e953f86ced..d1032e9992b4 100644 +--- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c ++++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_0.c +@@ -278,7 +278,7 @@ static void nbio_v7_0_init_registers(struct amdgpu_device *adev) + { + uint32_t data; + +- switch (adev->ip_versions[NBIO_HWIP][0]) { ++ switch (amdgpu_ip_version(adev, NBIO_HWIP, 0)) { + case IP_VERSION(2, 5, 0): + data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4) & ~BIT(23); + WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF6_STRAP4, data); +-- +2.47.1 + diff --git a/queue-6.12/fgraph-still-initialize-idle-shadow-stacks-when-starting.patch b/queue-6.12/fgraph-still-initialize-idle-shadow-stacks-when-starting.patch new file mode 100644 index 00000000000..99c3845f6da --- /dev/null +++ b/queue-6.12/fgraph-still-initialize-idle-shadow-stacks-when-starting.patch @@ -0,0 +1,62 @@ +From cc252bb592638e0f7aea40d580186c36d89526b8 Mon Sep 17 00:00:00 2001 +From: Steven Rostedt +Date: Wed, 11 Dec 2024 13:53:35 -0500 +Subject: fgraph: Still initialize idle shadow stacks when starting + +From: Steven Rostedt + +commit cc252bb592638e0f7aea40d580186c36d89526b8 upstream. + +A bug was discovered where the idle shadow stacks were not initialized +for offline CPUs when starting function graph tracer, and when they came +online they were not traced due to the missing shadow stack. To fix +this, the idle task shadow stack initialization was moved to using the +CPU hotplug callbacks. But it removed the initialization when the +function graph was enabled. 
The problem here is that the hotplug +callbacks are called when the CPUs come online, but the idle shadow +stack initialization only happens if function graph is currently +active. This caused the online CPUs to not get their shadow stack +initialized. + +The idle shadow stack initialization still needs to be done when the +function graph is registered, as they will not be allocated if function +graph is not registered. + +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Mathieu Desnoyers +Link: https://lore.kernel.org/20241211135335.094ba282@batman.local.home +Fixes: 2c02f7375e65 ("fgraph: Use CPU hotplug mechanism to initialize idle shadow stacks") +Reported-by: Linus Walleij +Tested-by: Linus Walleij +Closes: https://lore.kernel.org/all/CACRpkdaTBrHwRbbrphVy-=SeDz6MSsXhTKypOtLrTQ+DgGAOcQ@mail.gmail.com/ +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/fgraph.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/kernel/trace/fgraph.c ++++ b/kernel/trace/fgraph.c +@@ -1160,7 +1160,7 @@ void fgraph_update_pid_func(void) + static int start_graph_tracing(void) + { + unsigned long **ret_stack_list; +- int ret; ++ int ret, cpu; + + ret_stack_list = kcalloc(FTRACE_RETSTACK_ALLOC_SIZE, + sizeof(*ret_stack_list), GFP_KERNEL); +@@ -1168,6 +1168,12 @@ static int start_graph_tracing(void) + if (!ret_stack_list) + return -ENOMEM; + ++ /* The cpu_boot init_task->ret_stack will never be freed */ ++ for_each_online_cpu(cpu) { ++ if (!idle_task(cpu)->ret_stack) ++ ftrace_graph_init_idle_task(idle_task(cpu), cpu); ++ } ++ + do { + ret = alloc_retstack_tasklist(ret_stack_list); + } while (ret == -EAGAIN); diff --git a/queue-6.12/io_uring-check-if-iowq-is-killed-before-queuing.patch b/queue-6.12/io_uring-check-if-iowq-is-killed-before-queuing.patch new file mode 100644 index 00000000000..83e79fbc0ba --- /dev/null +++ b/queue-6.12/io_uring-check-if-iowq-is-killed-before-queuing.patch @@ -0,0 +1,46 @@ +From 
dbd2ca9367eb19bc5e269b8c58b0b1514ada9156 Mon Sep 17 00:00:00 2001 +From: Pavel Begunkov +Date: Thu, 19 Dec 2024 19:52:58 +0000 +Subject: io_uring: check if iowq is killed before queuing + +From: Pavel Begunkov + +commit dbd2ca9367eb19bc5e269b8c58b0b1514ada9156 upstream. + +task work can be executed after the task has gone through io_uring +termination, whether it's the final task_work run or the fallback path. +In this case, task work will find ->io_wq being already killed and +null'ed, which is a problem if it then tries to forward the request to +io_queue_iowq(). Make io_queue_iowq() fail requests in this case. + +Note that it also checks PF_KTHREAD, because the user can first close +a DEFER_TASKRUN ring and shortly after kill the task, in which case +->iowq check would race. + +Cc: stable@vger.kernel.org +Fixes: 50c52250e2d74 ("block: implement async io_uring discard cmd") +Fixes: 773af69121ecc ("io_uring: always reissue from task_work context") +Reported-by: Will +Signed-off-by: Pavel Begunkov +Link: https://lore.kernel.org/r/63312b4a2c2bb67ad67b857d17a300e1d3b078e8.1734637909.git.asml.silence@gmail.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/io_uring.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -515,7 +515,11 @@ static void io_queue_iowq(struct io_kioc + struct io_uring_task *tctx = req->task->io_uring; + + BUG_ON(!tctx); +- BUG_ON(!tctx->io_wq); ++ ++ if ((current->flags & PF_KTHREAD) || !tctx->io_wq) { ++ io_req_task_queue_fail(req, -ECANCELED); ++ return; ++ } + + /* init ->work of the whole link before punting */ + io_prep_async_link(req); diff --git a/queue-6.12/io_uring-fix-registered-ring-file-refcount-leak.patch b/queue-6.12/io_uring-fix-registered-ring-file-refcount-leak.patch new file mode 100644 index 00000000000..243e6ed60ad --- /dev/null +++ b/queue-6.12/io_uring-fix-registered-ring-file-refcount-leak.patch @@ -0,0 +1,64 @@ +From 
12d908116f7efd34f255a482b9afc729d7a5fb78 Mon Sep 17 00:00:00 2001 +From: Jann Horn +Date: Wed, 18 Dec 2024 17:56:25 +0100 +Subject: io_uring: Fix registered ring file refcount leak + +From: Jann Horn + +commit 12d908116f7efd34f255a482b9afc729d7a5fb78 upstream. + +Currently, io_uring_unreg_ringfd() (which cleans up registered rings) is +only called on exit, but __io_uring_free (which frees the tctx in which the +registered ring pointers are stored) is also called on execve (via +begin_new_exec -> io_uring_task_cancel -> __io_uring_cancel -> +io_uring_cancel_generic -> __io_uring_free). + +This means: A process going through execve while having registered rings +will leak references to the rings' `struct file`. + +Fix it by zapping registered rings on execve(). This is implemented by +moving the io_uring_unreg_ringfd() from io_uring_files_cancel() into its +callee __io_uring_cancel(), which is called from io_uring_task_cancel() on +execve. + +This could probably be exploited *on 32-bit kernels* by leaking 2^32 +references to the same ring, because the file refcount is stored in a +pointer-sized field and get_file() doesn't have protection against +refcount overflow, just a WARN_ONCE(); but on 64-bit it should have no +impact beyond a memory leak. 
+ +Cc: stable@vger.kernel.org +Fixes: e7a6c00dc77a ("io_uring: add support for registering ring file descriptors") +Signed-off-by: Jann Horn +Link: https://lore.kernel.org/r/20241218-uring-reg-ring-cleanup-v1-1-8f63e999045b@google.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/io_uring.h | 4 +--- + io_uring/io_uring.c | 1 + + 2 files changed, 2 insertions(+), 3 deletions(-) + +--- a/include/linux/io_uring.h ++++ b/include/linux/io_uring.h +@@ -15,10 +15,8 @@ bool io_is_uring_fops(struct file *file) + + static inline void io_uring_files_cancel(void) + { +- if (current->io_uring) { +- io_uring_unreg_ringfd(); ++ if (current->io_uring) + __io_uring_cancel(false); +- } + } + static inline void io_uring_task_cancel(void) + { +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -3230,6 +3230,7 @@ end_wait: + + void __io_uring_cancel(bool cancel_all) + { ++ io_uring_unreg_ringfd(); + io_uring_cancel_generic(cancel_all, NULL); + } + diff --git a/queue-6.12/kvm-x86-play-nice-with-protected-guests-in-complete_hypercall_exit.patch b/queue-6.12/kvm-x86-play-nice-with-protected-guests-in-complete_hypercall_exit.patch new file mode 100644 index 00000000000..0be2d3503e1 --- /dev/null +++ b/queue-6.12/kvm-x86-play-nice-with-protected-guests-in-complete_hypercall_exit.patch @@ -0,0 +1,59 @@ +From 9b42d1e8e4fe9dc631162c04caa69b0d1860b0f0 Mon Sep 17 00:00:00 2001 +From: Sean Christopherson +Date: Wed, 27 Nov 2024 16:43:39 -0800 +Subject: KVM: x86: Play nice with protected guests in complete_hypercall_exit() + +From: Sean Christopherson + +commit 9b42d1e8e4fe9dc631162c04caa69b0d1860b0f0 upstream. + +Use is_64_bit_hypercall() instead of is_64_bit_mode() to detect a 64-bit +hypercall when completing said hypercall. For guests with protected state, +e.g. SEV-ES and SEV-SNP, KVM must assume the hypercall was made in 64-bit +mode as the vCPU state needed to detect 64-bit mode is unavailable. 
+ +Hacking the sev_smoke_test selftest to generate a KVM_HC_MAP_GPA_RANGE +hypercall via VMGEXIT trips the WARN: + + ------------[ cut here ]------------ + WARNING: CPU: 273 PID: 326626 at arch/x86/kvm/x86.h:180 complete_hypercall_exit+0x44/0xe0 [kvm] + Modules linked in: kvm_amd kvm ... [last unloaded: kvm] + CPU: 273 UID: 0 PID: 326626 Comm: sev_smoke_test Not tainted 6.12.0-smp--392e932fa0f3-feat #470 + Hardware name: Google Astoria/astoria, BIOS 0.20240617.0-0 06/17/2024 + RIP: 0010:complete_hypercall_exit+0x44/0xe0 [kvm] + Call Trace: + + kvm_arch_vcpu_ioctl_run+0x2400/0x2720 [kvm] + kvm_vcpu_ioctl+0x54f/0x630 [kvm] + __se_sys_ioctl+0x6b/0xc0 + do_syscall_64+0x83/0x160 + entry_SYSCALL_64_after_hwframe+0x76/0x7e + + ---[ end trace 0000000000000000 ]--- + +Fixes: b5aead0064f3 ("KVM: x86: Assume a 64-bit hypercall for guests with protected state") +Cc: stable@vger.kernel.org +Cc: Tom Lendacky +Reviewed-by: Xiaoyao Li +Reviewed-by: Nikunj A Dadhania +Reviewed-by: Tom Lendacky +Reviewed-by: Binbin Wu +Reviewed-by: Kai Huang +Link: https://lore.kernel.org/r/20241128004344.4072099-2-seanjc@google.com +Signed-off-by: Sean Christopherson +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kvm/x86.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -9991,7 +9991,7 @@ static int complete_hypercall_exit(struc + { + u64 ret = vcpu->run->hypercall.ret; + +- if (!is_64_bit_mode(vcpu)) ++ if (!is_64_bit_hypercall(vcpu)) + ret = (u32)ret; + kvm_rax_write(vcpu, ret); + ++vcpu->stat.hypercalls; diff --git a/queue-6.12/s390-mm-fix-directmap-accounting.patch b/queue-6.12/s390-mm-fix-directmap-accounting.patch new file mode 100644 index 00000000000..da8332af480 --- /dev/null +++ b/queue-6.12/s390-mm-fix-directmap-accounting.patch @@ -0,0 +1,64 @@ +From 41856638e6c4ed51d8aa9e54f70059d1e357b46e Mon Sep 17 00:00:00 2001 +From: Heiko Carstens +Date: Fri, 29 Nov 2024 17:39:27 +0100 +Subject: s390/mm: Fix DirectMap accounting 
+ +From: Heiko Carstens + +commit 41856638e6c4ed51d8aa9e54f70059d1e357b46e upstream. + +With uncoupling of physical and virtual address spaces population of +the identity mapping was changed to use the type POPULATE_IDENTITY +instead of POPULATE_DIRECT. This breaks DirectMap accounting: + +> cat /proc/meminfo +DirectMap4k: 55296 kB +DirectMap1M: 18446744073709496320 kB + +Adjust all locations of update_page_count() in vmem.c to use +POPULATE_IDENTITY instead of POPULATE_DIRECT as well. With this +accounting is correct again: + +> cat /proc/meminfo +DirectMap4k: 54264 kB +DirectMap1M: 8334336 kB + +Fixes: c98d2ecae08f ("s390/mm: Uncouple physical vs virtual address spaces") +Cc: stable@vger.kernel.org +Reviewed-by: Alexander Gordeev +Signed-off-by: Heiko Carstens +Signed-off-by: Alexander Gordeev +Signed-off-by: Greg Kroah-Hartman +--- + arch/s390/boot/vmem.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +--- a/arch/s390/boot/vmem.c ++++ b/arch/s390/boot/vmem.c +@@ -306,7 +306,7 @@ static void pgtable_pte_populate(pmd_t * + pages++; + } + } +- if (mode == POPULATE_DIRECT) ++ if (mode == POPULATE_IDENTITY) + update_page_count(PG_DIRECT_MAP_4K, pages); + } + +@@ -339,7 +339,7 @@ static void pgtable_pmd_populate(pud_t * + } + pgtable_pte_populate(pmd, addr, next, mode); + } +- if (mode == POPULATE_DIRECT) ++ if (mode == POPULATE_IDENTITY) + update_page_count(PG_DIRECT_MAP_1M, pages); + } + +@@ -372,7 +372,7 @@ static void pgtable_pud_populate(p4d_t * + } + pgtable_pmd_populate(pud, addr, next, mode); + } +- if (mode == POPULATE_DIRECT) ++ if (mode == POPULATE_IDENTITY) + update_page_count(PG_DIRECT_MAP_2G, pages); + } + diff --git a/queue-6.12/selftests-bpf-use-asm-constraint-m-for-loongarch.patch b/queue-6.12/selftests-bpf-use-asm-constraint-m-for-loongarch.patch new file mode 100644 index 00000000000..a491f2993a5 --- /dev/null +++ b/queue-6.12/selftests-bpf-use-asm-constraint-m-for-loongarch.patch @@ -0,0 +1,40 @@ +From 
29d44cce324dab2bd86c447071a596262e7109b6 Mon Sep 17 00:00:00 2001 +From: Tiezhu Yang +Date: Thu, 19 Dec 2024 19:15:06 +0800 +Subject: selftests/bpf: Use asm constraint "m" for LoongArch + +From: Tiezhu Yang + +commit 29d44cce324dab2bd86c447071a596262e7109b6 upstream. + +Currently, LoongArch LLVM does not support the constraint "o" and no plan +to support it, it only supports the similar constraint "m", so change the +constraints from "nor" in the "else" case to arch-specific "nmr" to avoid +the build error such as "unexpected asm memory constraint" for LoongArch. + +Fixes: 630301b0d59d ("selftests/bpf: Add basic USDT selftests") +Suggested-by: Weining Lu +Suggested-by: Li Chen +Signed-off-by: Tiezhu Yang +Signed-off-by: Daniel Borkmann +Reviewed-by: Huacai Chen +Cc: stable@vger.kernel.org +Link: https://llvm.org/docs/LangRef.html#supported-constraint-code-list +Link: https://github.com/llvm/llvm-project/blob/main/llvm/lib/Target/LoongArch/LoongArchISelDAGToDAG.cpp#L172 +Link: https://lore.kernel.org/bpf/20241219111506.20643-1-yangtiezhu@loongson.cn +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/bpf/sdt.h | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/tools/testing/selftests/bpf/sdt.h ++++ b/tools/testing/selftests/bpf/sdt.h +@@ -102,6 +102,8 @@ + # define STAP_SDT_ARG_CONSTRAINT nZr + # elif defined __arm__ + # define STAP_SDT_ARG_CONSTRAINT g ++# elif defined __loongarch__ ++# define STAP_SDT_ARG_CONSTRAINT nmr + # else + # define STAP_SDT_ARG_CONSTRAINT nor + # endif diff --git a/queue-6.12/selftests-memfd-run-sysctl-tests-when-pid-namespace-support-is-enabled.patch b/queue-6.12/selftests-memfd-run-sysctl-tests-when-pid-namespace-support-is-enabled.patch new file mode 100644 index 00000000000..1bc0f11b6a2 --- /dev/null +++ b/queue-6.12/selftests-memfd-run-sysctl-tests-when-pid-namespace-support-is-enabled.patch @@ -0,0 +1,71 @@ +From 6a75f19af16ff482cfd6085c77123aa0f464f8dd Mon Sep 17 00:00:00 2001 +From: "Isaac J. 
Manjarres" +Date: Thu, 5 Dec 2024 11:29:41 -0800 +Subject: selftests/memfd: run sysctl tests when PID namespace support is enabled + +From: Isaac J. Manjarres + +commit 6a75f19af16ff482cfd6085c77123aa0f464f8dd upstream. + +The sysctl tests for vm.memfd_noexec rely on the kernel to support PID +namespaces (i.e. the kernel is built with CONFIG_PID_NS=y). If the +kernel the test runs on does not support PID namespaces, the first sysctl +test will fail when attempting to spawn a new thread in a new PID +namespace, abort the test, preventing the remaining tests from being run. + +This is not desirable, as not all kernels need PID namespaces, but can +still use the other features provided by memfd. Therefore, only run the +sysctl tests if the kernel supports PID namespaces. Otherwise, skip those +tests and emit an informative message to let the user know why the sysctl +tests are not being run. + +Link: https://lkml.kernel.org/r/20241205192943.3228757-1-isaacmanjarres@google.com +Fixes: 11f75a01448f ("selftests/memfd: add tests for MFD_NOEXEC_SEAL MFD_EXEC") +Signed-off-by: Isaac J. 
Manjarres +Reviewed-by: Jeff Xu +Cc: Suren Baghdasaryan +Cc: Kalesh Singh +Cc: [6.6+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/memfd/memfd_test.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +--- a/tools/testing/selftests/memfd/memfd_test.c ++++ b/tools/testing/selftests/memfd/memfd_test.c +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -1557,6 +1558,11 @@ static void test_share_fork(char *banner + close(fd); + } + ++static bool pid_ns_supported(void) ++{ ++ return access("/proc/self/ns/pid", F_OK) == 0; ++} ++ + int main(int argc, char **argv) + { + pid_t pid; +@@ -1591,8 +1597,12 @@ int main(int argc, char **argv) + test_seal_grow(); + test_seal_resize(); + +- test_sysctl_simple(); +- test_sysctl_nested(); ++ if (pid_ns_supported()) { ++ test_sysctl_simple(); ++ test_sysctl_nested(); ++ } else { ++ printf("PID namespaces are not supported; skipping sysctl tests\n"); ++ } + + test_share_dup("SHARE-DUP", ""); + test_share_mmap("SHARE-MMAP", ""); diff --git a/queue-6.12/series b/queue-6.12/series index 0c067e42fb6..7e7eb47683a 100644 --- a/queue-6.12/series +++ b/queue-6.12/series @@ -112,4 +112,26 @@ vmalloc-fix-accounting-with-i915.patch mm-page_alloc-don-t-call-pfn_to_page-on-possibly-non-existent-pfn-in-split_large_buddy.patch ring-buffer-fix-overflow-in-__rb_map_vma.patch alloc_tag-fix-set_codetag_empty-when-config_mem_alloc_profiling_debug.patch -cxl-pci-fix-potential-bogus-return-value-upon-successful-probing.patch +btrfs-split-bios-to-the-fs-sector-size-boundary.patch +btrfs-fix-improper-generation-check-in-snapshot-delete.patch +btrfs-tree-checker-reject-inline-extent-items-with-0-ref-count.patch +s390-mm-fix-directmap-accounting.patch +drm-amdgpu-nbio7.0-fix-ip-version-check.patch +drm-amdgpu-gfx12-fix-ip-version-check.patch +drm-amdgpu-mmhub4.1-fix-ip-version-check.patch 
+fgraph-still-initialize-idle-shadow-stacks-when-starting.patch +drivers-hv-util-avoid-accessing-a-ringbuffer-not-initialized-yet.patch +tools-hv-fix-a-complier-warning-in-the-fcopy-uio-daemon.patch +x86-hyperv-fix-hv-tsc-page-based-sched_clock-for-hibernation.patch +kvm-x86-play-nice-with-protected-guests-in-complete_hypercall_exit.patch +smb-client-fix-tcp-timers-deadlock-after-rmmod.patch +accel-ivpu-fix-general-protection-fault-in-ivpu_bo_list.patch +accel-ivpu-fix-warn-in-ivpu_ipc_send_receive_internal.patch +tracing-fix-test_event_printk-to-process-entire-print-argument.patch +tracing-add-missing-helper-functions-in-event-pointer-dereference-check.patch +tracing-add-s-check-in-test_event_printk.patch +tracing-check-s-dereference-via-the-field-and-not-the-tp_printk-format.patch +selftests-memfd-run-sysctl-tests-when-pid-namespace-support-is-enabled.patch +selftests-bpf-use-asm-constraint-m-for-loongarch.patch +io_uring-fix-registered-ring-file-refcount-leak.patch +io_uring-check-if-iowq-is-killed-before-queuing.patch diff --git a/queue-6.12/smb-client-fix-tcp-timers-deadlock-after-rmmod.patch b/queue-6.12/smb-client-fix-tcp-timers-deadlock-after-rmmod.patch new file mode 100644 index 00000000000..1b305a57050 --- /dev/null +++ b/queue-6.12/smb-client-fix-tcp-timers-deadlock-after-rmmod.patch @@ -0,0 +1,182 @@ +From e9f2517a3e18a54a3943c098d2226b245d488801 Mon Sep 17 00:00:00 2001 +From: Enzo Matsumiya +Date: Tue, 10 Dec 2024 18:15:12 -0300 +Subject: smb: client: fix TCP timers deadlock after rmmod + +From: Enzo Matsumiya + +commit e9f2517a3e18a54a3943c098d2226b245d488801 upstream. + +Commit ef7134c7fc48 ("smb: client: Fix use-after-free of network namespace.") +fixed a netns UAF by manually enabled socket refcounting +(sk->sk_net_refcnt=1 and sock_inuse_add(net, 1)). 
+ +The reason the patch worked for that bug was because we now hold +references to the netns (get_net_track() gets a ref internally) +and they're properly released (internally, on __sk_destruct()), +but only because sk->sk_net_refcnt was set. + +Problem: +(this happens regardless of CONFIG_NET_NS_REFCNT_TRACKER and regardless +if init_net or other) + +Setting sk->sk_net_refcnt=1 *manually* and *after* socket creation is not +only out of cifs scope, but also technically wrong -- it's set conditionally +based on user (=1) vs kernel (=0) sockets. And net/ implementations +seem to base their user vs kernel space operations on it. + +e.g. upon TCP socket close, the TCP timers are not cleared because +sk->sk_net_refcnt=1: +(cf. commit 151c9c724d05 ("tcp: properly terminate timers for kernel sockets")) + +net/ipv4/tcp.c: + void tcp_close(struct sock *sk, long timeout) + { + lock_sock(sk); + __tcp_close(sk, timeout); + release_sock(sk); + if (!sk->sk_net_refcnt) + inet_csk_clear_xmit_timers_sync(sk); + sock_put(sk); + } + +Which will throw a lockdep warning and then, as expected, deadlock on +tcp_write_timer(). + +A way to reproduce this is by running the reproducer from ef7134c7fc48 +and then 'rmmod cifs'. A few seconds later, the deadlock/lockdep +warning shows up. + +Fix: +We shouldn't mess with socket internals ourselves, so do not set +sk_net_refcnt manually. + +Also change __sock_create() to sock_create_kern() for explicitness. + +As for non-init_net network namespaces, we deal with it the best way +we can -- hold an extra netns reference for server->ssocket and drop it +when it's released. This ensures that the netns still exists whenever +we need to create/destroy server->ssocket, but is not directly tied to +it. 
+ +Fixes: ef7134c7fc48 ("smb: client: Fix use-after-free of network namespace.") +Cc: stable@vger.kernel.org +Signed-off-by: Enzo Matsumiya +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman +--- + fs/smb/client/connect.c | 36 ++++++++++++++++++++++++++---------- + 1 file changed, 26 insertions(+), 10 deletions(-) + +--- a/fs/smb/client/connect.c ++++ b/fs/smb/client/connect.c +@@ -987,9 +987,13 @@ clean_demultiplex_info(struct TCP_Server + msleep(125); + if (cifs_rdma_enabled(server)) + smbd_destroy(server); ++ + if (server->ssocket) { + sock_release(server->ssocket); + server->ssocket = NULL; ++ ++ /* Release netns reference for the socket. */ ++ put_net(cifs_net_ns(server)); + } + + if (!list_empty(&server->pending_mid_q)) { +@@ -1037,6 +1041,7 @@ clean_demultiplex_info(struct TCP_Server + */ + } + ++ /* Release netns reference for this server. */ + put_net(cifs_net_ns(server)); + kfree(server->leaf_fullpath); + kfree(server); +@@ -1713,6 +1718,8 @@ cifs_get_tcp_session(struct smb3_fs_cont + + tcp_ses->ops = ctx->ops; + tcp_ses->vals = ctx->vals; ++ ++ /* Grab netns reference for this server. */ + cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); + + tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId); +@@ -1844,6 +1851,7 @@ smbd_connected: + out_err_crypto_release: + cifs_crypto_secmech_release(tcp_ses); + ++ /* Release netns reference for this server. 
*/ + put_net(cifs_net_ns(tcp_ses)); + + out_err: +@@ -1852,8 +1860,10 @@ out_err: + cifs_put_tcp_session(tcp_ses->primary_server, false); + kfree(tcp_ses->hostname); + kfree(tcp_ses->leaf_fullpath); +- if (tcp_ses->ssocket) ++ if (tcp_ses->ssocket) { + sock_release(tcp_ses->ssocket); ++ put_net(cifs_net_ns(tcp_ses)); ++ } + kfree(tcp_ses); + } + return ERR_PTR(rc); +@@ -3111,20 +3121,20 @@ generic_ip_connect(struct TCP_Server_Inf + socket = server->ssocket; + } else { + struct net *net = cifs_net_ns(server); +- struct sock *sk; + +- rc = __sock_create(net, sfamily, SOCK_STREAM, +- IPPROTO_TCP, &server->ssocket, 1); ++ rc = sock_create_kern(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket); + if (rc < 0) { + cifs_server_dbg(VFS, "Error %d creating socket\n", rc); + return rc; + } + +- sk = server->ssocket->sk; +- __netns_tracker_free(net, &sk->ns_tracker, false); +- sk->sk_net_refcnt = 1; +- get_net_track(net, &sk->ns_tracker, GFP_KERNEL); +- sock_inuse_add(net, 1); ++ /* ++ * Grab netns reference for the socket. ++ * ++ * It'll be released here, on error, or in clean_demultiplex_info() upon server ++ * teardown. ++ */ ++ get_net(net); + + /* BB other socket options to set KEEPALIVE, NODELAY? 
*/ + cifs_dbg(FYI, "Socket created\n"); +@@ -3138,8 +3148,10 @@ generic_ip_connect(struct TCP_Server_Inf + } + + rc = bind_socket(server); +- if (rc < 0) ++ if (rc < 0) { ++ put_net(cifs_net_ns(server)); + return rc; ++ } + + /* + * Eventually check for other socket options to change from +@@ -3176,6 +3188,7 @@ generic_ip_connect(struct TCP_Server_Inf + if (rc < 0) { + cifs_dbg(FYI, "Error %d connecting to server\n", rc); + trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc); ++ put_net(cifs_net_ns(server)); + sock_release(socket); + server->ssocket = NULL; + return rc; +@@ -3184,6 +3197,9 @@ generic_ip_connect(struct TCP_Server_Inf + if (sport == htons(RFC1001_PORT)) + rc = ip_rfc1001_connect(server); + ++ if (rc < 0) ++ put_net(cifs_net_ns(server)); ++ + return rc; + } + diff --git a/queue-6.12/tools-hv-fix-a-complier-warning-in-the-fcopy-uio-daemon.patch b/queue-6.12/tools-hv-fix-a-complier-warning-in-the-fcopy-uio-daemon.patch new file mode 100644 index 00000000000..386a901d78f --- /dev/null +++ b/queue-6.12/tools-hv-fix-a-complier-warning-in-the-fcopy-uio-daemon.patch @@ -0,0 +1,63 @@ +From cb1b78f1c726c938bd47497c1ab16b01ce967f37 Mon Sep 17 00:00:00 2001 +From: Dexuan Cui +Date: Tue, 10 Sep 2024 00:44:32 +0000 +Subject: tools: hv: Fix a complier warning in the fcopy uio daemon + +From: Dexuan Cui + +commit cb1b78f1c726c938bd47497c1ab16b01ce967f37 upstream. + +hv_fcopy_uio_daemon.c:436:53: warning: '%s' directive output may be truncated +writing up to 14 bytes into a region of size 10 [-Wformat-truncation=] + 436 | snprintf(uio_dev_path, sizeof(uio_dev_path), "/dev/%s", uio_name); + +Also added 'static' for the array 'desc[]'. 
+ +Fixes: 82b0945ce2c2 ("tools: hv: Add new fcopy application based on uio driver") +Cc: stable@vger.kernel.org # 6.10+ +Signed-off-by: Dexuan Cui +Reviewed-by: Saurabh Sengar +Link: https://lore.kernel.org/r/20240910004433.50254-1-decui@microsoft.com +Signed-off-by: Wei Liu +Message-ID: <20240910004433.50254-1-decui@microsoft.com> +Signed-off-by: Greg Kroah-Hartman +--- + tools/hv/hv_fcopy_uio_daemon.c | 8 +++----- + 1 file changed, 3 insertions(+), 5 deletions(-) + +diff --git a/tools/hv/hv_fcopy_uio_daemon.c b/tools/hv/hv_fcopy_uio_daemon.c +index 7a00f3066a98..12743d7f164f 100644 +--- a/tools/hv/hv_fcopy_uio_daemon.c ++++ b/tools/hv/hv_fcopy_uio_daemon.c +@@ -35,8 +35,6 @@ + #define WIN8_SRV_MINOR 1 + #define WIN8_SRV_VERSION (WIN8_SRV_MAJOR << 16 | WIN8_SRV_MINOR) + +-#define MAX_FOLDER_NAME 15 +-#define MAX_PATH_LEN 15 + #define FCOPY_UIO "/sys/bus/vmbus/devices/eb765408-105f-49b6-b4aa-c123b64d17d4/uio" + + #define FCOPY_VER_COUNT 1 +@@ -51,7 +49,7 @@ static const int fw_versions[] = { + + #define HV_RING_SIZE 0x4000 /* 16KB ring buffer size */ + +-unsigned char desc[HV_RING_SIZE]; ++static unsigned char desc[HV_RING_SIZE]; + + static int target_fd; + static char target_fname[PATH_MAX]; +@@ -409,8 +407,8 @@ int main(int argc, char *argv[]) + struct vmbus_br txbr, rxbr; + void *ring; + uint32_t len = HV_RING_SIZE; +- char uio_name[MAX_FOLDER_NAME] = {0}; +- char uio_dev_path[MAX_PATH_LEN] = {0}; ++ char uio_name[NAME_MAX] = {0}; ++ char uio_dev_path[PATH_MAX] = {0}; + + static struct option long_options[] = { + {"help", no_argument, 0, 'h' }, +-- +2.47.1 + diff --git a/queue-6.12/tracing-add-missing-helper-functions-in-event-pointer-dereference-check.patch b/queue-6.12/tracing-add-missing-helper-functions-in-event-pointer-dereference-check.patch new file mode 100644 index 00000000000..5ea51045135 --- /dev/null +++ b/queue-6.12/tracing-add-missing-helper-functions-in-event-pointer-dereference-check.patch @@ -0,0 +1,78 @@ +From 
917110481f6bc1c96b1e54b62bb114137fbc6d17 Mon Sep 17 00:00:00 2001 +From: Steven Rostedt +Date: Mon, 16 Dec 2024 21:41:20 -0500 +Subject: tracing: Add missing helper functions in event pointer dereference check + +From: Steven Rostedt + +commit 917110481f6bc1c96b1e54b62bb114137fbc6d17 upstream. + +The process_pointer() helper function looks to see if various trace event +macros are used. These macros are for storing data in the event. This +makes it safe to dereference as the dereference will then point into the +event on the ring buffer where the content of the data stays with the +event itself. + +A few helper functions were missing. Those were: + + __get_rel_dynamic_array() + __get_dynamic_array_len() + __get_rel_dynamic_array_len() + __get_rel_sockaddr() + +Also add a helper function find_print_string() to not need to use a middle +man variable to test if the string exists. + +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Mark Rutland +Cc: Mathieu Desnoyers +Cc: Andrew Morton +Cc: Al Viro +Cc: Linus Torvalds +Link: https://lore.kernel.org/20241217024720.521836792@goodmis.org +Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace_events.c | 21 +++++++++++++++++++-- + 1 file changed, 19 insertions(+), 2 deletions(-) + +--- a/kernel/trace/trace_events.c ++++ b/kernel/trace/trace_events.c +@@ -274,6 +274,15 @@ static bool test_field(const char *fmt, + return false; + } + ++/* Look for a string within an argument */ ++static bool find_print_string(const char *arg, const char *str, const char *end) ++{ ++ const char *r; ++ ++ r = strstr(arg, str); ++ return r && r < end; ++} ++ + /* Return true if the argument pointer is safe */ + static bool process_pointer(const char *fmt, int len, struct trace_event_call *call) + { +@@ -292,9 +301,17 @@ static bool process_pointer(const char * + a = strchr(fmt, '&'); + if ((a && (a 
< r)) || test_field(r, call)) + return true; +- } else if ((r = strstr(fmt, "__get_dynamic_array(")) && r < e) { ++ } else if (find_print_string(fmt, "__get_dynamic_array(", e)) { ++ return true; ++ } else if (find_print_string(fmt, "__get_rel_dynamic_array(", e)) { ++ return true; ++ } else if (find_print_string(fmt, "__get_dynamic_array_len(", e)) { ++ return true; ++ } else if (find_print_string(fmt, "__get_rel_dynamic_array_len(", e)) { ++ return true; ++ } else if (find_print_string(fmt, "__get_sockaddr(", e)) { + return true; +- } else if ((r = strstr(fmt, "__get_sockaddr(")) && r < e) { ++ } else if (find_print_string(fmt, "__get_rel_sockaddr(", e)) { + return true; + } + return false; diff --git a/queue-6.12/tracing-add-s-check-in-test_event_printk.patch b/queue-6.12/tracing-add-s-check-in-test_event_printk.patch new file mode 100644 index 00000000000..5c30d998837 --- /dev/null +++ b/queue-6.12/tracing-add-s-check-in-test_event_printk.patch @@ -0,0 +1,206 @@ +From 65a25d9f7ac02e0cf361356e834d1c71d36acca9 Mon Sep 17 00:00:00 2001 +From: Steven Rostedt +Date: Mon, 16 Dec 2024 21:41:21 -0500 +Subject: tracing: Add "%s" check in test_event_printk() + +From: Steven Rostedt + +commit 65a25d9f7ac02e0cf361356e834d1c71d36acca9 upstream. + +The test_event_printk() code makes sure that when a trace event is +registered, any dereferenced pointers in from the event's TP_printk() are +pointing to content in the ring buffer. But currently it does not handle +"%s", as there's cases where the string pointer saved in the ring buffer +points to a static string in the kernel that will never be freed. As that +is a valid case, the pointer needs to be checked at runtime. + +Currently the runtime check is done via trace_check_vprintf(), but to not +have to replicate everything in vsnprintf() it does some logic with the +va_list that may not be reliable across architectures. In order to get rid +of that logic, more work in the test_event_printk() needs to be done. 
Some +of the strings can be validated at this time when it is obvious the string +is valid because the string will be saved in the ring buffer content. + +Do all the validation of strings in the ring buffer at boot in +test_event_printk(), and make sure that the field of the strings that +point into the kernel are accessible. This will allow adding checks at +runtime that will validate the fields themselves and not rely on parsing +the TP_printk() format at runtime. + +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Mark Rutland +Cc: Mathieu Desnoyers +Cc: Andrew Morton +Cc: Al Viro +Cc: Linus Torvalds +Link: https://lore.kernel.org/20241217024720.685917008@goodmis.org +Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace_events.c | 104 +++++++++++++++++++++++++++++++++++++------- + 1 file changed, 89 insertions(+), 15 deletions(-) + +--- a/kernel/trace/trace_events.c ++++ b/kernel/trace/trace_events.c +@@ -244,19 +244,16 @@ int trace_event_get_offsets(struct trace + return tail->offset + tail->size; + } + +-/* +- * Check if the referenced field is an array and return true, +- * as arrays are OK to dereference. 
+- */ +-static bool test_field(const char *fmt, struct trace_event_call *call) ++ ++static struct trace_event_fields *find_event_field(const char *fmt, ++ struct trace_event_call *call) + { + struct trace_event_fields *field = call->class->fields_array; +- const char *array_descriptor; + const char *p = fmt; + int len; + + if (!(len = str_has_prefix(fmt, "REC->"))) +- return false; ++ return NULL; + fmt += len; + for (p = fmt; *p; p++) { + if (!isalnum(*p) && *p != '_') +@@ -267,11 +264,26 @@ static bool test_field(const char *fmt, + for (; field->type; field++) { + if (strncmp(field->name, fmt, len) || field->name[len]) + continue; +- array_descriptor = strchr(field->type, '['); +- /* This is an array and is OK to dereference. */ +- return array_descriptor != NULL; ++ ++ return field; + } +- return false; ++ return NULL; ++} ++ ++/* ++ * Check if the referenced field is an array and return true, ++ * as arrays are OK to dereference. ++ */ ++static bool test_field(const char *fmt, struct trace_event_call *call) ++{ ++ struct trace_event_fields *field; ++ ++ field = find_event_field(fmt, call); ++ if (!field) ++ return false; ++ ++ /* This is an array and is OK to dereference. */ ++ return strchr(field->type, '[') != NULL; + } + + /* Look for a string within an argument */ +@@ -317,6 +329,53 @@ static bool process_pointer(const char * + return false; + } + ++/* Return true if the string is safe */ ++static bool process_string(const char *fmt, int len, struct trace_event_call *call) ++{ ++ const char *r, *e, *s; ++ ++ e = fmt + len; ++ ++ /* ++ * There are several helper functions that return strings. ++ * If the argument contains a function, then assume its field is valid. ++ * It is considered that the argument has a function if it has: ++ * alphanumeric or '_' before a parenthesis. 
++ */ ++ s = fmt; ++ do { ++ r = strstr(s, "("); ++ if (!r || r >= e) ++ break; ++ for (int i = 1; r - i >= s; i++) { ++ char ch = *(r - i); ++ if (isspace(ch)) ++ continue; ++ if (isalnum(ch) || ch == '_') ++ return true; ++ /* Anything else, this isn't a function */ ++ break; ++ } ++ /* A function could be wrapped in parethesis, try the next one */ ++ s = r + 1; ++ } while (s < e); ++ ++ /* ++ * If there's any strings in the argument consider this arg OK as it ++ * could be: REC->field ? "foo" : "bar" and we don't want to get into ++ * verifying that logic here. ++ */ ++ if (find_print_string(fmt, "\"", e)) ++ return true; ++ ++ /* Dereferenced strings are also valid like any other pointer */ ++ if (process_pointer(fmt, len, call)) ++ return true; ++ ++ /* Make sure the field is found, and consider it OK for now if it is */ ++ return find_event_field(fmt, call) != NULL; ++} ++ + /* + * Examine the print fmt of the event looking for unsafe dereference + * pointers using %p* that could be recorded in the trace event and +@@ -326,6 +385,7 @@ static bool process_pointer(const char * + static void test_event_printk(struct trace_event_call *call) + { + u64 dereference_flags = 0; ++ u64 string_flags = 0; + bool first = true; + const char *fmt; + int parens = 0; +@@ -416,8 +476,16 @@ static void test_event_printk(struct tra + star = true; + continue; + } +- if ((fmt[i + j] == 's') && star) +- arg++; ++ if ((fmt[i + j] == 's')) { ++ if (star) ++ arg++; ++ if (WARN_ONCE(arg == 63, ++ "Too many args for event: %s", ++ trace_event_name(call))) ++ return; ++ dereference_flags |= 1ULL << arg; ++ string_flags |= 1ULL << arg; ++ } + break; + } + break; +@@ -464,7 +532,10 @@ static void test_event_printk(struct tra + } + + if (dereference_flags & (1ULL << arg)) { +- if (process_pointer(fmt + start_arg, e - start_arg, call)) ++ if (string_flags & (1ULL << arg)) { ++ if (process_string(fmt + start_arg, e - start_arg, call)) ++ dereference_flags &= ~(1ULL << arg); ++ } else if 
(process_pointer(fmt + start_arg, e - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } + +@@ -476,7 +547,10 @@ static void test_event_printk(struct tra + } + + if (dereference_flags & (1ULL << arg)) { +- if (process_pointer(fmt + start_arg, i - start_arg, call)) ++ if (string_flags & (1ULL << arg)) { ++ if (process_string(fmt + start_arg, i - start_arg, call)) ++ dereference_flags &= ~(1ULL << arg); ++ } else if (process_pointer(fmt + start_arg, i - start_arg, call)) + dereference_flags &= ~(1ULL << arg); + } + diff --git a/queue-6.12/tracing-check-s-dereference-via-the-field-and-not-the-tp_printk-format.patch b/queue-6.12/tracing-check-s-dereference-via-the-field-and-not-the-tp_printk-format.patch new file mode 100644 index 00000000000..699c526aa69 --- /dev/null +++ b/queue-6.12/tracing-check-s-dereference-via-the-field-and-not-the-tp_printk-format.patch @@ -0,0 +1,589 @@ +From afd2627f727b89496d79a6b934a025fc916d4ded Mon Sep 17 00:00:00 2001 +From: Steven Rostedt +Date: Mon, 16 Dec 2024 21:41:22 -0500 +Subject: tracing: Check "%s" dereference via the field and not the TP_printk format + +From: Steven Rostedt + +commit afd2627f727b89496d79a6b934a025fc916d4ded upstream. + +The TP_printk() portion of a trace event is executed at the time an event +is read from the trace. This can happen seconds, minutes, hours, days, +months, years possibly later since the event was recorded. If the print +format contains a dereference to a string via "%s", and that string was +allocated, there's a chance that string could be freed before it is read +by the trace file. + +To protect against such bugs, there are two functions that verify the +event. The first one is test_event_printk(), which is called when the +event is created. It reads the TP_printk() format as well as its arguments +to make sure nothing may be dereferencing a pointer that was not copied +into the ring buffer along with the event. If it is, it will trigger a +WARN_ON(). 
+ +For strings that use "%s", it is not so easy. The string may not reside in +the ring buffer but may still be valid. Strings that are static and part +of the kernel proper which will not be freed for the life of the running +system, are safe to dereference. But to know if it is a pointer to a +static string or to something on the heap can not be determined until the +event is triggered. + +This brings us to the second function that tests for the bad dereferencing +of strings, trace_check_vprintf(). It would walk through the printf format +looking for "%s", and when it finds it, it would validate that the pointer +is safe to read. If not, it would produce a WARN_ON() as well and write +into the ring buffer "[UNSAFE-MEMORY]". + +The problem with this is how it used va_list to have vsnprintf() handle +all the cases that it didn't need to check. Instead of re-implementing +vsnprintf(), it would make a copy of the format up to the %s part, and +call vsnprintf() with the current va_list ap variable, where the ap would +then be ready to point at the string in question. + +For architectures that passed va_list by reference this was possible. For +architectures that passed it by copy it was not. A test_can_verify() +function was used to differentiate between the two, and if it wasn't +possible, it would disable it. + +Even for architectures where this was feasible, it was a stretch to rely +on such a method that is undocumented, and could cause issues later on +with new optimizations of the compiler. + +Instead, the first function test_event_printk() was updated to look at +"%s" as well. If the "%s" argument is a pointer outside the event in the +ring buffer, it would find the field type of the event that is the problem +and mark the structure with a new flag called "needs_test". The event +itself will be marked by TRACE_EVENT_FL_TEST_STR to let it be known that +this event has a field that needs to be verified before the event can be +printed using the printf format.
+ +When the event fields are created from the field type structure, the +fields would copy the field type's "needs_test" value. + +Finally, before being printed, a new function ignore_event() is called +which will check if the event has the TEST_STR flag set (if not, it +returns false). If the flag is set, it then iterates through the events +fields looking for the ones that have the "needs_test" flag set. + +Then it uses the offset field from the field structure to find the pointer +in the ring buffer event. It runs the tests to make sure that pointer is +safe to print and if not, it triggers the WARN_ON() and also adds to the +trace output that the event in question has an unsafe memory access. + +The ignore_event() makes the trace_check_vprintf() obsolete so it is +removed. + +Link: https://lore.kernel.org/all/CAHk-=wh3uOnqnZPpR0PeLZZtyWbZLboZ7cHLCKRWsocvs9Y7hQ@mail.gmail.com/ + +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Mark Rutland +Cc: Mathieu Desnoyers +Cc: Andrew Morton +Cc: Al Viro +Cc: Linus Torvalds +Link: https://lore.kernel.org/20241217024720.848621576@goodmis.org +Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/trace_events.h | 6 - + kernel/trace/trace.c | 255 ++++++++----------------------------------- + kernel/trace/trace.h | 6 - + kernel/trace/trace_events.c | 32 +++-- + kernel/trace/trace_output.c | 6 - + 5 files changed, 88 insertions(+), 217 deletions(-) + +--- a/include/linux/trace_events.h ++++ b/include/linux/trace_events.h +@@ -285,7 +285,8 @@ struct trace_event_fields { + const char *name; + const int size; + const int align; +- const int is_signed; ++ const unsigned int is_signed:1; ++ unsigned int needs_test:1; + const int filter_type; + const int len; + }; +@@ -337,6 +338,7 @@ enum { + TRACE_EVENT_FL_EPROBE_BIT, + TRACE_EVENT_FL_FPROBE_BIT, + TRACE_EVENT_FL_CUSTOM_BIT, ++ 
TRACE_EVENT_FL_TEST_STR_BIT, + }; + + /* +@@ -354,6 +356,7 @@ enum { + * CUSTOM - Event is a custom event (to be attached to an exsiting tracepoint) + * This is set when the custom event has not been attached + * to a tracepoint yet, then it is cleared when it is. ++ * TEST_STR - The event has a "%s" that points to a string outside the event + */ + enum { + TRACE_EVENT_FL_FILTERED = (1 << TRACE_EVENT_FL_FILTERED_BIT), +@@ -367,6 +370,7 @@ enum { + TRACE_EVENT_FL_EPROBE = (1 << TRACE_EVENT_FL_EPROBE_BIT), + TRACE_EVENT_FL_FPROBE = (1 << TRACE_EVENT_FL_FPROBE_BIT), + TRACE_EVENT_FL_CUSTOM = (1 << TRACE_EVENT_FL_CUSTOM_BIT), ++ TRACE_EVENT_FL_TEST_STR = (1 << TRACE_EVENT_FL_TEST_STR_BIT), + }; + + #define TRACE_EVENT_FL_UKPROBE (TRACE_EVENT_FL_KPROBE | TRACE_EVENT_FL_UPROBE) +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -3635,17 +3635,12 @@ char *trace_iter_expand_format(struct tr + } + + /* Returns true if the string is safe to dereference from an event */ +-static bool trace_safe_str(struct trace_iterator *iter, const char *str, +- bool star, int len) ++static bool trace_safe_str(struct trace_iterator *iter, const char *str) + { + unsigned long addr = (unsigned long)str; + struct trace_event *trace_event; + struct trace_event_call *event; + +- /* Ignore strings with no length */ +- if (star && !len) +- return true; +- + /* OK if part of the event data */ + if ((addr >= (unsigned long)iter->ent) && + (addr < (unsigned long)iter->ent + iter->ent_size)) +@@ -3685,181 +3680,69 @@ static bool trace_safe_str(struct trace_ + return false; + } + +-static DEFINE_STATIC_KEY_FALSE(trace_no_verify); +- +-static int test_can_verify_check(const char *fmt, ...) +-{ +- char buf[16]; +- va_list ap; +- int ret; +- +- /* +- * The verifier is dependent on vsnprintf() modifies the va_list +- * passed to it, where it is sent as a reference. 
Some architectures +- * (like x86_32) passes it by value, which means that vsnprintf() +- * does not modify the va_list passed to it, and the verifier +- * would then need to be able to understand all the values that +- * vsnprintf can use. If it is passed by value, then the verifier +- * is disabled. +- */ +- va_start(ap, fmt); +- vsnprintf(buf, 16, "%d", ap); +- ret = va_arg(ap, int); +- va_end(ap); +- +- return ret; +-} +- +-static void test_can_verify(void) +-{ +- if (!test_can_verify_check("%d %d", 0, 1)) { +- pr_info("trace event string verifier disabled\n"); +- static_branch_inc(&trace_no_verify); +- } +-} +- + /** +- * trace_check_vprintf - Check dereferenced strings while writing to the seq buffer ++ * ignore_event - Check dereferenced fields while writing to the seq buffer + * @iter: The iterator that holds the seq buffer and the event being printed +- * @fmt: The format used to print the event +- * @ap: The va_list holding the data to print from @fmt. + * +- * This writes the data into the @iter->seq buffer using the data from +- * @fmt and @ap. If the format has a %s, then the source of the string +- * is examined to make sure it is safe to print, otherwise it will +- * warn and print "[UNSAFE MEMORY]" in place of the dereferenced string +- * pointer. ++ * At boot up, test_event_printk() will flag any event that dereferences ++ * a string with "%s" that does exist in the ring buffer. It may still ++ * be valid, as the string may point to a static string in the kernel ++ * rodata that never gets freed. But if the string pointer is pointing ++ * to something that was allocated, there's a chance that it can be freed ++ * by the time the user reads the trace. This would cause a bad memory ++ * access by the kernel and possibly crash the system. ++ * ++ * This function will check if the event has any fields flagged as needing ++ * to be checked at runtime and perform those checks. 
++ * ++ * If it is found that a field is unsafe, it will write into the @iter->seq ++ * a message stating what was found to be unsafe. ++ * ++ * @return: true if the event is unsafe and should be ignored, ++ * false otherwise. + */ +-void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, +- va_list ap) ++bool ignore_event(struct trace_iterator *iter) + { +- long text_delta = 0; +- long data_delta = 0; +- const char *p = fmt; +- const char *str; +- bool good; +- int i, j; ++ struct ftrace_event_field *field; ++ struct trace_event *trace_event; ++ struct trace_event_call *event; ++ struct list_head *head; ++ struct trace_seq *seq; ++ const void *ptr; + +- if (WARN_ON_ONCE(!fmt)) +- return; ++ trace_event = ftrace_find_event(iter->ent->type); + +- if (static_branch_unlikely(&trace_no_verify)) +- goto print; ++ seq = &iter->seq; + +- /* +- * When the kernel is booted with the tp_printk command line +- * parameter, trace events go directly through to printk(). +- * It also is checked by this function, but it does not +- * have an associated trace_array (tr) for it. +- */ +- if (iter->tr) { +- text_delta = iter->tr->text_delta; +- data_delta = iter->tr->data_delta; ++ if (!trace_event) { ++ trace_seq_printf(seq, "EVENT ID %d NOT FOUND?\n", iter->ent->type); ++ return true; + } + +- /* Don't bother checking when doing a ftrace_dump() */ +- if (iter->fmt == static_fmt_buf) +- goto print; +- +- while (*p) { +- bool star = false; +- int len = 0; +- +- j = 0; +- +- /* +- * We only care about %s and variants +- * as well as %p[sS] if delta is non-zero +- */ +- for (i = 0; p[i]; i++) { +- if (i + 1 >= iter->fmt_size) { +- /* +- * If we can't expand the copy buffer, +- * just print it. 
+- */ +- if (!trace_iter_expand_format(iter)) +- goto print; +- } +- +- if (p[i] == '\\' && p[i+1]) { +- i++; +- continue; +- } +- if (p[i] == '%') { +- /* Need to test cases like %08.*s */ +- for (j = 1; p[i+j]; j++) { +- if (isdigit(p[i+j]) || +- p[i+j] == '.') +- continue; +- if (p[i+j] == '*') { +- star = true; +- continue; +- } +- break; +- } +- if (p[i+j] == 's') +- break; ++ event = container_of(trace_event, struct trace_event_call, event); ++ if (!(event->flags & TRACE_EVENT_FL_TEST_STR)) ++ return false; + +- if (text_delta && p[i+1] == 'p' && +- ((p[i+2] == 's' || p[i+2] == 'S'))) +- break; ++ head = trace_get_fields(event); ++ if (!head) { ++ trace_seq_printf(seq, "FIELDS FOR EVENT '%s' NOT FOUND?\n", ++ trace_event_name(event)); ++ return true; ++ } + +- star = false; +- } +- j = 0; +- } +- /* If no %s found then just print normally */ +- if (!p[i]) +- break; ++ /* Offsets are from the iter->ent that points to the raw event */ ++ ptr = iter->ent; + +- /* Copy up to the %s, and print that */ +- strncpy(iter->fmt, p, i); +- iter->fmt[i] = '\0'; +- trace_seq_vprintf(&iter->seq, iter->fmt, ap); +- +- /* Add delta to %pS pointers */ +- if (p[i+1] == 'p') { +- unsigned long addr; +- char fmt[4]; +- +- fmt[0] = '%'; +- fmt[1] = 'p'; +- fmt[2] = p[i+2]; /* Either %ps or %pS */ +- fmt[3] = '\0'; +- +- addr = va_arg(ap, unsigned long); +- addr += text_delta; +- trace_seq_printf(&iter->seq, fmt, (void *)addr); ++ list_for_each_entry(field, head, link) { ++ const char *str; ++ bool good; + +- p += i + 3; ++ if (!field->needs_test) + continue; +- } + +- /* +- * If iter->seq is full, the above call no longer guarantees +- * that ap is in sync with fmt processing, and further calls +- * to va_arg() can return wrong positional arguments. +- * +- * Ensure that ap is no longer used in this case. 
+- */ +- if (iter->seq.full) { +- p = ""; +- break; +- } +- +- if (star) +- len = va_arg(ap, int); +- +- /* The ap now points to the string data of the %s */ +- str = va_arg(ap, const char *); +- +- good = trace_safe_str(iter, str, star, len); ++ str = *(const char **)(ptr + field->offset); + +- /* Could be from the last boot */ +- if (data_delta && !good) { +- str += data_delta; +- good = trace_safe_str(iter, str, star, len); +- } ++ good = trace_safe_str(iter, str); + + /* + * If you hit this warning, it is likely that the +@@ -3870,44 +3753,14 @@ void trace_check_vprintf(struct trace_it + * instead. See samples/trace_events/trace-events-sample.h + * for reference. + */ +- if (WARN_ONCE(!good, "fmt: '%s' current_buffer: '%s'", +- fmt, seq_buf_str(&iter->seq.seq))) { +- int ret; +- +- /* Try to safely read the string */ +- if (star) { +- if (len + 1 > iter->fmt_size) +- len = iter->fmt_size - 1; +- if (len < 0) +- len = 0; +- ret = copy_from_kernel_nofault(iter->fmt, str, len); +- iter->fmt[len] = 0; +- star = false; +- } else { +- ret = strncpy_from_kernel_nofault(iter->fmt, str, +- iter->fmt_size); +- } +- if (ret < 0) +- trace_seq_printf(&iter->seq, "(0x%px)", str); +- else +- trace_seq_printf(&iter->seq, "(0x%px:%s)", +- str, iter->fmt); +- str = "[UNSAFE-MEMORY]"; +- strcpy(iter->fmt, "%s"); +- } else { +- strncpy(iter->fmt, p + i, j + 1); +- iter->fmt[j+1] = '\0'; ++ if (WARN_ONCE(!good, "event '%s' has unsafe pointer field '%s'", ++ trace_event_name(event), field->name)) { ++ trace_seq_printf(seq, "EVENT %s: HAS UNSAFE POINTER FIELD '%s'\n", ++ trace_event_name(event), field->name); ++ return true; + } +- if (star) +- trace_seq_printf(&iter->seq, iter->fmt, len, str); +- else +- trace_seq_printf(&iter->seq, iter->fmt, str); +- +- p += i + j + 1; + } +- print: +- if (*p) +- trace_seq_vprintf(&iter->seq, p, ap); ++ return false; + } + + const char *trace_event_format(struct trace_iterator *iter, const char *fmt) +@@ -10803,8 +10656,6 @@ __init static int 
tracer_alloc_buffers(v + + register_snapshot_cmd(); + +- test_can_verify(); +- + return 0; + + out_free_pipe_cpumask: +--- a/kernel/trace/trace.h ++++ b/kernel/trace/trace.h +@@ -664,9 +664,8 @@ void trace_buffer_unlock_commit_nostack( + + bool trace_is_tracepoint_string(const char *str); + const char *trace_event_format(struct trace_iterator *iter, const char *fmt); +-void trace_check_vprintf(struct trace_iterator *iter, const char *fmt, +- va_list ap) __printf(2, 0); + char *trace_iter_expand_format(struct trace_iterator *iter); ++bool ignore_event(struct trace_iterator *iter); + + int trace_empty(struct trace_iterator *iter); + +@@ -1402,7 +1401,8 @@ struct ftrace_event_field { + int filter_type; + int offset; + int size; +- int is_signed; ++ unsigned int is_signed:1; ++ unsigned int needs_test:1; + int len; + }; + +--- a/kernel/trace/trace_events.c ++++ b/kernel/trace/trace_events.c +@@ -82,7 +82,7 @@ static int system_refcount_dec(struct ev + } + + static struct ftrace_event_field * +-__find_event_field(struct list_head *head, char *name) ++__find_event_field(struct list_head *head, const char *name) + { + struct ftrace_event_field *field; + +@@ -114,7 +114,8 @@ trace_find_event_field(struct trace_even + + static int __trace_define_field(struct list_head *head, const char *type, + const char *name, int offset, int size, +- int is_signed, int filter_type, int len) ++ int is_signed, int filter_type, int len, ++ int need_test) + { + struct ftrace_event_field *field; + +@@ -133,6 +134,7 @@ static int __trace_define_field(struct l + field->offset = offset; + field->size = size; + field->is_signed = is_signed; ++ field->needs_test = need_test; + field->len = len; + + list_add(&field->link, head); +@@ -151,13 +153,13 @@ int trace_define_field(struct trace_even + + head = trace_get_fields(call); + return __trace_define_field(head, type, name, offset, size, +- is_signed, filter_type, 0); ++ is_signed, filter_type, 0, 0); + } + EXPORT_SYMBOL_GPL(trace_define_field); + + 
static int trace_define_field_ext(struct trace_event_call *call, const char *type, + const char *name, int offset, int size, int is_signed, +- int filter_type, int len) ++ int filter_type, int len, int need_test) + { + struct list_head *head; + +@@ -166,13 +168,13 @@ static int trace_define_field_ext(struct + + head = trace_get_fields(call); + return __trace_define_field(head, type, name, offset, size, +- is_signed, filter_type, len); ++ is_signed, filter_type, len, need_test); + } + + #define __generic_field(type, item, filter_type) \ + ret = __trace_define_field(&ftrace_generic_fields, #type, \ + #item, 0, 0, is_signed_type(type), \ +- filter_type, 0); \ ++ filter_type, 0, 0); \ + if (ret) \ + return ret; + +@@ -181,7 +183,8 @@ static int trace_define_field_ext(struct + "common_" #item, \ + offsetof(typeof(ent), item), \ + sizeof(ent.item), \ +- is_signed_type(type), FILTER_OTHER, 0); \ ++ is_signed_type(type), FILTER_OTHER, \ ++ 0, 0); \ + if (ret) \ + return ret; + +@@ -332,6 +335,7 @@ static bool process_pointer(const char * + /* Return true if the string is safe */ + static bool process_string(const char *fmt, int len, struct trace_event_call *call) + { ++ struct trace_event_fields *field; + const char *r, *e, *s; + + e = fmt + len; +@@ -372,8 +376,16 @@ static bool process_string(const char *f + if (process_pointer(fmt, len, call)) + return true; + +- /* Make sure the field is found, and consider it OK for now if it is */ +- return find_event_field(fmt, call) != NULL; ++ /* Make sure the field is found */ ++ field = find_event_field(fmt, call); ++ if (!field) ++ return false; ++ ++ /* Test this field's string before printing the event */ ++ call->flags |= TRACE_EVENT_FL_TEST_STR; ++ field->needs_test = 1; ++ ++ return true; + } + + /* +@@ -2586,7 +2598,7 @@ event_define_fields(struct trace_event_c + ret = trace_define_field_ext(call, field->type, field->name, + offset, field->size, + field->is_signed, field->filter_type, +- field->len); ++ field->len, 
field->needs_test); + if (WARN_ON_ONCE(ret)) { + pr_err("error code is %d\n", ret); + break; +--- a/kernel/trace/trace_output.c ++++ b/kernel/trace/trace_output.c +@@ -317,10 +317,14 @@ EXPORT_SYMBOL(trace_raw_output_prep); + + void trace_event_printf(struct trace_iterator *iter, const char *fmt, ...) + { ++ struct trace_seq *s = &iter->seq; + va_list ap; + ++ if (ignore_event(iter)) ++ return; ++ + va_start(ap, fmt); +- trace_check_vprintf(iter, trace_event_format(iter, fmt), ap); ++ trace_seq_vprintf(s, trace_event_format(iter, fmt), ap); + va_end(ap); + } + EXPORT_SYMBOL(trace_event_printf); diff --git a/queue-6.12/tracing-fix-test_event_printk-to-process-entire-print-argument.patch b/queue-6.12/tracing-fix-test_event_printk-to-process-entire-print-argument.patch new file mode 100644 index 00000000000..ebd2328bc08 --- /dev/null +++ b/queue-6.12/tracing-fix-test_event_printk-to-process-entire-print-argument.patch @@ -0,0 +1,184 @@ +From a6629626c584200daf495cc9a740048b455addcd Mon Sep 17 00:00:00 2001 +From: Steven Rostedt +Date: Mon, 16 Dec 2024 21:41:19 -0500 +Subject: tracing: Fix test_event_printk() to process entire print argument + +From: Steven Rostedt + +commit a6629626c584200daf495cc9a740048b455addcd upstream. + +The test_event_printk() analyzes print formats of trace events looking for +cases where it may dereference a pointer that is not in the ring buffer +which can possibly be a bug when the trace event is read from the ring +buffer and the content of that pointer no longer exists. + +The function needs to accurately go from one print format argument to the +next. It handles quotes and parenthesis that may be included in an +argument. When it finds the start of the next argument, it uses a simple +"c = strstr(fmt + i, ',')" to find the end of that argument! + +In order to include "%s" dereferencing, it needs to process the entire +content of the print format argument and not just the content of the first +',' it finds. 
As there may be content like: + + ({ const char *saved_ptr = trace_seq_buffer_ptr(p); static const char + *access_str[] = { "---", "--x", "w--", "w-x", "-u-", "-ux", "wu-", "wux" + }; union kvm_mmu_page_role role; role.word = REC->role; + trace_seq_printf(p, "sp gen %u gfn %llx l%u %u-byte q%u%s %s%s" " %snxe + %sad root %u %s%c", REC->mmu_valid_gen, REC->gfn, role.level, + role.has_4_byte_gpte ? 4 : 8, role.quadrant, role.direct ? " direct" : "", + access_str[role.access], role.invalid ? " invalid" : "", role.efer_nx ? "" + : "!", role.ad_disabled ? "!" : "", REC->root_count, REC->unsync ? + "unsync" : "sync", 0); saved_ptr; }) + +Which is an example of a full argument of an existing event. As the code +already handles finding the next print format argument, process the +argument at the end of it and not the start of it. This way it has both +the start of the argument as well as the end of it. + +Add a helper function "process_pointer()" that will do the processing during +the loop as well as at the end. It also makes the code cleaner and easier +to read. 
+ +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Mark Rutland +Cc: Mathieu Desnoyers +Cc: Andrew Morton +Cc: Al Viro +Cc: Linus Torvalds +Link: https://lore.kernel.org/20241217024720.362271189@goodmis.org +Fixes: 5013f454a352c ("tracing: Add check of trace event print fmts for dereferencing pointers") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace_events.c | 82 ++++++++++++++++++++++++++++---------------- + 1 file changed, 53 insertions(+), 29 deletions(-) + +--- a/kernel/trace/trace_events.c ++++ b/kernel/trace/trace_events.c +@@ -265,8 +265,7 @@ static bool test_field(const char *fmt, + len = p - fmt; + + for (; field->type; field++) { +- if (strncmp(field->name, fmt, len) || +- field->name[len]) ++ if (strncmp(field->name, fmt, len) || field->name[len]) + continue; + array_descriptor = strchr(field->type, '['); + /* This is an array and is OK to dereference. */ +@@ -275,6 +274,32 @@ static bool test_field(const char *fmt, + return false; + } + ++/* Return true if the argument pointer is safe */ ++static bool process_pointer(const char *fmt, int len, struct trace_event_call *call) ++{ ++ const char *r, *e, *a; ++ ++ e = fmt + len; ++ ++ /* Find the REC-> in the argument */ ++ r = strstr(fmt, "REC->"); ++ if (r && r < e) { ++ /* ++ * Addresses of events on the buffer, or an array on the buffer is ++ * OK to dereference. There's ways to fool this, but ++ * this is to catch common mistakes, not malicious code. 
++ */ ++ a = strchr(fmt, '&'); ++ if ((a && (a < r)) || test_field(r, call)) ++ return true; ++ } else if ((r = strstr(fmt, "__get_dynamic_array(")) && r < e) { ++ return true; ++ } else if ((r = strstr(fmt, "__get_sockaddr(")) && r < e) { ++ return true; ++ } ++ return false; ++} ++ + /* + * Examine the print fmt of the event looking for unsafe dereference + * pointers using %p* that could be recorded in the trace event and +@@ -285,12 +310,12 @@ static void test_event_printk(struct tra + { + u64 dereference_flags = 0; + bool first = true; +- const char *fmt, *c, *r, *a; ++ const char *fmt; + int parens = 0; + char in_quote = 0; + int start_arg = 0; + int arg = 0; +- int i; ++ int i, e; + + fmt = call->print_fmt; + +@@ -403,42 +428,41 @@ static void test_event_printk(struct tra + case ',': + if (in_quote || parens) + continue; ++ e = i; + i++; + while (isspace(fmt[i])) + i++; +- start_arg = i; +- if (!(dereference_flags & (1ULL << arg))) +- goto next_arg; + +- /* Find the REC-> in the argument */ +- c = strchr(fmt + i, ','); +- r = strstr(fmt + i, "REC->"); +- if (r && (!c || r < c)) { +- /* +- * Addresses of events on the buffer, +- * or an array on the buffer is +- * OK to dereference. +- * There's ways to fool this, but +- * this is to catch common mistakes, +- * not malicious code. +- */ +- a = strchr(fmt + i, '&'); +- if ((a && (a < r)) || test_field(r, call)) ++ /* ++ * If start_arg is zero, then this is the start of the ++ * first argument. The processing of the argument happens ++ * when the end of the argument is found, as it needs to ++ * handle paranthesis and such. 
++ */ ++ if (!start_arg) { ++ start_arg = i; ++ /* Balance out the i++ in the for loop */ ++ i--; ++ continue; ++ } ++ ++ if (dereference_flags & (1ULL << arg)) { ++ if (process_pointer(fmt + start_arg, e - start_arg, call)) + dereference_flags &= ~(1ULL << arg); +- } else if ((r = strstr(fmt + i, "__get_dynamic_array(")) && +- (!c || r < c)) { +- dereference_flags &= ~(1ULL << arg); +- } else if ((r = strstr(fmt + i, "__get_sockaddr(")) && +- (!c || r < c)) { +- dereference_flags &= ~(1ULL << arg); + } + +- next_arg: +- i--; ++ start_arg = i; + arg++; ++ /* Balance out the i++ in the for loop */ ++ i--; + } + } + ++ if (dereference_flags & (1ULL << arg)) { ++ if (process_pointer(fmt + start_arg, i - start_arg, call)) ++ dereference_flags &= ~(1ULL << arg); ++ } ++ + /* + * If you triggered the below warning, the trace event reported + * uses an unsafe dereference pointer %p*. As the data stored diff --git a/queue-6.12/x86-hyperv-fix-hv-tsc-page-based-sched_clock-for-hibernation.patch b/queue-6.12/x86-hyperv-fix-hv-tsc-page-based-sched_clock-for-hibernation.patch new file mode 100644 index 00000000000..501a4f0c293 --- /dev/null +++ b/queue-6.12/x86-hyperv-fix-hv-tsc-page-based-sched_clock-for-hibernation.patch @@ -0,0 +1,168 @@ +From bcc80dec91ee745b3d66f3e48f0ec2efdea97149 Mon Sep 17 00:00:00 2001 +From: Naman Jain +Date: Tue, 17 Sep 2024 11:09:17 +0530 +Subject: x86/hyperv: Fix hv tsc page based sched_clock for hibernation + +From: Naman Jain + +commit bcc80dec91ee745b3d66f3e48f0ec2efdea97149 upstream. 
+ +read_hv_sched_clock_tsc() assumes that the Hyper-V clock counter is +bigger than the variable hv_sched_clock_offset, which is cached during +early boot, but depending on the timing this assumption may be false +when a hibernated VM starts again (the clock counter starts from 0 +again) and is resuming back (Note: hv_init_tsc_clocksource() is not +called during hibernation/resume); consequently, +read_hv_sched_clock_tsc() may return a negative integer (which is +interpreted as a huge positive integer since the return type is u64) +and new kernel messages are prefixed with huge timestamps before +read_hv_sched_clock_tsc() grows big enough (which typically takes +several seconds). + +Fix the issue by saving the Hyper-V clock counter just before the +suspend, and using it to correct the hv_sched_clock_offset in +resume. This makes hv tsc page based sched_clock continuous and ensures +that post resume, it starts from where it left off during suspend. +Override x86_platform.save_sched_clock_state and +x86_platform.restore_sched_clock_state routines to correct this as soon +as possible. + +Note: if Invariant TSC is available, the issue doesn't happen because +1) we don't register read_hv_sched_clock_tsc() for sched clock: +See commit e5313f1c5404 ("clocksource/drivers/hyper-v: Rework +clocksource and sched clock setup"); +2) the common x86 code adjusts TSC similarly: see +__restore_processor_state() -> tsc_verify_tsc_adjust(true) and +x86_platform.restore_sched_clock_state(). 
+ +Cc: stable@vger.kernel.org +Fixes: 1349401ff1aa ("clocksource/drivers/hyper-v: Suspend/resume Hyper-V clocksource for hibernation") +Co-developed-by: Dexuan Cui +Signed-off-by: Dexuan Cui +Signed-off-by: Naman Jain +Reviewed-by: Michael Kelley +Link: https://lore.kernel.org/r/20240917053917.76787-1-namjain@linux.microsoft.com +Signed-off-by: Wei Liu +Message-ID: <20240917053917.76787-1-namjain@linux.microsoft.com> +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/kernel/cpu/mshyperv.c | 58 +++++++++++++++++++++++++++++++++++++ + drivers/clocksource/hyperv_timer.c | 14 ++++++++ + include/clocksource/hyperv_timer.h | 2 + + 3 files changed, 73 insertions(+), 1 deletion(-) + +--- a/arch/x86/kernel/cpu/mshyperv.c ++++ b/arch/x86/kernel/cpu/mshyperv.c +@@ -223,6 +223,63 @@ static void hv_machine_crash_shutdown(st + hyperv_cleanup(); + } + #endif /* CONFIG_CRASH_DUMP */ ++ ++static u64 hv_ref_counter_at_suspend; ++static void (*old_save_sched_clock_state)(void); ++static void (*old_restore_sched_clock_state)(void); ++ ++/* ++ * Hyper-V clock counter resets during hibernation. Save and restore clock ++ * offset during suspend/resume, while also considering the time passed ++ * before suspend. This is to make sure that sched_clock using hv tsc page ++ * based clocksource, proceeds from where it left off during suspend and ++ * it shows correct time for the timestamps of kernel messages after resume. ++ */ ++static void save_hv_clock_tsc_state(void) ++{ ++ hv_ref_counter_at_suspend = hv_read_reference_counter(); ++} ++ ++static void restore_hv_clock_tsc_state(void) ++{ ++ /* ++ * Adjust the offsets used by hv tsc clocksource to ++ * account for the time spent before hibernation. ++ * adjusted value = reference counter (time) at suspend ++ * - reference counter (time) now. 
++ */ ++ hv_adj_sched_clock_offset(hv_ref_counter_at_suspend - hv_read_reference_counter()); ++} ++ ++/* ++ * Functions to override save_sched_clock_state and restore_sched_clock_state ++ * functions of x86_platform. The Hyper-V clock counter is reset during ++ * suspend-resume and the offset used to measure time needs to be ++ * corrected, post resume. ++ */ ++static void hv_save_sched_clock_state(void) ++{ ++ old_save_sched_clock_state(); ++ save_hv_clock_tsc_state(); ++} ++ ++static void hv_restore_sched_clock_state(void) ++{ ++ restore_hv_clock_tsc_state(); ++ old_restore_sched_clock_state(); ++} ++ ++static void __init x86_setup_ops_for_tsc_pg_clock(void) ++{ ++ if (!(ms_hyperv.features & HV_MSR_REFERENCE_TSC_AVAILABLE)) ++ return; ++ ++ old_save_sched_clock_state = x86_platform.save_sched_clock_state; ++ x86_platform.save_sched_clock_state = hv_save_sched_clock_state; ++ ++ old_restore_sched_clock_state = x86_platform.restore_sched_clock_state; ++ x86_platform.restore_sched_clock_state = hv_restore_sched_clock_state; ++} + #endif /* CONFIG_HYPERV */ + + static uint32_t __init ms_hyperv_platform(void) +@@ -579,6 +636,7 @@ static void __init ms_hyperv_init_platfo + + /* Register Hyper-V specific clocksource */ + hv_init_clocksource(); ++ x86_setup_ops_for_tsc_pg_clock(); + hv_vtl_init_platform(); + #endif + /* +--- a/drivers/clocksource/hyperv_timer.c ++++ b/drivers/clocksource/hyperv_timer.c +@@ -27,7 +27,8 @@ + #include + + static struct clock_event_device __percpu *hv_clock_event; +-static u64 hv_sched_clock_offset __ro_after_init; ++/* Note: offset can hold negative values after hibernation. */ ++static u64 hv_sched_clock_offset __read_mostly; + + /* + * If false, we're using the old mechanism for stimer0 interrupts +@@ -470,6 +471,17 @@ static void resume_hv_clock_tsc(struct c + hv_set_msr(HV_MSR_REFERENCE_TSC, tsc_msr.as_uint64); + } + ++/* ++ * Called during resume from hibernation, from overridden ++ * x86_platform.restore_sched_clock_state routine. 
This is to adjust offsets ++ * used to calculate time for hv tsc page based sched_clock, to account for ++ * time spent before hibernation. ++ */ ++void hv_adj_sched_clock_offset(u64 offset) ++{ ++ hv_sched_clock_offset -= offset; ++} ++ + #ifdef HAVE_VDSO_CLOCKMODE_HVCLOCK + static int hv_cs_enable(struct clocksource *cs) + { +--- a/include/clocksource/hyperv_timer.h ++++ b/include/clocksource/hyperv_timer.h +@@ -38,6 +38,8 @@ extern void hv_remap_tsc_clocksource(voi + extern unsigned long hv_get_tsc_pfn(void); + extern struct ms_hyperv_tsc_page *hv_get_tsc_page(void); + ++extern void hv_adj_sched_clock_offset(u64 offset); ++ + static __always_inline bool + hv_read_tsc_page_tsc(const struct ms_hyperv_tsc_page *tsc_pg, + u64 *cur_tsc, u64 *time)