From: Greg Kroah-Hartman
Date: Tue, 22 Apr 2025 08:25:08 +0000 (+0200)
Subject: 6.6-stable patches
X-Git-Tag: v6.1.135~79
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=1601b2d2f689665008023e4ab4440e3b5449ceda;p=thirdparty%2Fkernel%2Fstable-queue.git

6.6-stable patches

added patches:
        nvme-rdma-unquiesce-admin_q-before-destroy-it.patch
        powerpc-rtas-prevent-spectre-v1-gadget-construction-in-sys_rtas.patch
        x86-split_lock-fix-the-delayed-detection-logic.patch
        x86-tdx-fix-arch_safe_halt-execution-for-tdx-vms.patch
---

diff --git a/queue-6.6/nvme-rdma-unquiesce-admin_q-before-destroy-it.patch b/queue-6.6/nvme-rdma-unquiesce-admin_q-before-destroy-it.patch
new file mode 100644
index 0000000000..e286abc69c
--- /dev/null
+++ b/queue-6.6/nvme-rdma-unquiesce-admin_q-before-destroy-it.patch
@@ -0,0 +1,69 @@
+From 5858b687559809f05393af745cbadf06dee61295 Mon Sep 17 00:00:00 2001
+From: "Chunguang.xu"
+Date: Tue, 3 Dec 2024 11:34:41 +0800
+Subject: nvme-rdma: unquiesce admin_q before destroy it
+
+From: Chunguang.xu
+
+commit 5858b687559809f05393af745cbadf06dee61295 upstream.
+
+The kernel will hang on destroying admin_q when creating the ctrl
+fails, with a calltrace such as the following:
+
+PID: 23644  TASK: ff2d52b40f439fc0  CPU: 2  COMMAND: "nvme"
+ #0 [ff61d23de260fb78] __schedule at ffffffff8323bc15
+ #1 [ff61d23de260fc08] schedule at ffffffff8323c014
+ #2 [ff61d23de260fc28] blk_mq_freeze_queue_wait at ffffffff82a3dba1
+ #3 [ff61d23de260fc78] blk_freeze_queue at ffffffff82a4113a
+ #4 [ff61d23de260fc90] blk_cleanup_queue at ffffffff82a33006
+ #5 [ff61d23de260fcb0] nvme_rdma_destroy_admin_queue at ffffffffc12686ce
+ #6 [ff61d23de260fcc8] nvme_rdma_setup_ctrl at ffffffffc1268ced
+ #7 [ff61d23de260fd28] nvme_rdma_create_ctrl at ffffffffc126919b
+ #8 [ff61d23de260fd68] nvmf_dev_write at ffffffffc024f362
+ #9 [ff61d23de260fe38] vfs_write at ffffffff827d5f25
+    RIP: 00007fda7891d574  RSP: 00007ffe2ef06958  RFLAGS: 00000202
+    RAX: ffffffffffffffda  RBX: 000055e8122a4d90  RCX: 00007fda7891d574
+    RDX: 000000000000012b  RSI: 000055e8122a4d90  RDI: 0000000000000004
+    RBP: 00007ffe2ef079c0   R8: 000000000000012b   R9: 000055e8122a4d90
+    R10: 0000000000000000  R11: 0000000000000202  R12: 0000000000000004
+    R13: 000055e8122923c0  R14: 000000000000012b  R15: 00007fda78a54500
+    ORIG_RAX: 0000000000000001  CS: 0033  SS: 002b
+
+This is because we quiesced admin_q before cancelling requests but
+forgot to unquiesce it before destroying it; as a result we fail to
+drain the pending requests and hang in blk_mq_freeze_queue_wait()
+forever. Reuse nvme_rdma_teardown_admin_queue() here to fix this
+issue and simplify the code.
+
+Fixes: 958dc1d32c80 ("nvme-rdma: add clean action for failed reconnection")
+Reported-by: Yingfu.zhou
+Signed-off-by: Chunguang.xu
+Signed-off-by: Yue.zhao
+Reviewed-by: Christoph Hellwig
+Reviewed-by: Hannes Reinecke
+Signed-off-by: Keith Busch
+[Minor context change fixed]
+Signed-off-by: Feng Liu
+Signed-off-by: He Zhe
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/nvme/host/rdma.c |    8 +-------
+ 1 file changed, 1 insertion(+), 7 deletions(-)
+
+--- a/drivers/nvme/host/rdma.c
++++ b/drivers/nvme/host/rdma.c
+@@ -1083,13 +1083,7 @@ destroy_io:
+                 nvme_rdma_free_io_queues(ctrl);
+         }
+ destroy_admin:
+-        nvme_quiesce_admin_queue(&ctrl->ctrl);
+-        blk_sync_queue(ctrl->ctrl.admin_q);
+-        nvme_rdma_stop_queue(&ctrl->queues[0]);
+-        nvme_cancel_admin_tagset(&ctrl->ctrl);
+-        if (new)
+-                nvme_remove_admin_tag_set(&ctrl->ctrl);
+-        nvme_rdma_destroy_admin_queue(ctrl);
++        nvme_rdma_teardown_admin_queue(ctrl, new);
+         return ret;
+ }
+
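The patch above collapses the failure path into a single call to nvme_rdma_teardown_admin_queue(). A minimal sketch of the ordering rule that call enforces, for illustration only (the _example helper name is hypothetical, not the nvme-rdma source):

/*
 * Illustrative only: a quiesced queue must be unquiesced again before it
 * is torn down, otherwise blk_mq_freeze_queue_wait() blocks forever on
 * requests that can no longer be dispatched or drained.
 */
static void teardown_admin_queue_example(struct nvme_ctrl *ctrl)
{
        nvme_quiesce_admin_queue(ctrl);         /* stop new dispatch */
        /* ... stop the RDMA queue, cancel outstanding admin requests ... */
        nvme_unquiesce_admin_queue(ctrl);       /* the step the old error path missed */
        /* only now destroy the queue: the freeze/drain can make progress */
}
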
diff --git a/queue-6.6/powerpc-rtas-prevent-spectre-v1-gadget-construction-in-sys_rtas.patch b/queue-6.6/powerpc-rtas-prevent-spectre-v1-gadget-construction-in-sys_rtas.patch
new file mode 100644
index 0000000000..aa7138410a
--- /dev/null
+++ b/queue-6.6/powerpc-rtas-prevent-spectre-v1-gadget-construction-in-sys_rtas.patch
@@ -0,0 +1,54 @@
+From 0974d03eb479384466d828d65637814bee6b26d7 Mon Sep 17 00:00:00 2001
+From: Nathan Lynch
+Date: Thu, 30 May 2024 19:44:12 -0500
+Subject: powerpc/rtas: Prevent Spectre v1 gadget construction in sys_rtas()
+
+From: Nathan Lynch
+
+commit 0974d03eb479384466d828d65637814bee6b26d7 upstream.
+
+Smatch warns:
+
+  arch/powerpc/kernel/rtas.c:1932 __do_sys_rtas() warn: potential
+  spectre issue 'args.args' [r] (local cap)
+
+The 'nargs' and 'nret' locals come directly from a user-supplied
+buffer and are used as indexes into a small stack-based array and as
+inputs to copy_to_user() after they are subject to bounds checks.
+
+Use array_index_nospec() after the bounds checks to clamp these values
+for speculative execution.
+
+Signed-off-by: Nathan Lynch
+Reported-by: Breno Leitao
+Reviewed-by: Breno Leitao
+Signed-off-by: Michael Ellerman
+Link: https://msgid.link/20240530-sys_rtas-nargs-nret-v1-1-129acddd4d89@linux.ibm.com
+[Minor context change fixed]
+Signed-off-by: Cliff Liu
+Signed-off-by: He Zhe
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/powerpc/kernel/rtas.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/arch/powerpc/kernel/rtas.c
++++ b/arch/powerpc/kernel/rtas.c
+@@ -18,6 +18,7 @@
+ #include
+ #include
+ #include
++#include
+ #include
+ #include
+ #include
+@@ -1839,6 +1840,9 @@ SYSCALL_DEFINE1(rtas, struct rtas_args _
+             || nargs + nret > ARRAY_SIZE(args.args))
+                 return -EINVAL;
+
++        nargs = array_index_nospec(nargs, ARRAY_SIZE(args.args));
++        nret = array_index_nospec(nret, ARRAY_SIZE(args.args) - nargs);
++
+         /* Copy in args. */
+         if (copy_from_user(args.args, uargs->args,
+                            nargs * sizeof(rtas_arg_t)) != 0)
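The powerpc/rtas patch above uses the standard Spectre v1 hardening idiom. A generic, self-contained sketch of that idiom (not the sys_rtas() code itself; example_read() is a made-up name):

#include <linux/nospec.h>

/*
 * A user-controlled index is first bounds-checked architecturally, then
 * clamped with array_index_nospec() so that a speculatively out-of-bounds
 * value cannot be used to form a load address.
 */
static int example_read(unsigned int idx, const int *table, size_t nelems)
{
        if (idx >= nelems)
                return -EINVAL;

        idx = array_index_nospec(idx, nelems);  /* clamp for speculation */

        return table[idx];
}
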
diff --git a/queue-6.6/series b/queue-6.6/series
index 6de89d1c67..30142da25d 100644
--- a/queue-6.6/series
+++ b/queue-6.6/series
@@ -369,3 +369,7 @@ fix-mmu-notifiers-for-range-based-invalidates.patch
 efi-libstub-bump-up-efi_mmap_nr_slack_slots-to-32.patch
 x86-xen-move-xen_reserve_extra_memory.patch
 x86-xen-fix-memblock_reserve-usage-on-pvh.patch
+x86-tdx-fix-arch_safe_halt-execution-for-tdx-vms.patch
+x86-split_lock-fix-the-delayed-detection-logic.patch
+nvme-rdma-unquiesce-admin_q-before-destroy-it.patch
+powerpc-rtas-prevent-spectre-v1-gadget-construction-in-sys_rtas.patch
diff --git a/queue-6.6/x86-split_lock-fix-the-delayed-detection-logic.patch b/queue-6.6/x86-split_lock-fix-the-delayed-detection-logic.patch
new file mode 100644
index 0000000000..382c12b87f
--- /dev/null
+++ b/queue-6.6/x86-split_lock-fix-the-delayed-detection-logic.patch
@@ -0,0 +1,156 @@
+From c929d08df8bee855528b9d15b853c892c54e1eee Mon Sep 17 00:00:00 2001
+From: Maksim Davydov
+Date: Wed, 15 Jan 2025 16:17:04 +0300
+Subject: x86/split_lock: Fix the delayed detection logic
+
+From: Maksim Davydov
+
+commit c929d08df8bee855528b9d15b853c892c54e1eee upstream.
+
+If the warning mode with disabled mitigation mode is used, then on each
+CPU where the split lock occurred detection will be disabled in order to
+make progress and delayed work will be scheduled, which then will enable
+detection back.
+
+Now it turns out that all CPUs use one global delayed work structure.
+This leads to the fact that if a split lock occurs on several CPUs
+at the same time (within 2 jiffies), only one CPU will schedule delayed
+work, but the rest will not.
+
+The return value of schedule_delayed_work_on() would have shown this,
+but it is not checked in the code.
+
+A diagram that can help to understand the bug reproduction:
+
+ - sld_update_msr() enables/disables SLD on both CPUs on the same core
+
+ - schedule_delayed_work_on() internally checks WORK_STRUCT_PENDING_BIT.
+   If a work has the 'pending' status, then schedule_delayed_work_on()
+   will return an error code and, most importantly, the work will not
+   be placed in the workqueue.
+
+Let's say we have a multicore system on which split_lock_mitigate=0 and
+a multithreaded application is running that calls splitlock in multiple
+threads. Due to the fact that sld_update_msr() affects the entire core
+(both CPUs), we will consider 2 CPUs from different cores. Let the 2
+threads of this application schedule to CPU0 (core 0) and to CPU 2
+(core 1), then:
+
+|                                 ||                                   |
+|          CPU 0 (core 0)         ||          CPU 2 (core 1)           |
+|_________________________________||___________________________________|
+|                                 ||                                   |
+| 1) SPLIT LOCK occurred          ||                                   |
+|                                 ||                                   |
+| 2) split_lock_warn()            ||                                   |
+|                                 ||                                   |
+| 3) sysctl_sld_mitigate == 0     ||                                   |
+|    (work = &sl_reenable)        ||                                   |
+|                                 ||                                   |
+| 4) schedule_delayed_work_on()   ||                                   |
+|    (reenable will be called     ||                                   |
+|     after 2 jiffies on CPU 0)   ||                                   |
+|                                 ||                                   |
+| 5) disable SLD for core 0       ||                                   |
+|                                 ||                                   |
+|    -------------------------    ||                                   |
+|                                 ||                                   |
+|                                 || 6) SPLIT LOCK occurred            |
+|                                 ||                                   |
+|                                 || 7) split_lock_warn()              |
+|                                 ||                                   |
+|                                 || 8) sysctl_sld_mitigate == 0       |
+|                                 ||    (work = &sl_reenable,          |
+|                                 ||    the same address as in 3) )    |
+|                                 ||                                   |
+|            2 jiffies            || 9) schedule_delayed_work_on()     |
+|                                 ||    fails because the work is in   |
+|                                 ||    the pending state since 4).    |
+|                                 ||    The work wasn't placed to the  |
+|                                 ||    workqueue. reenable won't be   |
+|                                 ||    called on CPU 2                |
+|                                 ||                                   |
+|                                 || 10) disable SLD for core 1        |
+|                                 ||                                   |
+|                                 ||     From now on SLD will          |
+|                                 ||     never be reenabled on core 1  |
+|                                 ||                                   |
+|    -------------------------    ||                                   |
+|                                 ||                                   |
+| 11) enable SLD for core 0 by    ||                                   |
+|     __split_lock_reenable       ||                                   |
+|                                 ||                                   |
+
+If the application threads can be scheduled to all processor cores,
+then over time there will be only one core left, on which SLD will be
+enabled and split lock will be able to be detected; and on all other
+cores SLD will be disabled all the time.
+
+Most likely, this bug has not been noticed for so long because
+sysctl_sld_mitigate default value is 1, and in this case a semaphore
+is used that does not allow 2 different cores to have SLD disabled at
+the same time, that is, strictly only one work is placed in the
+workqueue.
+
+In order to fix the warning mode with disabled mitigation mode,
+delayed work has to be per-CPU. Implement it.
+
+Fixes: 727209376f49 ("x86/split_lock: Add sysctl to control the misery mode")
+Signed-off-by: Maksim Davydov
+Signed-off-by: Ingo Molnar
+Tested-by: Guilherme G. Piccoli
+Cc: Thomas Gleixner
+Cc: Ravi Bangoria
+Cc: Tom Lendacky
+Link: https://lore.kernel.org/r/20250115131704.132609-1-davydov-max@yandex-team.ru
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/kernel/cpu/intel.c |   20 ++++++++++++++++----
+ 1 file changed, 16 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kernel/cpu/intel.c
++++ b/arch/x86/kernel/cpu/intel.c
+@@ -1168,7 +1168,13 @@ static void __split_lock_reenable(struct
+ {
+         sld_update_msr(true);
+ }
+-static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable);
++/*
++ * In order for each CPU to schedule its delayed work independently of the
++ * others, delayed work struct must be per-CPU. This is not required when
++ * sysctl_sld_mitigate is enabled because of the semaphore that limits
++ * the number of simultaneously scheduled delayed works to 1.
++ */
++static DEFINE_PER_CPU(struct delayed_work, sl_reenable);
+
+ /*
+  * If a CPU goes offline with pending delayed work to re-enable split lock
+@@ -1189,7 +1195,7 @@ static int splitlock_cpu_offline(unsigne
+
+ static void split_lock_warn(unsigned long ip)
+ {
+-        struct delayed_work *work;
++        struct delayed_work *work = NULL;
+         int cpu;
+
+         if (!current->reported_split_lock)
+@@ -1211,11 +1217,17 @@ static void split_lock_warn(unsigned lon
+                 if (down_interruptible(&buslock_sem) == -EINTR)
+                         return;
+                 work = &sl_reenable_unlock;
+-        } else {
+-                work = &sl_reenable;
+         }
+
+         cpu = get_cpu();
++
++        if (!work) {
++                work = this_cpu_ptr(&sl_reenable);
++                /* Deferred initialization of per-CPU struct */
++                if (!work->work.func)
++                        INIT_DELAYED_WORK(work, __split_lock_reenable);
++        }
++
+         schedule_delayed_work_on(cpu, work, 2);
+
+         /* Disable split lock detection on this CPU to make progress */
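The split_lock patch above boils down to giving every CPU its own delayed work item. A rough, generic sketch of that pattern (illustrative only; the reenable_* names are hypothetical, the per-CPU and workqueue APIs are the real kernel ones):

/* One work item per CPU, so a pending item on CPU A never blocks CPU B. */
static DEFINE_PER_CPU(struct delayed_work, reenable_work);

static void reenable_fn(struct work_struct *work)
{
        /* re-enable the per-core feature here */
}

static void arm_reenable_on_this_cpu(void)
{
        struct delayed_work *work;
        int cpu = get_cpu();            /* pin to this CPU while scheduling */

        work = this_cpu_ptr(&reenable_work);
        if (!work->work.func)           /* first use on this CPU */
                INIT_DELAYED_WORK(work, reenable_fn);

        schedule_delayed_work_on(cpu, work, 2);
        put_cpu();
}
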
diff --git a/queue-6.6/x86-tdx-fix-arch_safe_halt-execution-for-tdx-vms.patch b/queue-6.6/x86-tdx-fix-arch_safe_halt-execution-for-tdx-vms.patch
new file mode 100644
index 0000000000..9322bcde10
--- /dev/null
+++ b/queue-6.6/x86-tdx-fix-arch_safe_halt-execution-for-tdx-vms.patch
@@ -0,0 +1,159 @@
+From 9f98a4f4e7216dbe366010b4cdcab6b220f229c4 Mon Sep 17 00:00:00 2001
+From: Vishal Annapurve
+Date: Fri, 28 Feb 2025 01:44:15 +0000
+Subject: x86/tdx: Fix arch_safe_halt() execution for TDX VMs
+
+From: Vishal Annapurve
+
+commit 9f98a4f4e7216dbe366010b4cdcab6b220f229c4 upstream.
+
+Direct HLT instruction execution causes #VEs for TDX VMs which are routed
+to hypervisor via TDCALL. If HLT is executed in STI-shadow, resulting #VE
+handler will enable interrupts before TDCALL is routed to hypervisor
+leading to missed wakeup events, as current TDX spec doesn't expose
+interruptibility state information to allow #VE handler to selectively
+enable interrupts.
+
+Commit bfe6ed0c6727 ("x86/tdx: Add HLT support for TDX guests")
+prevented the idle routines from executing HLT instruction in STI-shadow.
+But it missed the paravirt routine which can be reached via this path
+as an example:
+
+  kvm_wait() =>
+    safe_halt() =>
+      raw_safe_halt() =>
+        arch_safe_halt() =>
+          irq.safe_halt() =>
+            pv_native_safe_halt()
+
+To reliably handle arch_safe_halt() for TDX VMs, introduce explicit
+dependency on CONFIG_PARAVIRT and override paravirt halt()/safe_halt()
+routines with TDX-safe versions that execute direct TDCALL and needed
+interrupt flag updates. Executing direct TDCALL brings in additional
+benefit of avoiding HLT related #VEs altogether.
+
+As tested by Ryan Afranji:
+
+  "Tested with the specjbb2015 benchmark. It has heavy lock contention which leads
+   to many halt calls. TDX VMs suffered a poor score before this patchset.
+
+   Verified the major performance improvement with this patchset applied."
+
+Fixes: bfe6ed0c6727 ("x86/tdx: Add HLT support for TDX guests")
+Signed-off-by: Vishal Annapurve
+Signed-off-by: Ingo Molnar
+Reviewed-by: Kirill A. Shutemov
+Tested-by: Ryan Afranji
+Cc: Andy Lutomirski
+Cc: Brian Gerst
+Cc: Juergen Gross
+Cc: H. Peter Anvin
+Cc: Linus Torvalds
+Cc: Josh Poimboeuf
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250228014416.3925664-3-vannapurve@google.com
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/Kconfig           |    1 +
+ arch/x86/coco/tdx/tdx.c    |   26 +++++++++++++++++++++++++-
+ arch/x86/include/asm/tdx.h |    4 ++--
+ arch/x86/kernel/process.c  |    2 +-
+ 4 files changed, 29 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -881,6 +881,7 @@ config INTEL_TDX_GUEST
+         depends on X86_64 && CPU_SUP_INTEL
+         depends on X86_X2APIC
+         depends on EFI_STUB
++        depends on PARAVIRT
+         select ARCH_HAS_CC_PLATFORM
+         select X86_MEM_ENCRYPT
+         select X86_MCE
+--- a/arch/x86/coco/tdx/tdx.c
++++ b/arch/x86/coco/tdx/tdx.c
+@@ -13,6 +13,7 @@
+ #include
+ #include
+ #include
++#include
+ #include
+ #include
+
+@@ -334,7 +335,7 @@ static int handle_halt(struct ve_info *v
+         return ve_instr_len(ve);
+ }
+
+-void __cpuidle tdx_safe_halt(void)
++void __cpuidle tdx_halt(void)
+ {
+         const bool irq_disabled = false;
+
+@@ -345,6 +346,16 @@ void __cpuidle tdx_safe_halt(void)
+         WARN_ONCE(1, "HLT instruction emulation failed\n");
+ }
+
++static void __cpuidle tdx_safe_halt(void)
++{
++        tdx_halt();
++        /*
++         * "__cpuidle" section doesn't support instrumentation, so stick
++         * with raw_* variant that avoids tracing hooks.
++         */
++        raw_local_irq_enable();
++}
++
+ static int read_msr(struct pt_regs *regs, struct ve_info *ve)
+ {
+         struct tdx_hypercall_args args = {
+@@ -889,6 +900,19 @@ void __init tdx_early_init(void)
+         x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
+
+         /*
++         * Avoid "sti;hlt" execution in TDX guests as HLT induces a #VE that
++         * will enable interrupts before HLT TDCALL invocation if executed
++         * in STI-shadow, possibly resulting in missed wakeup events.
++         *
++         * Modify all possible HLT execution paths to use TDX specific routines
++         * that directly execute TDCALL and toggle the interrupt state as
++         * needed after TDCALL completion. This also reduces HLT related #VEs
++         * in addition to having a reliable halt logic execution.
++         */
++        pv_ops.irq.safe_halt = tdx_safe_halt;
++        pv_ops.irq.halt = tdx_halt;
++
++        /*
+          * TDX intercepts the RDMSR to read the X2APIC ID in the parallel
+          * bringup low level code. That raises #VE which cannot be handled
+          * there.
+--- a/arch/x86/include/asm/tdx.h
++++ b/arch/x86/include/asm/tdx.h
+@@ -46,7 +46,7 @@ void tdx_get_ve_info(struct ve_info *ve)
+
+ bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);
+
+-void tdx_safe_halt(void);
++void tdx_halt(void);
+
+ bool tdx_early_handle_ve(struct pt_regs *regs);
+
+@@ -55,7 +55,7 @@ int tdx_mcall_get_report0(u8 *reportdata
+ #else
+
+ static inline void tdx_early_init(void) { };
+-static inline void tdx_safe_halt(void) { };
++static inline void tdx_halt(void) { };
+
+ static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; }
+
+--- a/arch/x86/kernel/process.c
++++ b/arch/x86/kernel/process.c
+@@ -955,7 +955,7 @@ void select_idle_routine(const struct cp
+                 static_call_update(x86_idle, mwait_idle);
+         } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+                 pr_info("using TDX aware idle routine\n");
+-                static_call_update(x86_idle, tdx_safe_halt);
++                static_call_update(x86_idle, tdx_halt);
+         } else
+                 static_call_update(x86_idle, default_idle);
+ }
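For context on why overriding pv_ops works in the x86/tdx patch above: with CONFIG_PARAVIRT, arch_safe_halt() is an indirect call through an ops table, so repointing the halt hooks early in boot redirects every halt path away from the raw "sti;hlt" sequence. A deliberately simplified, hypothetical model of that indirection (not the kernel's pv_ops implementation):

/* Hypothetical, simplified model of the paravirt halt indirection. */
struct irq_ops_example {
        void (*halt)(void);             /* halt, leave IRQ state alone */
        void (*safe_halt)(void);        /* enable IRQs, then halt      */
};

static void native_safe_halt_example(void)
{
        /* native case: "sti; hlt" */
}

static void tdx_safe_halt_example(void)
{
        /* TDX case: TDCALL-based halt, then re-enable interrupts */
}

static struct irq_ops_example irq_ops = {
        .safe_halt = native_safe_halt_example,
};

static void arch_safe_halt_example(void)
{
        irq_ops.safe_halt();            /* indirect call through the table */
}

static void tdx_early_init_example(void)
{
        /* swap in the TDX-safe routine before anything can halt */
        irq_ops.safe_halt = tdx_safe_halt_example;
}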