From: Greg Kroah-Hartman
Date: Tue, 22 Apr 2025 08:25:08 +0000 (+0200)
Subject: 6.6-stable patches
X-Git-Tag: v6.1.135~79
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=1601b2d2f689665008023e4ab4440e3b5449ceda;p=thirdparty%2Fkernel%2Fstable-queue.git

6.6-stable patches

added patches:
        nvme-rdma-unquiesce-admin_q-before-destroy-it.patch
        powerpc-rtas-prevent-spectre-v1-gadget-construction-in-sys_rtas.patch
        x86-split_lock-fix-the-delayed-detection-logic.patch
        x86-tdx-fix-arch_safe_halt-execution-for-tdx-vms.patch
---

diff --git a/queue-6.6/nvme-rdma-unquiesce-admin_q-before-destroy-it.patch b/queue-6.6/nvme-rdma-unquiesce-admin_q-before-destroy-it.patch
new file mode 100644
index 0000000000..e286abc69c
--- /dev/null
+++ b/queue-6.6/nvme-rdma-unquiesce-admin_q-before-destroy-it.patch
@@ -0,0 +1,69 @@
+From 5858b687559809f05393af745cbadf06dee61295 Mon Sep 17 00:00:00 2001
+From: "Chunguang.xu"
+Date: Tue, 3 Dec 2024 11:34:41 +0800
+Subject: nvme-rdma: unquiesce admin_q before destroy it
+
+From: Chunguang.xu
+
+commit 5858b687559809f05393af745cbadf06dee61295 upstream.
+
+The kernel will hang on destroying admin_q when creating the ctrl
+fails, with a calltrace such as the following:
+
+PID: 23644  TASK: ff2d52b40f439fc0  CPU: 2  COMMAND: "nvme"
+ #0 [ff61d23de260fb78] __schedule at ffffffff8323bc15
+ #1 [ff61d23de260fc08] schedule at ffffffff8323c014
+ #2 [ff61d23de260fc28] blk_mq_freeze_queue_wait at ffffffff82a3dba1
+ #3 [ff61d23de260fc78] blk_freeze_queue at ffffffff82a4113a
+ #4 [ff61d23de260fc90] blk_cleanup_queue at ffffffff82a33006
+ #5 [ff61d23de260fcb0] nvme_rdma_destroy_admin_queue at ffffffffc12686ce
+ #6 [ff61d23de260fcc8] nvme_rdma_setup_ctrl at ffffffffc1268ced
+ #7 [ff61d23de260fd28] nvme_rdma_create_ctrl at ffffffffc126919b
+ #8 [ff61d23de260fd68] nvmf_dev_write at ffffffffc024f362
+ #9 [ff61d23de260fe38] vfs_write at ffffffff827d5f25
+    RIP: 00007fda7891d574  RSP: 00007ffe2ef06958  RFLAGS: 00000202
+    RAX: ffffffffffffffda  RBX: 000055e8122a4d90  RCX: 00007fda7891d574
+    RDX: 000000000000012b  RSI: 000055e8122a4d90  RDI: 0000000000000004
+    RBP: 00007ffe2ef079c0   R8: 000000000000012b   R9: 000055e8122a4d90
+    R10: 0000000000000000  R11: 0000000000000202  R12: 0000000000000004
+    R13: 000055e8122923c0  R14: 000000000000012b  R15: 00007fda78a54500
+    ORIG_RAX: 0000000000000001  CS: 0033  SS: 002b
+
+This is because we quiesced admin_q before cancelling requests but
+forgot to unquiesce it before destroying it; as a result we fail to
+drain the pending requests and hang in blk_mq_freeze_queue_wait()
+forever. Reuse nvme_rdma_teardown_admin_queue() here to fix this
+issue and simplify the code.
+
+Fixes: 958dc1d32c80 ("nvme-rdma: add clean action for failed reconnection")
+Reported-by: Yingfu.zhou
+Signed-off-by: Chunguang.xu
+Signed-off-by: Yue.zhao
+Reviewed-by: Christoph Hellwig
+Reviewed-by: Hannes Reinecke
+Signed-off-by: Keith Busch
+[Minor context change fixed]
+Signed-off-by: Feng Liu
+Signed-off-by: He Zhe
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/nvme/host/rdma.c |    8 +-------
+ 1 file changed, 1 insertion(+), 7 deletions(-)
+
+--- a/drivers/nvme/host/rdma.c
++++ b/drivers/nvme/host/rdma.c
+@@ -1083,13 +1083,7 @@ destroy_io:
+                 nvme_rdma_free_io_queues(ctrl);
+         }
+ destroy_admin:
+-        nvme_quiesce_admin_queue(&ctrl->ctrl);
+-        blk_sync_queue(ctrl->ctrl.admin_q);
+-        nvme_rdma_stop_queue(&ctrl->queues[0]);
+-        nvme_cancel_admin_tagset(&ctrl->ctrl);
+-        if (new)
+-                nvme_remove_admin_tag_set(&ctrl->ctrl);
+-        nvme_rdma_destroy_admin_queue(ctrl);
++        nvme_rdma_teardown_admin_queue(ctrl, new);
+         return ret;
+ }
+
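The patch above collapses the failure path into a single call to nvme_rdma_teardown_admin_queue(). A minimal sketch of the ordering rule that call enforces, for illustration only (the _example helper name is hypothetical, not the nvme-rdma source):

/*
 * Illustrative only: a quiesced queue must be unquiesced again before it
 * is torn down, otherwise blk_mq_freeze_queue_wait() blocks forever on
 * requests that can no longer be dispatched or drained.
 */
static void teardown_admin_queue_example(struct nvme_ctrl *ctrl)
{
        nvme_quiesce_admin_queue(ctrl);         /* stop new dispatch */
        /* ... stop the RDMA queue, cancel outstanding admin requests ... */
        nvme_unquiesce_admin_queue(ctrl);       /* the step the old error path missed */
        /* only now destroy the queue: the freeze/drain can make progress */
}
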
diff --git a/queue-6.6/powerpc-rtas-prevent-spectre-v1-gadget-construction-in-sys_rtas.patch b/queue-6.6/powerpc-rtas-prevent-spectre-v1-gadget-construction-in-sys_rtas.patch
new file mode 100644
index 0000000000..aa7138410a
--- /dev/null
+++ b/queue-6.6/powerpc-rtas-prevent-spectre-v1-gadget-construction-in-sys_rtas.patch
@@ -0,0 +1,54 @@
+From 0974d03eb479384466d828d65637814bee6b26d7 Mon Sep 17 00:00:00 2001
+From: Nathan Lynch
+Date: Thu, 30 May 2024 19:44:12 -0500
+Subject: powerpc/rtas: Prevent Spectre v1 gadget construction in sys_rtas()
+
+From: Nathan Lynch
+
+commit 0974d03eb479384466d828d65637814bee6b26d7 upstream.
+
+Smatch warns:
+
+  arch/powerpc/kernel/rtas.c:1932 __do_sys_rtas() warn: potential
+  spectre issue 'args.args' [r] (local cap)
+
+The 'nargs' and 'nret' locals come directly from a user-supplied
+buffer and are used as indexes into a small stack-based array and as
+inputs to copy_to_user() after they are subject to bounds checks.
+
+Use array_index_nospec() after the bounds checks to clamp these values
+for speculative execution.
+
+Signed-off-by: Nathan Lynch
+Reported-by: Breno Leitao
+Reviewed-by: Breno Leitao
+Signed-off-by: Michael Ellerman
+Link: https://msgid.link/20240530-sys_rtas-nargs-nret-v1-1-129acddd4d89@linux.ibm.com
+[Minor context change fixed]
+Signed-off-by: Cliff Liu
+Signed-off-by: He Zhe
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/powerpc/kernel/rtas.c |    4 ++++
+ 1 file changed, 4 insertions(+)
+
+--- a/arch/powerpc/kernel/rtas.c
++++ b/arch/powerpc/kernel/rtas.c
+@@ -18,6 +18,7 @@
+ #include
+ #include
+ #include
++#include
+ #include
+ #include
+ #include
+@@ -1839,6 +1840,9 @@ SYSCALL_DEFINE1(rtas, struct rtas_args _
+             || nargs + nret > ARRAY_SIZE(args.args))
+                 return -EINVAL;
+
++        nargs = array_index_nospec(nargs, ARRAY_SIZE(args.args));
++        nret = array_index_nospec(nret, ARRAY_SIZE(args.args) - nargs);
++
+         /* Copy in args. */
+         if (copy_from_user(args.args, uargs->args,
+                            nargs * sizeof(rtas_arg_t)) != 0)
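The powerpc/rtas patch above uses the standard Spectre v1 hardening idiom. A generic, self-contained sketch of that idiom (not the sys_rtas() code itself; example_read() is a made-up name):

#include <linux/nospec.h>

/*
 * A user-controlled index is first bounds-checked architecturally, then
 * clamped with array_index_nospec() so that a speculatively out-of-bounds
 * value cannot be used to form a load address.
 */
static int example_read(unsigned int idx, const int *table, size_t nelems)
{
        if (idx >= nelems)
                return -EINVAL;

        idx = array_index_nospec(idx, nelems);  /* clamp for speculation */

        return table[idx];
}
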
diff --git a/queue-6.6/series b/queue-6.6/series
index 6de89d1c67..30142da25d 100644
--- a/queue-6.6/series
+++ b/queue-6.6/series
@@ -369,3 +369,7 @@ fix-mmu-notifiers-for-range-based-invalidates.patch
 efi-libstub-bump-up-efi_mmap_nr_slack_slots-to-32.patch
 x86-xen-move-xen_reserve_extra_memory.patch
 x86-xen-fix-memblock_reserve-usage-on-pvh.patch
+x86-tdx-fix-arch_safe_halt-execution-for-tdx-vms.patch
+x86-split_lock-fix-the-delayed-detection-logic.patch
+nvme-rdma-unquiesce-admin_q-before-destroy-it.patch
+powerpc-rtas-prevent-spectre-v1-gadget-construction-in-sys_rtas.patch
diff --git a/queue-6.6/x86-split_lock-fix-the-delayed-detection-logic.patch b/queue-6.6/x86-split_lock-fix-the-delayed-detection-logic.patch
new file mode 100644
index 0000000000..382c12b87f
--- /dev/null
+++ b/queue-6.6/x86-split_lock-fix-the-delayed-detection-logic.patch
@@ -0,0 +1,156 @@
+From c929d08df8bee855528b9d15b853c892c54e1eee Mon Sep 17 00:00:00 2001
+From: Maksim Davydov
+Date: Wed, 15 Jan 2025 16:17:04 +0300
+Subject: x86/split_lock: Fix the delayed detection logic
+
+From: Maksim Davydov
+
+commit c929d08df8bee855528b9d15b853c892c54e1eee upstream.
+
+If the warning mode with disabled mitigation mode is used, then on each
+CPU where the split lock occurred detection will be disabled in order to
+make progress and delayed work will be scheduled, which then will enable
+detection back.
+
+Now it turns out that all CPUs use one global delayed work structure.
+This leads to the fact that if a split lock occurs on several CPUs
+at the same time (within 2 jiffies), only one CPU will schedule delayed
+work, but the rest will not.
+
+The return value of schedule_delayed_work_on() would have shown this,
+but it is not checked in the code.
+
+A diagram that can help to understand the bug reproduction:
+
+ - sld_update_msr() enables/disables SLD on both CPUs on the same core
+
+ - schedule_delayed_work_on() internally checks WORK_STRUCT_PENDING_BIT.
+   If a work has the 'pending' status, then schedule_delayed_work_on()
+   will return an error code and, most importantly, the work will not
+   be placed in the workqueue.
+
+Let's say we have a multicore system on which split_lock_mitigate=0 and
+a multithreaded application is running that calls splitlock in multiple
+threads. Due to the fact that sld_update_msr() affects the entire core
+(both CPUs), we will consider 2 CPUs from different cores. Let the 2
+threads of this application schedule to CPU0 (core 0) and to CPU 2
+(core 1), then:
+
+|                                 ||                                   |
+|          CPU 0 (core 0)         ||          CPU 2 (core 1)           |
+|_________________________________||___________________________________|
+|                                 ||                                   |
+| 1) SPLIT LOCK occurred          ||                                   |
+|                                 ||                                   |
+| 2) split_lock_warn()            ||                                   |
+|                                 ||                                   |
+| 3) sysctl_sld_mitigate == 0     ||                                   |
+|    (work = &sl_reenable)        ||                                   |
+|                                 ||                                   |
+| 4) schedule_delayed_work_on()   ||                                   |
+|    (reenable will be called     ||                                   |
+|     after 2 jiffies on CPU 0)   ||                                   |
+|                                 ||                                   |
+| 5) disable SLD for core 0       ||                                   |
+|                                 ||                                   |
+|    -------------------------    ||                                   |
+|                                 ||                                   |
+|                                 || 6) SPLIT LOCK occurred            |
+|                                 ||                                   |
+|                                 || 7) split_lock_warn()              |
+|                                 ||                                   |
+|                                 || 8) sysctl_sld_mitigate == 0       |
+|                                 ||    (work = &sl_reenable,          |
+|                                 ||    the same address as in 3) )    |
+|                                 ||                                   |
+|            2 jiffies            || 9) schedule_delayed_work_on()     |
+|                                 ||    fails because the work is in   |
+|                                 ||    the pending state since 4).    |
+|                                 ||    The work wasn't placed to the  |
+|                                 ||    workqueue. reenable won't be   |
+|                                 ||    called on CPU 2                |
+|                                 ||                                   |
+|                                 || 10) disable SLD for core 1        |
+|                                 ||                                   |
+|                                 ||     From now on SLD will          |
+|                                 ||     never be reenabled on core 1  |
+|                                 ||                                   |
+|    -------------------------    ||                                   |
+|                                 ||                                   |
+| 11) enable SLD for core 0 by    ||                                   |
+|     __split_lock_reenable       ||                                   |
+|                                 ||                                   |
+
+If the application threads can be scheduled to all processor cores,
+then over time there will be only one core left, on which SLD will be
+enabled and split lock will be able to be detected; and on all other
+cores SLD will be disabled all the time.
+
+Most likely, this bug has not been noticed for so long because
+sysctl_sld_mitigate default value is 1, and in this case a semaphore
+is used that does not allow 2 different cores to have SLD disabled at
+the same time, that is, strictly only one work is placed in the
+workqueue.
+
+In order to fix the warning mode with disabled mitigation mode,
+delayed work has to be per-CPU. Implement it.
+
+Fixes: 727209376f49 ("x86/split_lock: Add sysctl to control the misery mode")
+Signed-off-by: Maksim Davydov
+Signed-off-by: Ingo Molnar
+Tested-by: Guilherme G. Piccoli
+Cc: Thomas Gleixner
+Cc: Ravi Bangoria
+Cc: Tom Lendacky
+Link: https://lore.kernel.org/r/20250115131704.132609-1-davydov-max@yandex-team.ru
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/kernel/cpu/intel.c |   20 ++++++++++++++++----
+ 1 file changed, 16 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/kernel/cpu/intel.c
++++ b/arch/x86/kernel/cpu/intel.c
+@@ -1168,7 +1168,13 @@ static void __split_lock_reenable(struct
+ {
+         sld_update_msr(true);
+ }
+-static DECLARE_DELAYED_WORK(sl_reenable, __split_lock_reenable);
++/*
++ * In order for each CPU to schedule its delayed work independently of the
++ * others, delayed work struct must be per-CPU. This is not required when
++ * sysctl_sld_mitigate is enabled because of the semaphore that limits
++ * the number of simultaneously scheduled delayed works to 1.
++ */
++static DEFINE_PER_CPU(struct delayed_work, sl_reenable);
+
+ /*
+  * If a CPU goes offline with pending delayed work to re-enable split lock
+@@ -1189,7 +1195,7 @@ static int splitlock_cpu_offline(unsigne
+
+ static void split_lock_warn(unsigned long ip)
+ {
+-        struct delayed_work *work;
++        struct delayed_work *work = NULL;
+         int cpu;
+
+         if (!current->reported_split_lock)
+@@ -1211,11 +1217,17 @@ static void split_lock_warn(unsigned lon
+                 if (down_interruptible(&buslock_sem) == -EINTR)
+                         return;
+                 work = &sl_reenable_unlock;
+-        } else {
+-                work = &sl_reenable;
+         }
+
+         cpu = get_cpu();
++
++        if (!work) {
++                work = this_cpu_ptr(&sl_reenable);
++                /* Deferred initialization of per-CPU struct */
++                if (!work->work.func)
++                        INIT_DELAYED_WORK(work, __split_lock_reenable);
++        }
++
+         schedule_delayed_work_on(cpu, work, 2);
+
+         /* Disable split lock detection on this CPU to make progress */
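The split_lock patch above boils down to giving every CPU its own delayed work item. A rough, generic sketch of that pattern (illustrative only; the reenable_* names are hypothetical, the per-CPU and workqueue APIs are the real kernel ones):

/* One work item per CPU, so a pending item on CPU A never blocks CPU B. */
static DEFINE_PER_CPU(struct delayed_work, reenable_work);

static void reenable_fn(struct work_struct *work)
{
        /* re-enable the per-core feature here */
}

static void arm_reenable_on_this_cpu(void)
{
        struct delayed_work *work;
        int cpu = get_cpu();            /* pin to this CPU while scheduling */

        work = this_cpu_ptr(&reenable_work);
        if (!work->work.func)           /* first use on this CPU */
                INIT_DELAYED_WORK(work, reenable_fn);

        schedule_delayed_work_on(cpu, work, 2);
        put_cpu();
}
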
diff --git a/queue-6.6/x86-tdx-fix-arch_safe_halt-execution-for-tdx-vms.patch b/queue-6.6/x86-tdx-fix-arch_safe_halt-execution-for-tdx-vms.patch
new file mode 100644
index 0000000000..9322bcde10
--- /dev/null
+++ b/queue-6.6/x86-tdx-fix-arch_safe_halt-execution-for-tdx-vms.patch
@@ -0,0 +1,159 @@
+From 9f98a4f4e7216dbe366010b4cdcab6b220f229c4 Mon Sep 17 00:00:00 2001
+From: Vishal Annapurve
+Date: Fri, 28 Feb 2025 01:44:15 +0000
+Subject: x86/tdx: Fix arch_safe_halt() execution for TDX VMs
+
+From: Vishal Annapurve
+
+commit 9f98a4f4e7216dbe366010b4cdcab6b220f229c4 upstream.
+
+Direct HLT instruction execution causes #VEs for TDX VMs which are routed
+to hypervisor via TDCALL. If HLT is executed in STI-shadow, resulting #VE
+handler will enable interrupts before TDCALL is routed to hypervisor
+leading to missed wakeup events, as current TDX spec doesn't expose
+interruptibility state information to allow #VE handler to selectively
+enable interrupts.
+
+Commit bfe6ed0c6727 ("x86/tdx: Add HLT support for TDX guests")
+prevented the idle routines from executing HLT instruction in STI-shadow.
+But it missed the paravirt routine which can be reached via this path
+as an example:
+
+  kvm_wait() =>
+    safe_halt() =>
+      raw_safe_halt() =>
+        arch_safe_halt() =>
+          irq.safe_halt() =>
+            pv_native_safe_halt()
+
+To reliably handle arch_safe_halt() for TDX VMs, introduce explicit
+dependency on CONFIG_PARAVIRT and override paravirt halt()/safe_halt()
+routines with TDX-safe versions that execute direct TDCALL and needed
+interrupt flag updates. Executing direct TDCALL brings in additional
+benefit of avoiding HLT related #VEs altogether.
+
+As tested by Ryan Afranji:
+
+  "Tested with the specjbb2015 benchmark. It has heavy lock contention which leads
+   to many halt calls. TDX VMs suffered a poor score before this patchset.
+
+   Verified the major performance improvement with this patchset applied."
+
+Fixes: bfe6ed0c6727 ("x86/tdx: Add HLT support for TDX guests")
+Signed-off-by: Vishal Annapurve
+Signed-off-by: Ingo Molnar
+Reviewed-by: Kirill A. Shutemov
+Tested-by: Ryan Afranji
+Cc: Andy Lutomirski
+Cc: Brian Gerst
+Cc: Juergen Gross
+Cc: H. Peter Anvin
+Cc: Linus Torvalds
+Cc: Josh Poimboeuf
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20250228014416.3925664-3-vannapurve@google.com
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/Kconfig           |    1 +
+ arch/x86/coco/tdx/tdx.c    |   26 +++++++++++++++++++++++++-
+ arch/x86/include/asm/tdx.h |    4 ++--
+ arch/x86/kernel/process.c  |    2 +-
+ 4 files changed, 29 insertions(+), 4 deletions(-)
+
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -881,6 +881,7 @@ config INTEL_TDX_GUEST
+         depends on X86_64 && CPU_SUP_INTEL
+         depends on X86_X2APIC
+         depends on EFI_STUB
++        depends on PARAVIRT
+         select ARCH_HAS_CC_PLATFORM
+         select X86_MEM_ENCRYPT
+         select X86_MCE
+--- a/arch/x86/coco/tdx/tdx.c
++++ b/arch/x86/coco/tdx/tdx.c
+@@ -13,6 +13,7 @@
+ #include
+ #include
+ #include
++#include
+ #include
+ #include
+
+@@ -334,7 +335,7 @@ static int handle_halt(struct ve_info *v
+         return ve_instr_len(ve);
+ }
+
+-void __cpuidle tdx_safe_halt(void)
++void __cpuidle tdx_halt(void)
+ {
+         const bool irq_disabled = false;
+
+@@ -345,6 +346,16 @@ void __cpuidle tdx_safe_halt(void)
+         WARN_ONCE(1, "HLT instruction emulation failed\n");
+ }
+
++static void __cpuidle tdx_safe_halt(void)
++{
++        tdx_halt();
++        /*
++         * "__cpuidle" section doesn't support instrumentation, so stick
++         * with raw_* variant that avoids tracing hooks.
++         */
++        raw_local_irq_enable();
++}
++
+ static int read_msr(struct pt_regs *regs, struct ve_info *ve)
+ {
+         struct tdx_hypercall_args args = {
+@@ -889,6 +900,19 @@ void __init tdx_early_init(void)
+         x86_platform.guest.enc_tlb_flush_required = tdx_tlb_flush_required;
+
+         /*
++         * Avoid "sti;hlt" execution in TDX guests as HLT induces a #VE that
++         * will enable interrupts before HLT TDCALL invocation if executed
++         * in STI-shadow, possibly resulting in missed wakeup events.
++         *
++         * Modify all possible HLT execution paths to use TDX specific routines
++         * that directly execute TDCALL and toggle the interrupt state as
++         * needed after TDCALL completion. This also reduces HLT related #VEs
++         * in addition to having a reliable halt logic execution.
++         */
++        pv_ops.irq.safe_halt = tdx_safe_halt;
++        pv_ops.irq.halt = tdx_halt;
++
++        /*
+          * TDX intercepts the RDMSR to read the X2APIC ID in the parallel
+          * bringup low level code. That raises #VE which cannot be handled
+          * there.
+--- a/arch/x86/include/asm/tdx.h
++++ b/arch/x86/include/asm/tdx.h
+@@ -46,7 +46,7 @@ void tdx_get_ve_info(struct ve_info *ve)
+
+ bool tdx_handle_virt_exception(struct pt_regs *regs, struct ve_info *ve);
+
+-void tdx_safe_halt(void);
++void tdx_halt(void);
+
+ bool tdx_early_handle_ve(struct pt_regs *regs);
+
+@@ -55,7 +55,7 @@ int tdx_mcall_get_report0(u8 *reportdata
+ #else
+
+ static inline void tdx_early_init(void) { };
+-static inline void tdx_safe_halt(void) { };
++static inline void tdx_halt(void) { };
+
+ static inline bool tdx_early_handle_ve(struct pt_regs *regs) { return false; }
+
+--- a/arch/x86/kernel/process.c
++++ b/arch/x86/kernel/process.c
+@@ -955,7 +955,7 @@ void select_idle_routine(const struct cp
+                 static_call_update(x86_idle, mwait_idle);
+         } else if (cpu_feature_enabled(X86_FEATURE_TDX_GUEST)) {
+                 pr_info("using TDX aware idle routine\n");
+-                static_call_update(x86_idle, tdx_safe_halt);
++                static_call_update(x86_idle, tdx_halt);
+         } else
+                 static_call_update(x86_idle, default_idle);
+ }
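For context on why overriding pv_ops works in the x86/tdx patch above: with CONFIG_PARAVIRT, arch_safe_halt() is an indirect call through an ops table, so repointing the halt hooks early in boot redirects every halt path away from the raw "sti;hlt" sequence. A deliberately simplified, hypothetical model of that indirection (not the kernel's pv_ops implementation):

/* Hypothetical, simplified model of the paravirt halt indirection. */
struct irq_ops_example {
        void (*halt)(void);             /* halt, leave IRQ state alone */
        void (*safe_halt)(void);        /* enable IRQs, then halt      */
};

static void native_safe_halt_example(void)
{
        /* native case: "sti; hlt" */
}

static void tdx_safe_halt_example(void)
{
        /* TDX case: TDCALL-based halt, then re-enable interrupts */
}

static struct irq_ops_example irq_ops = {
        .safe_halt = native_safe_halt_example,
};

static void arch_safe_halt_example(void)
{
        irq_ops.safe_halt();            /* indirect call through the table */
}

static void tdx_early_init_example(void)
{
        /* swap in the TDX-safe routine before anything can halt */
        irq_ops.safe_halt = tdx_safe_halt_example;
}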