mptcp-ensure-listener-is-unhashed-before-updating-the-sk-status.patch
mm-hwpoison-try-to-recover-from-copy-on-write-faults.patch
mm-hwpoison-when-copy-on-write-hits-poison-take-page-offline.patch
+x86-microcode-amd-load-late-on-both-threads-too.patch
+x86-smp-make-stop_other_cpus-more-robust.patch
+x86-smp-dont-access-non-existing-cpuid-leaf.patch
+x86-smp-remove-pointless-wmb-s-from-native_stop_other_cpus.patch
+x86-smp-use-dedicated-cache-line-for-mwait_play_dead.patch
+x86-smp-cure-kexec-vs.-mwait_play_dead-breakage.patch
--- /dev/null
+From a32b0f0db3f396f1c9be2fe621e77c09ec3d8e7d Mon Sep 17 00:00:00 2001
+From: "Borislav Petkov (AMD)" <bp@alien8.de>
+Date: Tue, 2 May 2023 19:53:50 +0200
+Subject: x86/microcode/AMD: Load late on both threads too
+
+From: Borislav Petkov (AMD) <bp@alien8.de>
+
+commit a32b0f0db3f396f1c9be2fe621e77c09ec3d8e7d upstream.
+
+Do the same as early loading - load on both threads.
+
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Cc: <stable@kernel.org>
+Link: https://lore.kernel.org/r/20230605141332.25948-1-bp@alien8.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/cpu/microcode/amd.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/x86/kernel/cpu/microcode/amd.c
++++ b/arch/x86/kernel/cpu/microcode/amd.c
+@@ -705,7 +705,7 @@ static enum ucode_state apply_microcode_
+ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
+
+ /* need to apply patch? */
+- if (rev >= mc_amd->hdr.patch_id) {
++ if (rev > mc_amd->hdr.patch_id) {
+ ret = UCODE_OK;
+ goto out;
+ }
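
To make the semantics of the one-character change concrete, here is a small user-space sketch (not kernel code; the helper name skip_apply() and the revision values are invented for illustration): with the old '>=' check an SMT sibling that already reports the revision its core mate just loaded skips the apply, while the new '>' check skips only when the reported revision is strictly newer, so the microcode is written on both threads.

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  /* Hypothetical stand-in for the revision check in apply_microcode_amd(). */
  static bool skip_apply(uint32_t rev, uint32_t patch_id, bool strict)
  {
          /* old check: rev >= patch_id, new check: rev > patch_id */
          return strict ? rev > patch_id : rev >= patch_id;
  }

  int main(void)
  {
          /* The sibling thread already reports the revision thread 0 loaded. */
          uint32_t rev = 0x0a201210, patch_id = 0x0a201210;

          printf("old '>=' check skips apply: %d\n", skip_apply(rev, patch_id, false));
          printf("new '>'  check skips apply: %d\n", skip_apply(rev, patch_id, true));
          return 0;
  }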
--- /dev/null
+From d7893093a7417527c0d73c9832244e65c9d0114f Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 15 Jun 2023 22:33:57 +0200
+Subject: x86/smp: Cure kexec() vs. mwait_play_dead() breakage
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit d7893093a7417527c0d73c9832244e65c9d0114f upstream.
+
+TLDR: It's a mess.
+
+When kexec() is executed on a system with offline CPUs, which are parked in
+mwait_play_dead(), it can end up in a triple fault during the bootup of the
+kexec kernel or cause hard-to-diagnose data corruption.
+
+The reason is that kexec() eventually overwrites the previous kernel's text,
+page tables, data and stack. If it writes to the cache line which is
+monitored by a previously offlined CPU, MWAIT resumes execution and ends
+up executing the wrong text, dereferencing overwritten page tables or
+corrupting the kexec kernel's data.
+
+Cure this by bringing the offlined CPUs out of MWAIT into HLT.
+
+Write to the monitored cache line of each offline CPU, which makes MWAIT
+resume execution. The written control word tells the offlined CPUs to issue
+HLT, which does not have the MWAIT problem.
+
+That does not help if a stray NMI, MCE or SMI hits the offlined CPUs, as
+those make them come out of HLT.
+
+A follow-up change will put them into INIT, which protects at least against
+NMI and SMI.
+
+Fixes: ea53069231f9 ("x86, hotplug: Use mwait to offline a processor, fix the legacy case")
+Reported-by: Ashok Raj <ashok.raj@intel.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Tested-by: Ashok Raj <ashok.raj@intel.com>
+Reviewed-by: Ashok Raj <ashok.raj@intel.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20230615193330.492257119@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/smp.h | 2 +
+ arch/x86/kernel/smp.c | 5 +++
+ arch/x86/kernel/smpboot.c | 59 +++++++++++++++++++++++++++++++++++++++++++++
+ 3 files changed, 66 insertions(+)
+
+--- a/arch/x86/include/asm/smp.h
++++ b/arch/x86/include/asm/smp.h
+@@ -132,6 +132,8 @@ void wbinvd_on_cpu(int cpu);
+ int wbinvd_on_all_cpus(void);
+ void cond_wakeup_cpu0(void);
+
++void smp_kick_mwait_play_dead(void);
++
+ void native_smp_send_reschedule(int cpu);
+ void native_send_call_func_ipi(const struct cpumask *mask);
+ void native_send_call_func_single_ipi(int cpu);
+--- a/arch/x86/kernel/smp.c
++++ b/arch/x86/kernel/smp.c
+@@ -21,6 +21,7 @@
+ #include <linux/interrupt.h>
+ #include <linux/cpu.h>
+ #include <linux/gfp.h>
++#include <linux/kexec.h>
+
+ #include <asm/mtrr.h>
+ #include <asm/tlbflush.h>
+@@ -157,6 +158,10 @@ static void native_stop_other_cpus(int w
+ if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1)
+ return;
+
++ /* For kexec, ensure that offline CPUs are out of MWAIT and in HLT */
++ if (kexec_in_progress)
++ smp_kick_mwait_play_dead();
++
+ /*
+ * 1) Send an IPI on the reboot vector to all other CPUs.
+ *
+--- a/arch/x86/kernel/smpboot.c
++++ b/arch/x86/kernel/smpboot.c
+@@ -53,6 +53,7 @@
+ #include <linux/tboot.h>
+ #include <linux/gfp.h>
+ #include <linux/cpuidle.h>
++#include <linux/kexec.h>
+ #include <linux/numa.h>
+ #include <linux/pgtable.h>
+ #include <linux/overflow.h>
+@@ -104,6 +105,9 @@ struct mwait_cpu_dead {
+ unsigned int status;
+ };
+
++#define CPUDEAD_MWAIT_WAIT 0xDEADBEEF
++#define CPUDEAD_MWAIT_KEXEC_HLT 0x4A17DEAD
++
+ /*
+ * Cache line aligned data for mwait_play_dead(). Separate on purpose so
+ * that it's unlikely to be touched by other CPUs.
+@@ -166,6 +170,10 @@ static void smp_callin(void)
+ {
+ int cpuid;
+
++ /* Mop up eventual mwait_play_dead() wreckage */
++ this_cpu_write(mwait_cpu_dead.status, 0);
++ this_cpu_write(mwait_cpu_dead.control, 0);
++
+ /*
+ * If waken up by an INIT in an 82489DX configuration
+ * cpu_callout_mask guarantees we don't get here before
+@@ -1795,6 +1803,10 @@ static inline void mwait_play_dead(void)
+ (highest_subcstate - 1);
+ }
+
++ /* Set up state for the kexec() hack below */
++ md->status = CPUDEAD_MWAIT_WAIT;
++ md->control = CPUDEAD_MWAIT_WAIT;
++
+ wbinvd();
+
+ while (1) {
+@@ -1812,10 +1824,57 @@ static inline void mwait_play_dead(void)
+ mb();
+ __mwait(eax, 0);
+
++ if (READ_ONCE(md->control) == CPUDEAD_MWAIT_KEXEC_HLT) {
++ /*
++ * Kexec is about to happen. Don't go back into mwait() as
++ * the kexec kernel might overwrite text and data including
++ * page tables and stack. So mwait() would resume when the
++ * monitor cache line is written to and then the CPU goes
++ * south due to overwritten text, page tables and stack.
++ *
++ * Note: This does _NOT_ protect against a stray MCE, NMI,
++ * SMI. They will resume execution at the instruction
++ * following the HLT instruction and run into the problem
++ * which this is trying to prevent.
++ */
++ WRITE_ONCE(md->status, CPUDEAD_MWAIT_KEXEC_HLT);
++ while(1)
++ native_halt();
++ }
++
+ cond_wakeup_cpu0();
+ }
+ }
+
++/*
++ * Kick all "offline" CPUs out of mwait on kexec(). See comment in
++ * mwait_play_dead().
++ */
++void smp_kick_mwait_play_dead(void)
++{
++ u32 newstate = CPUDEAD_MWAIT_KEXEC_HLT;
++ struct mwait_cpu_dead *md;
++ unsigned int cpu, i;
++
++ for_each_cpu_andnot(cpu, cpu_present_mask, cpu_online_mask) {
++ md = per_cpu_ptr(&mwait_cpu_dead, cpu);
++
++ /* Does it sit in mwait_play_dead() ? */
++ if (READ_ONCE(md->status) != CPUDEAD_MWAIT_WAIT)
++ continue;
++
++ /* Wait up to 5ms */
++ for (i = 0; READ_ONCE(md->status) != newstate && i < 1000; i++) {
++ /* Bring it out of mwait */
++ WRITE_ONCE(md->control, newstate);
++ udelay(5);
++ }
++
++ if (READ_ONCE(md->status) != newstate)
++ pr_err_once("CPU%u is stuck in mwait_play_dead()\n", cpu);
++ }
++}
++
+ void hlt_play_dead(void)
+ {
+ if (__this_cpu_read(cpu_info.x86) >= 4)
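
The control/status handshake added above can be mimicked in plain user-space C, which may help when reading the hunk. This is only an analogue under stated assumptions: pthreads plus C11 atomics stand in for MONITOR/MWAIT and the per-CPU data, and the names park_thread(), PARKED and KICKED are invented. A parked thread polls a control word, acknowledges through a status word and then "halts", while the coordinator kicks it and waits a bounded time for the acknowledgement, mirroring smp_kick_mwait_play_dead().

  #include <pthread.h>
  #include <stdatomic.h>
  #include <stdio.h>
  #include <unistd.h>

  #define PARKED 0xDEADBEEFu
  #define KICKED 0x4A17DEADu

  static _Atomic unsigned int control = PARKED;
  static _Atomic unsigned int status_word = PARKED;

  static void *park_thread(void *arg)
  {
          (void)arg;
          /* The kernel waits in MONITOR/MWAIT on md->control; poll instead. */
          while (atomic_load(&control) != KICKED)
                  usleep(10);
          /* Acknowledge before "halting", like WRITE_ONCE(md->status, ...). */
          atomic_store(&status_word, KICKED);
          return NULL;    /* the kernel loops on native_halt() at this point */
  }

  int main(void)
  {
          pthread_t t;
          int i;

          pthread_create(&t, NULL, park_thread, NULL);

          /* Coordinator: kick the parked thread, then wait a bounded time. */
          atomic_store(&control, KICKED);
          for (i = 0; atomic_load(&status_word) != KICKED && i < 1000; i++)
                  usleep(5);

          printf("parked thread %s\n",
                 atomic_load(&status_word) == KICKED ? "acknowledged" : "is stuck");
          pthread_join(t, NULL);
          return 0;
  }

Build with: cc -pthread handshake.c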
--- /dev/null
+From 9b040453d4440659f33dc6f0aa26af418ebfe70b Mon Sep 17 00:00:00 2001
+From: Tony Battersby <tonyb@cybernetics.com>
+Date: Thu, 15 Jun 2023 22:33:52 +0200
+Subject: x86/smp: Dont access non-existing CPUID leaf
+
+From: Tony Battersby <tonyb@cybernetics.com>
+
+commit 9b040453d4440659f33dc6f0aa26af418ebfe70b upstream.
+
+stop_this_cpu() tests CPUID leaf 0x8000001f::EAX unconditionally. Intel
+CPUs return the content of the highest supported leaf when a non-existing
+leaf is read, while AMD CPUs return all zeros for unsupported leafs.
+
+So the result of the test on Intel CPUs is a lottery.
+
+While harmless, it's incorrect and causes the conditional wbinvd() to be
+issued where it is not required.
+
+Check whether the leaf is supported before reading it.
+
+[ tglx: Adjusted changelog ]
+
+Fixes: 08f253ec3767 ("x86/cpu: Clear SME feature flag when not in use")
+Signed-off-by: Tony Battersby <tonyb@cybernetics.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Mario Limonciello <mario.limonciello@amd.com>
+Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/3817d810-e0f1-8ef8-0bbd-663b919ca49b@cybernetics.com
+Link: https://lore.kernel.org/r/20230615193330.322186388@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/process.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+--- a/arch/x86/kernel/process.c
++++ b/arch/x86/kernel/process.c
+@@ -748,6 +748,7 @@ struct cpumask cpus_stop_mask;
+
+ void __noreturn stop_this_cpu(void *dummy)
+ {
++ struct cpuinfo_x86 *c = this_cpu_ptr(&cpu_info);
+ unsigned int cpu = smp_processor_id();
+
+ local_irq_disable();
+@@ -762,7 +763,7 @@ void __noreturn stop_this_cpu(void *dumm
+ */
+ set_cpu_online(cpu, false);
+ disable_local_APIC();
+- mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
++ mcheck_cpu_clear(c);
+
+ /*
+ * Use wbinvd on processors that support SME. This provides support
+@@ -776,7 +777,7 @@ void __noreturn stop_this_cpu(void *dumm
+ * Test the CPUID bit directly because the machine might've cleared
+ * X86_FEATURE_SME due to cmdline options.
+ */
+- if (cpuid_eax(0x8000001f) & BIT(0))
++ if (c->extended_cpuid_level >= 0x8000001f && (cpuid_eax(0x8000001f) & BIT(0)))
+ native_wbinvd();
+
+ /*
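
The same guard can be reproduced in user space with the compiler's <cpuid.h> helpers, which is an easy way to observe the difference on a given machine. This is a sketch, not kernel code; __get_cpuid() itself already refuses leaves above the advertised maximum, and the explicit comparison only mirrors the c->extended_cpuid_level check in the hunk above.

  #include <cpuid.h>
  #include <stdio.h>

  int main(void)
  {
          unsigned int eax, ebx, ecx, edx;
          /* Highest supported extended leaf, i.e. CPUID 0x80000000.EAX */
          unsigned int max_ext = __get_cpuid_max(0x80000000, NULL);

          if (max_ext >= 0x8000001f &&
              __get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx))
                  printf("leaf 0x8000001f EAX=%#x, SME bit: %u\n", eax, eax & 1);
          else
                  printf("leaf 0x8000001f is not implemented on this CPU\n");
          return 0;
  }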
--- /dev/null
+From 1f5e7eb7868e42227ac426c96d437117e6e06e8e Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed, 26 Apr 2023 18:37:00 +0200
+Subject: x86/smp: Make stop_other_cpus() more robust
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 1f5e7eb7868e42227ac426c96d437117e6e06e8e upstream.
+
+Tony reported intermittent lockups on poweroff. His analysis identified the
+wbinvd() in stop_this_cpu() as the culprit. This was added to ensure that
+on SME enabled machines a kexec() does not leave any stale data in the
+caches when switching from encrypted to non-encrypted mode or vice versa.
+
+That wbinvd() is conditional on the SME feature bit which is read directly
+from CPUID. But that readout does not check whether the CPUID leaf is
+available or not. If it's not available the CPU will return the value of
+the highest supported leaf instead. Depending on that content, the "SME" bit
+might or might not be set.
+
+That's incorrect but harmless. Making the CPUID readout conditional makes
+the observed hangs go away, but it does not fix the underlying problem:
+
+  CPU0                                    CPU1
+
+   stop_other_cpus()
+     send_IPIs(REBOOT);                   stop_this_cpu()
+     while (num_online_cpus() > 1);         set_online(false);
+     proceed... -> hang
+                                            wbinvd()
+
+WBINVD is an expensive operation and if multiple CPUs issue it at the same
+time the resulting delays are even larger.
+
+But CPU0 has already observed num_online_cpus() drop to 1 and proceeds,
+which causes the system to hang.
+
+This issue exists independent of WBINVD, but the delays caused by WBINVD
+make it more prominent.
+
+Make this more robust by adding a cpumask which is initialized to the
+online CPU mask before sending the IPIs and CPUs clear their bit in
+stop_this_cpu() after the WBINVD completed. Check for that cpumask to
+become empty in stop_other_cpus() instead of watching num_online_cpus().
+
+The cpumask cannot plug all holes either, but it's better than a raw
+counter and allows the NMI fallback IPI to be restricted to only those
+CPUs which have not reported within the timeout window.
+
+Fixes: 08f253ec3767 ("x86/cpu: Clear SME feature flag when not in use")
+Reported-by: Tony Battersby <tonyb@cybernetics.com>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
+Reviewed-by: Ashok Raj <ashok.raj@intel.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/all/3817d810-e0f1-8ef8-0bbd-663b919ca49b@cybernetics.com
+Link: https://lore.kernel.org/r/87h6r770bv.ffs@tglx
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/cpu.h | 2 +
+ arch/x86/kernel/process.c | 23 +++++++++++++++-
+ arch/x86/kernel/smp.c | 62 +++++++++++++++++++++++++++++----------------
+ 3 files changed, 64 insertions(+), 23 deletions(-)
+
+--- a/arch/x86/include/asm/cpu.h
++++ b/arch/x86/include/asm/cpu.h
+@@ -96,4 +96,6 @@ static inline bool intel_cpu_signatures_
+
+ extern u64 x86_read_arch_cap_msr(void);
+
++extern struct cpumask cpus_stop_mask;
++
+ #endif /* _ASM_X86_CPU_H */
+--- a/arch/x86/kernel/process.c
++++ b/arch/x86/kernel/process.c
+@@ -744,13 +744,23 @@ bool xen_set_default_idle(void)
+ }
+ #endif
+
++struct cpumask cpus_stop_mask;
++
+ void __noreturn stop_this_cpu(void *dummy)
+ {
++ unsigned int cpu = smp_processor_id();
++
+ local_irq_disable();
++
+ /*
+- * Remove this CPU:
++ * Remove this CPU from the online mask and disable it
++ * unconditionally. This might be redundant in case that the reboot
++ * vector was handled late and stop_other_cpus() sent an NMI.
++ *
++ * According to SDM and APM NMIs can be accepted even after soft
++ * disabling the local APIC.
+ */
+- set_cpu_online(smp_processor_id(), false);
++ set_cpu_online(cpu, false);
+ disable_local_APIC();
+ mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
+
+@@ -768,6 +778,15 @@ void __noreturn stop_this_cpu(void *dumm
+ */
+ if (cpuid_eax(0x8000001f) & BIT(0))
+ native_wbinvd();
++
++ /*
++ * This brings a cache line back and dirties it, but
++ * native_stop_other_cpus() will overwrite cpus_stop_mask after it
++ * observed that all CPUs reported stop. This write will invalidate
++ * the related cache line on this CPU.
++ */
++ cpumask_clear_cpu(cpu, &cpus_stop_mask);
++
+ for (;;) {
+ /*
+ * Use native_halt() so that memory contents don't change
+--- a/arch/x86/kernel/smp.c
++++ b/arch/x86/kernel/smp.c
+@@ -27,6 +27,7 @@
+ #include <asm/mmu_context.h>
+ #include <asm/proto.h>
+ #include <asm/apic.h>
++#include <asm/cpu.h>
+ #include <asm/idtentry.h>
+ #include <asm/nmi.h>
+ #include <asm/mce.h>
+@@ -146,31 +147,43 @@ static int register_stop_handler(void)
+
+ static void native_stop_other_cpus(int wait)
+ {
+- unsigned long flags;
+- unsigned long timeout;
++ unsigned int cpu = smp_processor_id();
++ unsigned long flags, timeout;
+
+ if (reboot_force)
+ return;
+
+- /*
+- * Use an own vector here because smp_call_function
+- * does lots of things not suitable in a panic situation.
+- */
++ /* Only proceed if this is the first CPU to reach this code */
++ if (atomic_cmpxchg(&stopping_cpu, -1, cpu) != -1)
++ return;
+
+ /*
+- * We start by using the REBOOT_VECTOR irq.
+- * The irq is treated as a sync point to allow critical
+- * regions of code on other cpus to release their spin locks
+- * and re-enable irqs. Jumping straight to an NMI might
+- * accidentally cause deadlocks with further shutdown/panic
+- * code. By syncing, we give the cpus up to one second to
+- * finish their work before we force them off with the NMI.
++ * 1) Send an IPI on the reboot vector to all other CPUs.
++ *
++ * The other CPUs should react on it after leaving critical
++ * sections and re-enabling interrupts. They might still hold
++ * locks, but there is nothing which can be done about that.
++ *
++ * 2) Wait for all other CPUs to report that they reached the
++ * HLT loop in stop_this_cpu()
++ *
++ * 3) If #2 timed out send an NMI to the CPUs which did not
++ * yet report
++ *
++ * 4) Wait for all other CPUs to report that they reached the
++ * HLT loop in stop_this_cpu()
++ *
++ * #3 can obviously race against a CPU reaching the HLT loop late.
++ * That CPU will have reported already and the "have all CPUs
++ * reached HLT" condition will be true despite the fact that the
++ * other CPU is still handling the NMI. Again, there is no
++ * protection against that as "disabled" APICs still respond to
++ * NMIs.
+ */
+- if (num_online_cpus() > 1) {
+- /* did someone beat us here? */
+- if (atomic_cmpxchg(&stopping_cpu, -1, safe_smp_processor_id()) != -1)
+- return;
++ cpumask_copy(&cpus_stop_mask, cpu_online_mask);
++ cpumask_clear_cpu(cpu, &cpus_stop_mask);
+
++ if (!cpumask_empty(&cpus_stop_mask)) {
+ /* sync above data before sending IRQ */
+ wmb();
+
+@@ -183,12 +196,12 @@ static void native_stop_other_cpus(int w
+ * CPUs reach shutdown state.
+ */
+ timeout = USEC_PER_SEC;
+- while (num_online_cpus() > 1 && timeout--)
++ while (!cpumask_empty(&cpus_stop_mask) && timeout--)
+ udelay(1);
+ }
+
+ /* if the REBOOT_VECTOR didn't work, try with the NMI */
+- if (num_online_cpus() > 1) {
++ if (!cpumask_empty(&cpus_stop_mask)) {
+ /*
+ * If NMI IPI is enabled, try to register the stop handler
+ * and send the IPI. In any case try to wait for the other
+@@ -200,7 +213,8 @@ static void native_stop_other_cpus(int w
+
+ pr_emerg("Shutting down cpus with NMI\n");
+
+- apic_send_IPI_allbutself(NMI_VECTOR);
++ for_each_cpu(cpu, &cpus_stop_mask)
++ apic->send_IPI(cpu, NMI_VECTOR);
+ }
+ /*
+ * Don't wait longer than 10 ms if the caller didn't
+@@ -208,7 +222,7 @@ static void native_stop_other_cpus(int w
+ * one or more CPUs do not reach shutdown state.
+ */
+ timeout = USEC_PER_MSEC * 10;
+- while (num_online_cpus() > 1 && (wait || timeout--))
++ while (!cpumask_empty(&cpus_stop_mask) && (wait || timeout--))
+ udelay(1);
+ }
+
+@@ -216,6 +230,12 @@ static void native_stop_other_cpus(int w
+ disable_local_APIC();
+ mcheck_cpu_clear(this_cpu_ptr(&cpu_info));
+ local_irq_restore(flags);
++
++ /*
++ * Ensure that the cpus_stop_mask cache lines are invalidated on
++ * the other CPUs. See comment vs. SME in stop_this_cpu().
++ */
++ cpumask_clear(&cpus_stop_mask);
+ }
+
+ /*
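
A condensed user-space sketch of the reporting pattern this patch introduces (hypothetical names; an atomic bitmask stands in for cpus_stop_mask): each stopping worker clears its own bit once it has parked, and the coordinator waits a bounded time before escalating only to the bits that are still set.

  #include <stdatomic.h>
  #include <stdio.h>

  static _Atomic unsigned long pending_mask;

  /* Worker side: the analogue of cpumask_clear_cpu() in stop_this_cpu(). */
  static void report_stopped(unsigned int id)
  {
          atomic_fetch_and(&pending_mask, ~(1UL << id));
  }

  /* Coordinator side: spin for a bounded time, then return the stragglers
   * that still need the NMI-style escalation. */
  static unsigned long wait_for_stop(unsigned long spins)
  {
          while (atomic_load(&pending_mask) && spins--)
                  ;       /* the kernel udelay(1)s here */
          return atomic_load(&pending_mask);
  }

  int main(void)
  {
          atomic_store(&pending_mask, 0x0eUL);    /* workers 1..3 pending */
          report_stopped(1);
          report_stopped(3);

          printf("escalate to mask %#lx\n", wait_for_stop(1000));  /* 0x4 */
          return 0;
  }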
--- /dev/null
+From 2affa6d6db28855e6340b060b809c23477aa546e Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 15 Jun 2023 22:33:54 +0200
+Subject: x86/smp: Remove pointless wmb()s from native_stop_other_cpus()
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit 2affa6d6db28855e6340b060b809c23477aa546e upstream.
+
+The wmb()s before sending the IPIs are not synchronizing anything.
+
+If anything, the APIC IPI functions themselves have to provide or act as the
+appropriate barriers.
+
+Remove these cargo-cult barriers, which come with no explanation of what they
+are supposed to synchronize.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20230615193330.378358382@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/smp.c | 6 ------
+ 1 file changed, 6 deletions(-)
+
+--- a/arch/x86/kernel/smp.c
++++ b/arch/x86/kernel/smp.c
+@@ -184,9 +184,6 @@ static void native_stop_other_cpus(int w
+ cpumask_clear_cpu(cpu, &cpus_stop_mask);
+
+ if (!cpumask_empty(&cpus_stop_mask)) {
+- /* sync above data before sending IRQ */
+- wmb();
+-
+ apic_send_IPI_allbutself(REBOOT_VECTOR);
+
+ /*
+@@ -208,9 +205,6 @@ static void native_stop_other_cpus(int w
+ * CPUs to stop.
+ */
+ if (!smp_no_nmi_ipi && !register_stop_handler()) {
+- /* Sync above data before sending IRQ */
+- wmb();
+-
+ pr_emerg("Shutting down cpus with NMI\n");
+
+ for_each_cpu(cpu, &cpus_stop_mask)
--- /dev/null
+From f9c9987bf52f4e42e940ae217333ebb5a4c3b506 Mon Sep 17 00:00:00 2001
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Thu, 15 Jun 2023 22:33:55 +0200
+Subject: x86/smp: Use dedicated cache-line for mwait_play_dead()
+
+From: Thomas Gleixner <tglx@linutronix.de>
+
+commit f9c9987bf52f4e42e940ae217333ebb5a4c3b506 upstream.
+
+Monitoring idletask::thread_info::flags in mwait_play_dead() has been an
+obvious choice, as all that is needed is a cache line which is not written
+by other CPUs.
+
+But there is a use case where a "dead" CPU needs to be brought out of
+MWAIT: kexec().
+
+This is required as kexec() can overwrite text, pagetables, stacks and the
+monitored cacheline of the original kernel. The latter causes MWAIT to
+resume execution, which obviously wreaks havoc on the kexec kernel and
+usually results in triple faults.
+
+Use a dedicated per CPU storage to prepare for that.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Reviewed-by: Ashok Raj <ashok.raj@intel.com>
+Reviewed-by: Borislav Petkov (AMD) <bp@alien8.de>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20230615193330.434553750@linutronix.de
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/kernel/smpboot.c | 24 ++++++++++++++----------
+ 1 file changed, 14 insertions(+), 10 deletions(-)
+
+--- a/arch/x86/kernel/smpboot.c
++++ b/arch/x86/kernel/smpboot.c
+@@ -99,6 +99,17 @@ EXPORT_PER_CPU_SYMBOL(cpu_die_map);
+ DEFINE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info);
+ EXPORT_PER_CPU_SYMBOL(cpu_info);
+
++struct mwait_cpu_dead {
++ unsigned int control;
++ unsigned int status;
++};
++
++/*
++ * Cache line aligned data for mwait_play_dead(). Separate on purpose so
++ * that it's unlikely to be touched by other CPUs.
++ */
++static DEFINE_PER_CPU_ALIGNED(struct mwait_cpu_dead, mwait_cpu_dead);
++
+ /* Logical package management. We might want to allocate that dynamically */
+ unsigned int __max_logical_packages __read_mostly;
+ EXPORT_SYMBOL(__max_logical_packages);
+@@ -1746,10 +1757,10 @@ EXPORT_SYMBOL_GPL(cond_wakeup_cpu0);
+ */
+ static inline void mwait_play_dead(void)
+ {
++ struct mwait_cpu_dead *md = this_cpu_ptr(&mwait_cpu_dead);
+ unsigned int eax, ebx, ecx, edx;
+ unsigned int highest_cstate = 0;
+ unsigned int highest_subcstate = 0;
+- void *mwait_ptr;
+ int i;
+
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD ||
+@@ -1784,13 +1795,6 @@ static inline void mwait_play_dead(void)
+ (highest_subcstate - 1);
+ }
+
+- /*
+- * This should be a memory location in a cache line which is
+- * unlikely to be touched by other processors. The actual
+- * content is immaterial as it is not actually modified in any way.
+- */
+- mwait_ptr = ¤t_thread_info()->flags;
+-
+ wbinvd();
+
+ while (1) {
+@@ -1802,9 +1806,9 @@ static inline void mwait_play_dead(void)
+ * case where we return around the loop.
+ */
+ mb();
+- clflush(mwait_ptr);
++ clflush(md);
+ mb();
+- __monitor(mwait_ptr, 0, 0);
++ __monitor(md, 0, 0);
+ mb();
+ __mwait(eax, 0);
+
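
The layout trick itself is plain C. A small sketch (hypothetical struct and array names, assuming 64-byte cache lines) shows how the alignment gives every entry its own line, so a write aimed at one CPU's control word cannot touch the line another CPU is monitoring:

  #include <stdalign.h>
  #include <stddef.h>
  #include <stdio.h>

  struct mwait_slot {
          alignas(64) unsigned int control;       /* assumed 64-byte lines */
          unsigned int status;
  };

  static struct mwait_slot slots[8];              /* one slot per "CPU" */

  int main(void)
  {
          /* Both print 64: each slot occupies exactly one cache line. */
          printf("sizeof(struct mwait_slot) = %zu\n", sizeof(struct mwait_slot));
          printf("stride between slots      = %td\n",
                 (char *)&slots[1] - (char *)&slots[0]);
          return 0;
  }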