--- /dev/null
+From 04c3024560d3a14acd18d0a51a1d0a89d29b7eb5 Mon Sep 17 00:00:00 2001
+From: "Borislav Petkov (AMD)" <bp@alien8.de>
+Date: Fri, 27 Oct 2023 14:24:16 +0200
+Subject: x86/barrier: Do not serialize MSR accesses on AMD
+
+From: Borislav Petkov (AMD) <bp@alien8.de>
+
+commit 04c3024560d3a14acd18d0a51a1d0a89d29b7eb5 upstream.
+
+AMD does not have the requirement for a synchronization barrier when
+acccessing a certain group of MSRs. Do not incur that unnecessary
+penalty there.
+
+There will be a CPUID bit which explicitly states that a MFENCE is not
+needed. Once that bit is added to the APM, this will be extended with
+it.
+
+While at it, move to processor.h to avoid include hell. Untangling that
+file properly is a matter for another day.
+
+Some notes on the performance aspect of why this is relevant, courtesy
+of Kishon VijayAbraham <Kishon.VijayAbraham@amd.com>:
+
+On a AMD Zen4 system with 96 cores, a modified ipi-bench[1] on a VM
+shows x2AVIC IPI rate is 3% to 4% lower than AVIC IPI rate. The
+ipi-bench is modified so that the IPIs are sent between two vCPUs in the
+same CCX. This also requires to pin the vCPU to a physical core to
+prevent any latencies. This simulates the use case of pinning vCPUs to
+the thread of a single CCX to avoid interrupt IPI latency.
+
+In order to avoid run-to-run variance (for both x2AVIC and AVIC), the
+below configurations are done:
+
+ 1) Disable Power States in BIOS (to prevent the system from going to
+ lower power state)
+
+ 2) Run the system at fixed frequency 2500MHz (to prevent the system
+ from increasing the frequency when the load is more)
+
+With the above configuration:
+
+*) Performance measured using ipi-bench for AVIC:
+ Average Latency: 1124.98ns [Time to send IPI from one vCPU to another vCPU]
+
+ Cumulative throughput: 42.6759M/s [Total number of IPIs sent in a second from
+ 48 vCPUs simultaneously]
+
+*) Performance measured using ipi-bench for x2AVIC:
+ Average Latency: 1172.42ns [Time to send IPI from one vCPU to another vCPU]
+
+ Cumulative throughput: 40.9432M/s [Total number of IPIs sent in a second from
+ 48 vCPUs simultaneously]
+
+From above, x2AVIC latency is ~4% more than AVIC. However, the expectation is
+x2AVIC performance to be better or equivalent to AVIC. Upon analyzing
+the perf captures, it is observed significant time is spent in
+weak_wrmsr_fence() invoked by x2apic_send_IPI().
+
+With the fix to skip weak_wrmsr_fence()
+
+*) Performance measured using ipi-bench for x2AVIC:
+ Average Latency: 1117.44ns [Time to send IPI from one vCPU to another vCPU]
+
+ Cumulative throughput: 42.9608M/s [Total number of IPIs sent in a second from
+ 48 vCPUs simultaneously]
+
+Comparing the performance of x2AVIC with and without the fix, it can be seen
+the performance improves by ~4%.
+
+Performance captured using an unmodified ipi-bench using the 'mesh-ipi' option
+with and without weak_wrmsr_fence() on a Zen4 system also showed significant
+performance improvement without weak_wrmsr_fence(). The 'mesh-ipi' option ignores
+CCX or CCD and just picks random vCPU.
+
+ Average throughput (10 iterations) with weak_wrmsr_fence(),
+ Cumulative throughput: 4933374 IPI/s
+
+ Average throughput (10 iterations) without weak_wrmsr_fence(),
+ Cumulative throughput: 6355156 IPI/s
+
+[1] https://github.com/bytedance/kvm-utils/tree/master/microbenchmark/ipi-bench
+
+Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de>
+Link: https://lore.kernel.org/r/20230622095212.20940-1-bp@alien8.de
+Signed-off-by: Kishon Vijay Abraham I <kvijayab@amd.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/x86/include/asm/barrier.h | 18 ------------------
+ arch/x86/include/asm/cpufeatures.h | 2 +-
+ arch/x86/include/asm/processor.h | 18 ++++++++++++++++++
+ arch/x86/kernel/cpu/amd.c | 3 +++
+ arch/x86/kernel/cpu/common.c | 7 +++++++
+ arch/x86/kernel/cpu/hygon.c | 3 +++
+ 6 files changed, 32 insertions(+), 19 deletions(-)
+
+--- a/arch/x86/include/asm/barrier.h
++++ b/arch/x86/include/asm/barrier.h
+@@ -81,22 +81,4 @@ do { \
+
+ #include <asm-generic/barrier.h>
+
+-/*
+- * Make previous memory operations globally visible before
+- * a WRMSR.
+- *
+- * MFENCE makes writes visible, but only affects load/store
+- * instructions. WRMSR is unfortunately not a load/store
+- * instruction and is unaffected by MFENCE. The LFENCE ensures
+- * that the WRMSR is not reordered.
+- *
+- * Most WRMSRs are full serializing instructions themselves and
+- * do not require this barrier. This is only required for the
+- * IA32_TSC_DEADLINE and X2APIC MSRs.
+- */
+-static inline void weak_wrmsr_fence(void)
+-{
+- asm volatile("mfence; lfence" : : : "memory");
+-}
+-
+ #endif /* _ASM_X86_BARRIER_H */
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -308,10 +308,10 @@
+ #define X86_FEATURE_SMBA (11*32+21) /* "" Slow Memory Bandwidth Allocation */
+ #define X86_FEATURE_BMEC (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
+ #define X86_FEATURE_USER_SHSTK (11*32+23) /* Shadow stack support for user mode applications */
+-
+ #define X86_FEATURE_SRSO (11*32+24) /* "" AMD BTB untrain RETs */
+ #define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */
+ #define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */
++#define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */
+
+ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -734,4 +734,22 @@ bool arch_is_platform_page(u64 paddr);
+
+ extern bool gds_ucode_mitigated(void);
+
++/*
++ * Make previous memory operations globally visible before
++ * a WRMSR.
++ *
++ * MFENCE makes writes visible, but only affects load/store
++ * instructions. WRMSR is unfortunately not a load/store
++ * instruction and is unaffected by MFENCE. The LFENCE ensures
++ * that the WRMSR is not reordered.
++ *
++ * Most WRMSRs are full serializing instructions themselves and
++ * do not require this barrier. This is only required for the
++ * IA32_TSC_DEADLINE and X2APIC MSRs.
++ */
++static inline void weak_wrmsr_fence(void)
++{
++ alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE));
++}
++
+ #endif /* _ASM_X86_PROCESSOR_H */
+--- a/arch/x86/kernel/cpu/amd.c
++++ b/arch/x86/kernel/cpu/amd.c
+@@ -1157,6 +1157,9 @@ static void init_amd(struct cpuinfo_x86
+ if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
+ cpu_has_amd_erratum(c, amd_erratum_1485))
+ msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
++
++ /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
++ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+ }
+
+ #ifdef CONFIG_X86_32
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -1858,6 +1858,13 @@ static void identify_cpu(struct cpuinfo_
+ c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+ #endif
+
++
++ /*
++ * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and
++ * Hygon will clear it in ->c_init() below.
++ */
++ set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
++
+ /*
+ * Vendor-specific initialization. In this section we
+ * canonicalize the feature flags, meaning if there are
+--- a/arch/x86/kernel/cpu/hygon.c
++++ b/arch/x86/kernel/cpu/hygon.c
+@@ -348,6 +348,9 @@ static void init_hygon(struct cpuinfo_x8
+ set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+
+ check_null_seg_clears_base(c);
++
++ /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
++ clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+ }
+
+ static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)