From: Greg Kroah-Hartman
Date: Wed, 21 Feb 2024 10:45:33 +0000 (+0100)
Subject: 6.6-stable patches
X-Git-Tag: v4.19.307~23
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=064bb0e6b369a309639c2884d8711fbe14aa13ca;p=thirdparty%2Fkernel%2Fstable-queue.git

6.6-stable patches

added patches:
	x86-barrier-do-not-serialize-msr-accesses-on-amd.patch
---
diff --git a/queue-6.6/series b/queue-6.6/series
index a2fe973ddaf..b4b1b895860 100644
--- a/queue-6.6/series
+++ b/queue-6.6/series
@@ -332,3 +332,4 @@ sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch
 of-property-add-in-ports-out-ports-support-to-of_graph_get_port_parent.patch
 nilfs2-fix-potential-bug-in-end_buffer_async_write.patch
 dm-limit-the-number-of-targets-and-parameter-size-area.patch
+x86-barrier-do-not-serialize-msr-accesses-on-amd.patch
diff --git a/queue-6.6/x86-barrier-do-not-serialize-msr-accesses-on-amd.patch b/queue-6.6/x86-barrier-do-not-serialize-msr-accesses-on-amd.patch
new file mode 100644
index 00000000000..31857dd1ec6
--- /dev/null
+++ b/queue-6.6/x86-barrier-do-not-serialize-msr-accesses-on-amd.patch
@@ -0,0 +1,199 @@
+From 04c3024560d3a14acd18d0a51a1d0a89d29b7eb5 Mon Sep 17 00:00:00 2001
+From: "Borislav Petkov (AMD)"
+Date: Fri, 27 Oct 2023 14:24:16 +0200
+Subject: x86/barrier: Do not serialize MSR accesses on AMD
+
+From: Borislav Petkov (AMD)
+
+commit 04c3024560d3a14acd18d0a51a1d0a89d29b7eb5 upstream.
+
+AMD does not have the requirement for a synchronization barrier when
+accessing a certain group of MSRs. Do not incur that unnecessary
+penalty there.
+
+There will be a CPUID bit which explicitly states that an MFENCE is not
+needed. Once that bit is added to the APM, this will be extended with
+it.
+
+While at it, move the function to processor.h to avoid include hell.
+Untangling that file properly is a matter for another day.
+
+Some notes on the performance aspect of why this is relevant, courtesy
+of Kishon VijayAbraham:
+
+On an AMD Zen4 system with 96 cores, a modified ipi-bench[1] on a VM
+shows that the x2AVIC IPI rate is 3% to 4% lower than the AVIC IPI
+rate. The ipi-bench is modified so that the IPIs are sent between two
+vCPUs in the same CCX. This also requires pinning the vCPU to a
+physical core to prevent any latencies. This simulates the use case of
+pinning vCPUs to the threads of a single CCX to avoid interrupt IPI
+latency.
+
+In order to avoid run-to-run variance (for both x2AVIC and AVIC), the
+following configuration is used:
+
+  1) Disable Power States in BIOS (to prevent the system from going to
+     a lower power state)
+
+  2) Run the system at a fixed frequency of 2500 MHz (to prevent the
+     system from increasing the frequency under load)
+
+With the above configuration:
+
+*) Performance measured using ipi-bench for AVIC:
+   Average Latency: 1124.98ns [Time to send IPI from one vCPU to another vCPU]
+
+   Cumulative throughput: 42.6759M/s [Total number of IPIs sent in a second from
+   48 vCPUs simultaneously]
+
+*) Performance measured using ipi-bench for x2AVIC:
+   Average Latency: 1172.42ns [Time to send IPI from one vCPU to another vCPU]
+
+   Cumulative throughput: 40.9432M/s [Total number of IPIs sent in a second from
+   48 vCPUs simultaneously]
+
+From the above, x2AVIC latency is ~4% higher than AVIC's. However,
+x2AVIC performance is expected to be better than or equivalent to
+AVIC's. Upon analyzing the perf captures, it is observed that
+significant time is spent in weak_wrmsr_fence() invoked by
+x2apic_send_IPI().
+
+With the fix to skip weak_wrmsr_fence():
+
+*) Performance measured using ipi-bench for x2AVIC:
+   Average Latency: 1117.44ns [Time to send IPI from one vCPU to another vCPU]
+
+   Cumulative throughput: 42.9608M/s [Total number of IPIs sent in a second from
+   48 vCPUs simultaneously]
+
+Comparing the performance of x2AVIC with and without the fix, it can be
+seen that the performance improves by ~4%.
+
+Performance captured using an unmodified ipi-bench with the 'mesh-ipi'
+option, with and without weak_wrmsr_fence(), on a Zen4 system also
+showed a significant performance improvement without
+weak_wrmsr_fence(). The 'mesh-ipi' option ignores CCX or CCD and just
+picks random vCPUs.
+
+  Average throughput (10 iterations) with weak_wrmsr_fence(),
+  Cumulative throughput: 4933374 IPI/s
+
+  Average throughput (10 iterations) without weak_wrmsr_fence(),
+  Cumulative throughput: 6355156 IPI/s
+
+[1] https://github.com/bytedance/kvm-utils/tree/master/microbenchmark/ipi-bench
+
+Signed-off-by: Borislav Petkov (AMD)
+Link: https://lore.kernel.org/r/20230622095212.20940-1-bp@alien8.de
+Signed-off-by: Kishon Vijay Abraham I
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/x86/include/asm/barrier.h     |   18 ------------------
+ arch/x86/include/asm/cpufeatures.h |    2 +-
+ arch/x86/include/asm/processor.h   |   18 ++++++++++++++++++
+ arch/x86/kernel/cpu/amd.c          |    3 +++
+ arch/x86/kernel/cpu/common.c       |    7 +++++++
+ arch/x86/kernel/cpu/hygon.c        |    3 +++
+ 6 files changed, 32 insertions(+), 19 deletions(-)
+
+--- a/arch/x86/include/asm/barrier.h
++++ b/arch/x86/include/asm/barrier.h
+@@ -81,22 +81,4 @@ do { \
+ 
+ #include <asm-generic/barrier.h>
+ 
+-/*
+- * Make previous memory operations globally visible before
+- * a WRMSR.
+- *
+- * MFENCE makes writes visible, but only affects load/store
+- * instructions. WRMSR is unfortunately not a load/store
+- * instruction and is unaffected by MFENCE. The LFENCE ensures
+- * that the WRMSR is not reordered.
+- *
+- * Most WRMSRs are full serializing instructions themselves and
+- * do not require this barrier. This is only required for the
+- * IA32_TSC_DEADLINE and X2APIC MSRs.
+- */
+-static inline void weak_wrmsr_fence(void)
+-{
+-        asm volatile("mfence; lfence" : : : "memory");
+-}
+-
+ #endif /* _ASM_X86_BARRIER_H */
+--- a/arch/x86/include/asm/cpufeatures.h
++++ b/arch/x86/include/asm/cpufeatures.h
+@@ -308,10 +308,10 @@
+ #define X86_FEATURE_SMBA                (11*32+21) /* "" Slow Memory Bandwidth Allocation */
+ #define X86_FEATURE_BMEC                (11*32+22) /* "" Bandwidth Monitoring Event Configuration */
+ #define X86_FEATURE_USER_SHSTK          (11*32+23) /* Shadow stack support for user mode applications */
+-
+ #define X86_FEATURE_SRSO                (11*32+24) /* "" AMD BTB untrain RETs */
+ #define X86_FEATURE_SRSO_ALIAS          (11*32+25) /* "" AMD BTB untrain RETs through aliasing */
+ #define X86_FEATURE_IBPB_ON_VMEXIT      (11*32+26) /* "" Issue an IBPB only on VMEXIT */
++#define X86_FEATURE_APIC_MSRS_FENCE     (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */
+ 
+ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */
+ #define X86_FEATURE_AVX_VNNI            (12*32+ 4) /* AVX VNNI instructions */
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -734,4 +734,22 @@ bool arch_is_platform_page(u64 paddr);
+ 
+ extern bool gds_ucode_mitigated(void);
+ 
++/*
++ * Make previous memory operations globally visible before
++ * a WRMSR.
++ *
++ * MFENCE makes writes visible, but only affects load/store
++ * instructions. WRMSR is unfortunately not a load/store
++ * instruction and is unaffected by MFENCE. The LFENCE ensures
++ * that the WRMSR is not reordered.
++ *
++ * Most WRMSRs are full serializing instructions themselves and
++ * do not require this barrier. This is only required for the
++ * IA32_TSC_DEADLINE and X2APIC MSRs.
++ */
++static inline void weak_wrmsr_fence(void)
++{
++        alternative("mfence; lfence", "", ALT_NOT(X86_FEATURE_APIC_MSRS_FENCE));
++}
++
+ #endif /* _ASM_X86_PROCESSOR_H */
+--- a/arch/x86/kernel/cpu/amd.c
++++ b/arch/x86/kernel/cpu/amd.c
+@@ -1157,6 +1157,9 @@ static void init_amd(struct cpuinfo_x86
+         if (!cpu_has(c, X86_FEATURE_HYPERVISOR) &&
+             cpu_has_amd_erratum(c, amd_erratum_1485))
+                 msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT);
++
++        /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
++        clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+ }
+ 
+ #ifdef CONFIG_X86_32
+--- a/arch/x86/kernel/cpu/common.c
++++ b/arch/x86/kernel/cpu/common.c
+@@ -1858,6 +1858,13 @@ static void identify_cpu(struct cpuinfo_
+         c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
+ #endif
+ 
++
++        /*
++         * Set default APIC and TSC_DEADLINE MSR fencing flag. AMD and
++         * Hygon will clear it in ->c_init() below.
++         */
++        set_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
++
+         /*
+          * Vendor-specific initialization. In this section we
+          * canonicalize the feature flags, meaning if there are
+--- a/arch/x86/kernel/cpu/hygon.c
++++ b/arch/x86/kernel/cpu/hygon.c
+@@ -348,6 +348,9 @@ static void init_hygon(struct cpuinfo_x8
+         set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS);
+ 
+         check_null_seg_clears_base(c);
++
++        /* Hygon CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */
++        clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE);
+ }
+ 
+ static void cpu_detect_tlb_hygon(struct cpuinfo_x86 *c)
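
Editor's note, not part of the patch above: for readers who want to see
the shape of the change outside the kernel, here is a minimal
user-space sketch of the same idea. The kernel makes the fence a no-op
via the alternative() self-patching mechanism keyed on
X86_FEATURE_APIC_MSRS_FENCE; plain C cannot patch itself at runtime, so
a CPUID vendor check and a runtime flag stand in for the feature bit.
The detect_vendor() helper and the apic_msrs_need_fence variable are
invented for this illustration only. Builds with GCC or Clang on x86.

/*
 * Illustrative only: default to fencing, then let vendors known not to
 * need it (AMD, Hygon) opt out -- the same default-safe/opt-out shape
 * as the patch, with a runtime flag instead of alternative().
 */
#include <stdbool.h>
#include <stdio.h>
#include <string.h>
#include <cpuid.h>      /* GCC/Clang CPUID intrinsics, x86 only */

static bool apic_msrs_need_fence = true;  /* stands in for the feature bit */

static void detect_vendor(void)
{
        unsigned int eax, ebx, ecx, edx;
        char vendor[13];

        if (!__get_cpuid(0, &eax, &ebx, &ecx, &edx))
                return;                 /* keep the safe default */

        /* CPUID leaf 0 returns the vendor string in EBX, EDX, ECX. */
        memcpy(vendor + 0, &ebx, 4);
        memcpy(vendor + 4, &edx, 4);
        memcpy(vendor + 8, &ecx, 4);
        vendor[12] = '\0';

        /* AMD and Hygon do not need the fence; everyone else keeps it. */
        if (!strcmp(vendor, "AuthenticAMD") ||
            !strcmp(vendor, "HygonGenuine"))
                apic_msrs_need_fence = false;
}

/* User-space analogue of weak_wrmsr_fence(). */
static inline void weak_wrmsr_fence(void)
{
        if (apic_msrs_need_fence)
                __asm__ __volatile__("mfence; lfence" : : : "memory");
}

int main(void)
{
        detect_vendor();
        weak_wrmsr_fence();     /* no-op on AMD/Hygon, MFENCE;LFENCE elsewhere */
        printf("fence %s\n", apic_msrs_need_fence ? "executed" : "skipped");
        return 0;
}

The design point the sketch mirrors: the patch sets the flag by default
in identify_cpu() and clears it only in the AMD and Hygon vendor init
paths, so an unknown vendor keeps the safe fencing behavior, while
alternative() removes even the branch overhead by patching the
MFENCE;LFENCE sequence out of weak_wrmsr_fence() at boot on CPUs that
do not need it.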