From 914123fa39042e651d79eaf86bbf63a1b938dddf Mon Sep 17 00:00:00 2001
From: "Lendacky, Thomas" <Thomas.Lendacky@amd.com>
Date: Tue, 2 Apr 2019 15:21:14 +0000
Subject: x86/perf/amd: Resolve race condition when disabling PMC

From: Lendacky, Thomas <Thomas.Lendacky@amd.com>

commit 914123fa39042e651d79eaf86bbf63a1b938dddf upstream.

On AMD processors, the detection of an overflowed counter in the NMI
handler relies on the current value of the counter. So, for example, to
check for overflow on a 48 bit counter, bit 47 is checked to see if it
is 1 (not overflowed) or 0 (overflowed).
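
As an illustration (not part of this patch), the check reduces to testing
the top bit of the raw counter value, with the width coming from
x86_pmu.cntval_bits; a minimal user-space sketch, assuming a 48-bit
counter:

    #include <stdint.h>
    #include <stdio.h>

    /* Illustrative only: top bit clear means the counter has wrapped. */
    static int counter_overflowed(uint64_t counter, unsigned int cntval_bits)
    {
            return !(counter & (1ULL << (cntval_bits - 1)));
    }

    int main(void)
    {
            printf("%d\n", counter_overflowed(1ULL << 47, 48)); /* 0: not overflowed */
            printf("%d\n", counter_overflowed(0x10, 48));       /* 1: overflowed */
            return 0;
    }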

There is currently a race condition present when disabling and then
updating the PMC. Increased NMI latency in newer AMD processors makes this
race condition more pronounced. If the counter value has overflowed, it is
possible to update the PMC value before the NMI handler can run. The
updated PMC value is not an overflowed value, so when the perf NMI handler
does run, it will not find an overflowed counter. This may appear as an
unknown NMI resulting in either a panic or a series of messages, depending
on how the kernel is configured.

To eliminate this race condition, the PMC value must be checked after
disabling the counter. Add an AMD function, amd_pmu_disable_all(), that
will wait for the NMI handler to reset any active and overflowed counter
after calling x86_pmu_disable_all().
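
In outline, the wait is a bounded poll of the counter until its top bit is
set again (i.e. the NMI handler has reloaded it); the real implementation
is amd_pmu_wait_on_overflow() in the diff below. A user-space sketch, with
a hypothetical read_pmc() standing in for rdmsrl() on the counter MSR:

    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>

    #define OVERFLOW_WAIT_COUNT 50

    /* Hypothetical stand-in for rdmsrl(x86_pmu_event_addr(idx), counter). */
    static uint64_t read_pmc(int idx)
    {
            (void)idx;
            return 1ULL << 47;      /* pretend the NMI handler already reset it */
    }

    static void wait_on_overflow(int idx, unsigned int cntval_bits)
    {
            unsigned int i;

            for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) {
                    if (read_pmc(idx) & (1ULL << (cntval_bits - 1)))
                            break;          /* counter reset, no longer overflowed */
                    usleep(1);              /* the kernel code uses udelay(1) */
            }
    }

    int main(void)
    {
            wait_on_overflow(0, 48);
            puts("counter reset observed (or wait bounded out)");
            return 0;
    }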

Signed-off-by: Tom Lendacky <thomas.lendacky@amd.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: <stable@vger.kernel.org> # 4.14.x-
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Link: https://lkml.kernel.org/r/Message-ID:
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>

---
 arch/x86/events/amd/core.c | 65 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 62 insertions(+), 3 deletions(-)

--- a/arch/x86/events/amd/core.c
+++ b/arch/x86/events/amd/core.c
@@ -3,6 +3,7 @@
 #include <linux/types.h>
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <linux/delay.h>
 #include <asm/apicdef.h>
 
 #include "../perf_event.h"
@@ -429,6 +430,64 @@ static void amd_pmu_cpu_dead(int cpu)
         }
 }
 
+/*
+ * When a PMC counter overflows, an NMI is used to process the event and
+ * reset the counter. NMI latency can result in the counter being updated
+ * before the NMI can run, which can result in what appear to be spurious
+ * NMIs. This function is intended to wait for the NMI to run and reset
+ * the counter to avoid possible unhandled NMI messages.
+ */
+#define OVERFLOW_WAIT_COUNT 50
+
+static void amd_pmu_wait_on_overflow(int idx)
+{
+        unsigned int i;
+        u64 counter;
+
+        /*
+         * Wait for the counter to be reset if it has overflowed. This loop
+         * should exit very, very quickly, but just in case, don't wait
+         * forever...
+         */
+        for (i = 0; i < OVERFLOW_WAIT_COUNT; i++) {
+                rdmsrl(x86_pmu_event_addr(idx), counter);
+                if (counter & (1ULL << (x86_pmu.cntval_bits - 1)))
+                        break;
+
+                /* Might be in IRQ context, so can't sleep */
+                udelay(1);
+        }
+}
+
+static void amd_pmu_disable_all(void)
+{
+        struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events);
+        int idx;
+
+        x86_pmu_disable_all();
+
+        /*
+         * This shouldn't be called from NMI context, but add a safeguard here
+         * to return, since if we're in NMI context we can't wait for an NMI
+         * to reset an overflowed counter value.
+         */
+        if (in_nmi())
+                return;
+
+        /*
+         * Check each counter for overflow and wait for it to be reset by the
+         * NMI if it has overflowed. This relies on the fact that all active
+         * counters are always enabled when this function is called and
+         * ARCH_PERFMON_EVENTSEL_INT is always set.
+         */
+        for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+                if (!test_bit(idx, cpuc->active_mask))
+                        continue;
+
+                amd_pmu_wait_on_overflow(idx);
+        }
+}
+
 static struct event_constraint *
 amd_get_event_constraints(struct cpu_hw_events *cpuc, int idx,
                           struct perf_event *event)
@@ -622,7 +681,7 @@ static ssize_t amd_event_sysfs_show(char
 static __initconst const struct x86_pmu amd_pmu = {
         .name = "AMD",
         .handle_irq = x86_pmu_handle_irq,
-        .disable_all = x86_pmu_disable_all,
+        .disable_all = amd_pmu_disable_all,
         .enable_all = x86_pmu_enable_all,
         .enable = x86_pmu_enable_event,
         .disable = x86_pmu_disable_event,
@@ -728,7 +787,7 @@ void amd_pmu_enable_virt(void)
         cpuc->perf_ctr_virt_mask = 0;
 
         /* Reload all events */
-        x86_pmu_disable_all();
+        amd_pmu_disable_all();
         x86_pmu_enable_all(0);
 }
 EXPORT_SYMBOL_GPL(amd_pmu_enable_virt);
@@ -746,7 +805,7 @@ void amd_pmu_disable_virt(void)
         cpuc->perf_ctr_virt_mask = AMD64_EVENTSEL_HOSTONLY;
 
         /* Reload all events */
-        x86_pmu_disable_all();
+        amd_pmu_disable_all();
         x86_pmu_enable_all(0);
 }
 EXPORT_SYMBOL_GPL(amd_pmu_disable_virt);