From foo@baz Tue 14 May 2019 08:29:35 PM CEST
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 25 Nov 2018 19:33:49 +0100
Subject: x86/speculation: Prepare for conditional IBPB in switch_mm()

From: Thomas Gleixner <tglx@linutronix.de>

commit 4c71a2b6fd7e42814aa68a6dec88abf3b42ea573 upstream.

The IBPB speculation barrier is issued from switch_mm() when the kernel
switches to a user space task with a different mm than the user space task
which ran last on the same CPU.

An additional optimization is to avoid IBPB when the incoming task can be
ptraced by the outgoing task. This optimization only works when switching
directly between two user space tasks. When switching from a kernel task to
a user space task the optimization fails because the previous task can no
longer be accessed. So in quite a few scenarios the optimization just adds
overhead.

The upcoming conditional IBPB support will issue IBPB only for user space
tasks which have the TIF_SPEC_IB bit set. This requires handling the
following cases:

 1) Switch from a user space task (potential attacker) which has
    TIF_SPEC_IB set to a user space task (potential victim) which has
    TIF_SPEC_IB not set.

 2) Switch from a user space task (potential attacker) which has
    TIF_SPEC_IB not set to a user space task (potential victim) which has
    TIF_SPEC_IB set.

This needs to be optimized for the case where the IBPB can be avoided when
only kernel threads ran in between user space tasks which belong to the
same process.
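
[Editorial note, not part of the upstream changelog: the decision the two
cases above boil down to can be sketched as the small C predicate below.
The helper name needs_ibpb() and its standalone form are illustrative
assumptions, not code from this patch; the actual implementation is the
cond_ibpb()/mm_mangle_tif_spec_ib() code in the diff further down, which
encodes the TIF_SPEC_IB state as bit 0 of the stored mm pointer
(LAST_USER_MM_IBPB).

        /* Illustrative sketch only: bit 0 of each value carries TIF_SPEC_IB. */
        static inline int needs_ibpb(unsigned long prev_mm, unsigned long next_mm)
        {
                /* Different process and at least one side asked for IBPB. */
                return prev_mm != next_mm && ((prev_mm | next_mm) & 0x1UL);
        }

IBPB is skipped both when the same mangled value is seen again (same mm,
same TIF_SPEC_IB state) and when neither side has the bit set.]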

The current check whether two tasks belong to the same context is using the
task's context id. While correct, it's simpler to use the mm pointer because
it allows the TIF_SPEC_IB bit to be mangled into it. The context id based
mechanism requires extra storage, which creates worse code.

When a task is scheduled out its TIF_SPEC_IB bit is mangled as bit 0 into
the per CPU storage which is used to track the last user space mm which was
running on a CPU. This bit can be used together with the TIF_SPEC_IB bit of
the incoming task to make the decision whether IBPB needs to be issued or
not to cover the two cases above.
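
[Editorial note, not part of the upstream changelog: assuming the bit 0
encoding described above, the scheme behaves as follows:

 - Task A is scheduled out, a kernel thread runs, then task A (or another
   thread of the same process with the same TIF_SPEC_IB state) is scheduled
   in: the stored value and the incoming value are equal, so no IBPB is
   issued.

 - Task A is followed by task B of a different process and at least one of
   them has TIF_SPEC_IB set: the values differ and bit 0 is set in the OR
   of both, so IBPB is issued.

 - Task A is followed by task B of a different process and neither has
   TIF_SPEC_IB set: the values differ but bit 0 is clear in both, so
   conditional mode skips the IBPB.]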

As conditional IBPB is going to be the default, remove the dubious ptrace
check for the IBPB always case and simply issue IBPB always when the
process changes.

Move the storage to a different place in the struct as the original one
created a hole.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Casey Schaufler <casey.schaufler@intel.com>
Cc: Asit Mallick <asit.k.mallick@intel.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Jon Masters <jcm@redhat.com>
Cc: Waiman Long <longman9394@gmail.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Dave Stewart <david.c.stewart@intel.com>
Cc: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20181125185005.466447057@linutronix.de
[bwh: Backported to 4.4:
 - Drop changes in initialize_tlbstate_and_flush()
 - Adjust context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/nospec-branch.h |    2
 arch/x86/include/asm/tlbflush.h      |    8 +-
 arch/x86/kernel/cpu/bugs.c           |   29 +++++++-
 arch/x86/mm/tlb.c                    |  113 ++++++++++++++++++++++++++---------
 4 files changed, 117 insertions(+), 35 deletions(-)

--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -257,6 +257,8 @@ do { \
 } while (0)
 
 DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
 
 #endif /* __ASSEMBLY__ */
 
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -68,8 +68,12 @@ static inline void invpcid_flush_all_non
 struct tlb_state {
         struct mm_struct *active_mm;
         int state;
-        /* last user mm's ctx id */
-        u64 last_ctx_id;
+
+        /* Last user mm for optimizing IBPB */
+        union {
+                struct mm_struct *last_user_mm;
+                unsigned long last_user_mm_ibpb;
+        };
 
         /*
          * Access to this CR4 shadow and to H/W CR4 is protected by
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -53,6 +53,10 @@ u64 x86_amd_ls_cfg_ssbd_mask;
 
 /* Control conditional STIPB in switch_to() */
 DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+/* Control conditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+/* Control unconditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
 
 void __init check_bugs(void)
 {
@@ -319,7 +323,17 @@ spectre_v2_user_select_mitigation(enum s
         /* Initialize Indirect Branch Prediction Barrier */
         if (boot_cpu_has(X86_FEATURE_IBPB)) {
                 setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
-                pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n");
+
+                switch (mode) {
+                case SPECTRE_V2_USER_STRICT:
+                        static_branch_enable(&switch_mm_always_ibpb);
+                        break;
+                default:
+                        break;
+                }
+
+                pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
+                        mode == SPECTRE_V2_USER_STRICT ? "always-on" : "conditional");
         }
 
         /* If enhanced IBRS is enabled no STIPB required */
@@ -867,10 +881,15 @@ static char *stibp_state(void)
 
 static char *ibpb_state(void)
 {
-        if (boot_cpu_has(X86_FEATURE_USE_IBPB))
-                return ", IBPB";
-        else
-                return "";
+        if (boot_cpu_has(X86_FEATURE_IBPB)) {
+                switch (spectre_v2_user) {
+                case SPECTRE_V2_USER_NONE:
+                        return ", IBPB: disabled";
+                case SPECTRE_V2_USER_STRICT:
+                        return ", IBPB: always-on";
+                }
+        }
+        return "";
 }
 
 static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,7 +7,6 @@
 #include <linux/module.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
-#include <linux/ptrace.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -31,6 +30,12 @@
 * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
 */
 
+/*
+ * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * stored in cpu_tlb_state.last_user_mm_ibpb.
+ */
+#define LAST_USER_MM_IBPB 0x1UL
+
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
 struct flush_tlb_info {
@@ -102,17 +107,87 @@ void switch_mm(struct mm_struct *prev, s
         local_irq_restore(flags);
 }
 
-static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
+static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
 {
+        unsigned long next_tif = task_thread_info(next)->flags;
+        unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+
+        return (unsigned long)next->mm | ibpb;
+}
+
+static void cond_ibpb(struct task_struct *next)
+{
+        if (!next || !next->mm)
+                return;
+
         /*
-         * Check if the current (previous) task has access to the memory
-         * of the @tsk (next) task. If access is denied, make sure to
-         * issue a IBPB to stop user->user Spectre-v2 attacks.
-         *
-         * Note: __ptrace_may_access() returns 0 or -ERRNO.
+         * Both, the conditional and the always IBPB mode use the mm
+         * pointer to avoid the IBPB when switching between tasks of the
+         * same process. Using the mm pointer instead of mm->context.ctx_id
+         * opens a hypothetical hole vs. mm_struct reuse, which is more or
+         * less impossible to control by an attacker. Aside of that it
+         * would only affect the first schedule so the theoretically
+         * exposed data is not really interesting.
          */
-        return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
-                ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
+        if (static_branch_likely(&switch_mm_cond_ibpb)) {
+                unsigned long prev_mm, next_mm;
+
+                /*
+                 * This is a bit more complex than the always mode because
+                 * it has to handle two cases:
+                 *
+                 * 1) Switch from a user space task (potential attacker)
+                 *    which has TIF_SPEC_IB set to a user space task
+                 *    (potential victim) which has TIF_SPEC_IB not set.
+                 *
+                 * 2) Switch from a user space task (potential attacker)
+                 *    which has TIF_SPEC_IB not set to a user space task
+                 *    (potential victim) which has TIF_SPEC_IB set.
+                 *
+                 * This could be done by unconditionally issuing IBPB when
+                 * a task which has TIF_SPEC_IB set is either scheduled in
+                 * or out. Though that results in two flushes when:
+                 *
+                 * - the same user space task is scheduled out and later
+                 *   scheduled in again and only a kernel thread ran in
+                 *   between.
+                 *
+                 * - a user space task belonging to the same process is
+                 *   scheduled in after a kernel thread ran in between
+                 *
+                 * - a user space task belonging to the same process is
+                 *   scheduled in immediately.
+                 *
+                 * Optimize this with reasonably small overhead for the
+                 * above cases. Mangle the TIF_SPEC_IB bit into the mm
+                 * pointer of the incoming task which is stored in
+                 * cpu_tlbstate.last_user_mm_ibpb for comparison.
+                 */
+                next_mm = mm_mangle_tif_spec_ib(next);
+                prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
+
+                /*
+                 * Issue IBPB only if the mm's are different and one or
+                 * both have the IBPB bit set.
+                 */
+                if (next_mm != prev_mm &&
+                    (next_mm | prev_mm) & LAST_USER_MM_IBPB)
+                        indirect_branch_prediction_barrier();
+
+                this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
+        }
+
+        if (static_branch_unlikely(&switch_mm_always_ibpb)) {
+                /*
+                 * Only flush when switching to a user space task with a
+                 * different context than the user space task which ran
+                 * last on this CPU.
+                 */
+                if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+                        indirect_branch_prediction_barrier();
+                        this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
+                }
+        }
 }
 
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
@@ -121,30 +196,12 @@ void switch_mm_irqs_off(struct mm_struct
         unsigned cpu = smp_processor_id();
 
         if (likely(prev != next)) {
-                u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
-
                 /*
                  * Avoid user/user BTB poisoning by flushing the branch
                  * predictor when switching between processes. This stops
                  * one process from doing Spectre-v2 attacks on another.
-                 *
-                 * As an optimization, flush indirect branches only when
-                 * switching into a processes that can't be ptrace by the
-                 * current one (as in such case, attacker has much more
-                 * convenient way how to tamper with the next process than
-                 * branch buffer poisoning).
-                 */
-                if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
-                    ibpb_needed(tsk, last_ctx_id))
-                        indirect_branch_prediction_barrier();
-
-                /*
-                 * Record last user mm's context id, so we can avoid
-                 * flushing branch buffer with IBPB if we switch back
-                 * to the same user.
                  */
-                if (next != &init_mm)
-                        this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
+                cond_ibpb(tsk);
 
                 this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
                 this_cpu_write(cpu_tlbstate.active_mm, next);