From 0f0bdeea5ed07fbc6fadd7dbf2af796887c85e46 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 25 Nov 2018 19:33:49 +0100
Subject: [PATCH 41/76] x86/speculation: Prepare for conditional IBPB in
 switch_mm()

commit 4c71a2b6fd7e42814aa68a6dec88abf3b42ea573 upstream.

The IBPB speculation barrier is issued from switch_mm() when the kernel
switches to a user space task with a different mm than the user space task
which ran last on the same CPU.

An additional optimization is to avoid IBPB when the incoming task can be
ptraced by the outgoing task. This optimization only works when switching
directly between two user space tasks. When switching from a kernel task
to a user space task, the optimization fails because the previous task can
no longer be accessed. So in quite a few scenarios the optimization just
adds overhead.

The upcoming conditional IBPB support will issue IBPB only for user space
tasks which have the TIF_SPEC_IB bit set. This requires handling the
following cases:

 1) Switch from a user space task (potential attacker) which has
    TIF_SPEC_IB set to a user space task (potential victim) which has
    TIF_SPEC_IB not set.

 2) Switch from a user space task (potential attacker) which has
    TIF_SPEC_IB not set to a user space task (potential victim) which has
    TIF_SPEC_IB set.

This needs to be optimized for the case where IBPB can be avoided, namely
when only kernel threads ran in between user space tasks which belong to
the same process.

The current check of whether two tasks belong to the same context uses the
task's context id. While correct, it's simpler to use the mm pointer
because it allows mangling the TIF_SPEC_IB bit into it. The context id
based mechanism requires extra storage, which creates worse code.

When a task is scheduled out, its TIF_SPEC_IB bit is mangled as bit 0 into
the per CPU storage which is used to track the last user space mm which was
running on a CPU. This bit can be used together with the TIF_SPEC_IB bit of
the incoming task to decide whether IBPB needs to be issued, covering the
two cases above.
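
A minimal stand-alone sketch of this decision logic (an illustration for
this changelog only, not part of the patch; mangle(), needs_ibpb(), the
SPEC_IB flag and LAST_MM_IBPB are made-up names here, and the mm is treated
as an opaque pointer value):

  #include <stdbool.h>
  #include <stdint.h>
  #include <stdio.h>

  #define LAST_MM_IBPB 0x1UL /* bit 0 of the stored mm value */

  /* Fold the task's speculation flag into bit 0 of its mm pointer value. */
  static uintptr_t mangle(uintptr_t mm, bool spec_ib)
  {
          return mm | (spec_ib ? LAST_MM_IBPB : 0);
  }

  /* IBPB is needed when the mm differs and either side has the bit set. */
  static bool needs_ibpb(uintptr_t prev_mangled, uintptr_t next_mangled)
  {
          return prev_mangled != next_mangled &&
                 ((prev_mangled | next_mangled) & LAST_MM_IBPB);
  }

  int main(void)
  {
          uintptr_t mm_a = 0x1000, mm_b = 0x2000;

          /* Same process, only a kernel thread ran in between: prints 0. */
          printf("%d\n", needs_ibpb(mangle(mm_a, true), mangle(mm_a, true)));
          /* Different process and one side has SPEC_IB set: prints 1. */
          printf("%d\n", needs_ibpb(mangle(mm_a, false), mangle(mm_b, true)));
          return 0;
  }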

As conditional IBPB is going to be the default, remove the dubious ptrace
check for the IBPB always case and simply issue IBPB always when the
process changes.

Move the storage to a different place in the struct as the original one
created a hole.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Andy Lutomirski <luto@kernel.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jiri Kosina <jkosina@suse.cz>
Cc: Tom Lendacky <thomas.lendacky@amd.com>
Cc: Josh Poimboeuf <jpoimboe@redhat.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: David Woodhouse <dwmw@amazon.co.uk>
Cc: Tim Chen <tim.c.chen@linux.intel.com>
Cc: Andi Kleen <ak@linux.intel.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Casey Schaufler <casey.schaufler@intel.com>
Cc: Asit Mallick <asit.k.mallick@intel.com>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Jon Masters <jcm@redhat.com>
Cc: Waiman Long <longman9394@gmail.com>
Cc: Greg KH <gregkh@linuxfoundation.org>
Cc: Dave Stewart <david.c.stewart@intel.com>
Cc: Kees Cook <keescook@chromium.org>
Link: https://lkml.kernel.org/r/20181125185005.466447057@linutronix.de
[bwh: Backported to 4.9:
 - Drop changes in initialize_tlbstate_and_flush()
 - Adjust context]
Signed-off-by: Ben Hutchings <ben@decadent.org.uk>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
---
 arch/x86/include/asm/nospec-branch.h |   2 +
 arch/x86/include/asm/tlbflush.h      |   8 +-
 arch/x86/kernel/cpu/bugs.c           |  29 +++++--
 arch/x86/mm/tlb.c                    | 113 ++++++++++++++++++++-------
 4 files changed, 117 insertions(+), 35 deletions(-)

diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h
index dab01da02de4..e655341bffe9 100644
--- a/arch/x86/include/asm/nospec-branch.h
+++ b/arch/x86/include/asm/nospec-branch.h
@@ -303,6 +303,8 @@ do { \
 } while (0)
 
 DECLARE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+DECLARE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+DECLARE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
 
 #endif /* __ASSEMBLY__ */
 
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 686a58d793e5..f5ca15622dc9 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -68,8 +68,12 @@ static inline void invpcid_flush_all_nonglobals(void)
 struct tlb_state {
         struct mm_struct *active_mm;
         int state;
-        /* last user mm's ctx id */
-        u64 last_ctx_id;
+
+        /* Last user mm for optimizing IBPB */
+        union {
+                struct mm_struct *last_user_mm;
+                unsigned long last_user_mm_ibpb;
+        };
 
         /*
          * Access to this CR4 shadow and to H/W CR4 is protected by
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c52d5596d943..8d92f87f218f 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -54,6 +54,10 @@ u64 __ro_after_init x86_amd_ls_cfg_ssbd_mask;
 
 /* Control conditional STIPB in switch_to() */
 DEFINE_STATIC_KEY_FALSE(switch_to_cond_stibp);
+/* Control conditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_cond_ibpb);
+/* Control unconditional IBPB in switch_mm() */
+DEFINE_STATIC_KEY_FALSE(switch_mm_always_ibpb);
 
 void __init check_bugs(void)
 {
@@ -329,7 +333,17 @@ spectre_v2_user_select_mitigation(enum spectre_v2_mitigation_cmd v2_cmd)
         /* Initialize Indirect Branch Prediction Barrier */
         if (boot_cpu_has(X86_FEATURE_IBPB)) {
                 setup_force_cpu_cap(X86_FEATURE_USE_IBPB);
-                pr_info("Spectre v2 mitigation: Enabling Indirect Branch Prediction Barrier\n");
+
+                switch (mode) {
+                case SPECTRE_V2_USER_STRICT:
+                        static_branch_enable(&switch_mm_always_ibpb);
+                        break;
+                default:
+                        break;
+                }
+
+                pr_info("mitigation: Enabling %s Indirect Branch Prediction Barrier\n",
+                        mode == SPECTRE_V2_USER_STRICT ? "always-on" : "conditional");
         }
 
         /* If enhanced IBRS is enabled no STIPB required */
@@ -961,10 +975,15 @@ static char *stibp_state(void)
 
 static char *ibpb_state(void)
 {
-        if (boot_cpu_has(X86_FEATURE_USE_IBPB))
-                return ", IBPB";
-        else
-                return "";
+        if (boot_cpu_has(X86_FEATURE_IBPB)) {
+                switch (spectre_v2_user) {
+                case SPECTRE_V2_USER_NONE:
+                        return ", IBPB: disabled";
+                case SPECTRE_V2_USER_STRICT:
+                        return ", IBPB: always-on";
+                }
+        }
+        return "";
 }
 
 static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr,
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ff8f8e529317..a112bb175dd4 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -7,7 +7,6 @@
 #include <linux/export.h>
 #include <linux/cpu.h>
 #include <linux/debugfs.h>
-#include <linux/ptrace.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -31,6 +30,12 @@
  * Implement flush IPI by CALL_FUNCTION_VECTOR, Alex Shi
  */
 
+/*
+ * Use bit 0 to mangle the TIF_SPEC_IB state into the mm pointer which is
+ * stored in cpu_tlb_state.last_user_mm_ibpb.
+ */
+#define LAST_USER_MM_IBPB 0x1UL
+
 atomic64_t last_mm_ctx_id = ATOMIC64_INIT(1);
 
 struct flush_tlb_info {
@@ -102,17 +107,87 @@ void switch_mm(struct mm_struct *prev, struct mm_struct *next,
         local_irq_restore(flags);
 }
 
-static bool ibpb_needed(struct task_struct *tsk, u64 last_ctx_id)
+static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+{
+        unsigned long next_tif = task_thread_info(next)->flags;
+        unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
+
+        return (unsigned long)next->mm | ibpb;
+}
+
+static void cond_ibpb(struct task_struct *next)
 {
+        if (!next || !next->mm)
+                return;
+
         /*
-         * Check if the current (previous) task has access to the memory
-         * of the @tsk (next) task. If access is denied, make sure to
-         * issue a IBPB to stop user->user Spectre-v2 attacks.
-         *
-         * Note: __ptrace_may_access() returns 0 or -ERRNO.
+         * Both, the conditional and the always IBPB mode use the mm
+         * pointer to avoid the IBPB when switching between tasks of the
+         * same process. Using the mm pointer instead of mm->context.ctx_id
+         * opens a hypothetical hole vs. mm_struct reuse, which is more or
+         * less impossible to control by an attacker. Aside of that it
+         * would only affect the first schedule so the theoretically
+         * exposed data is not really interesting.
          */
-        return (tsk && tsk->mm && tsk->mm->context.ctx_id != last_ctx_id &&
-                ptrace_may_access_sched(tsk, PTRACE_MODE_SPEC_IBPB));
+        if (static_branch_likely(&switch_mm_cond_ibpb)) {
+                unsigned long prev_mm, next_mm;
+
+                /*
+                 * This is a bit more complex than the always mode because
+                 * it has to handle two cases:
+                 *
+                 * 1) Switch from a user space task (potential attacker)
+                 *    which has TIF_SPEC_IB set to a user space task
+                 *    (potential victim) which has TIF_SPEC_IB not set.
+                 *
+                 * 2) Switch from a user space task (potential attacker)
+                 *    which has TIF_SPEC_IB not set to a user space task
+                 *    (potential victim) which has TIF_SPEC_IB set.
+                 *
+                 * This could be done by unconditionally issuing IBPB when
+                 * a task which has TIF_SPEC_IB set is either scheduled in
+                 * or out. Though that results in two flushes when:
+                 *
+                 * - the same user space task is scheduled out and later
+                 *   scheduled in again and only a kernel thread ran in
+                 *   between.
+                 *
+                 * - a user space task belonging to the same process is
+                 *   scheduled in after a kernel thread ran in between
+                 *
+                 * - a user space task belonging to the same process is
+                 *   scheduled in immediately.
+                 *
+                 * Optimize this with reasonably small overhead for the
+                 * above cases. Mangle the TIF_SPEC_IB bit into the mm
+                 * pointer of the incoming task which is stored in
+                 * cpu_tlbstate.last_user_mm_ibpb for comparison.
+                 */
+                next_mm = mm_mangle_tif_spec_ib(next);
+                prev_mm = this_cpu_read(cpu_tlbstate.last_user_mm_ibpb);
+
+                /*
+                 * Issue IBPB only if the mm's are different and one or
+                 * both have the IBPB bit set.
+                 */
+                if (next_mm != prev_mm &&
+                    (next_mm | prev_mm) & LAST_USER_MM_IBPB)
+                        indirect_branch_prediction_barrier();
+
+                this_cpu_write(cpu_tlbstate.last_user_mm_ibpb, next_mm);
+        }
+
+        if (static_branch_unlikely(&switch_mm_always_ibpb)) {
+                /*
+                 * Only flush when switching to a user space task with a
+                 * different context than the user space task which ran
+                 * last on this CPU.
+                 */
+                if (this_cpu_read(cpu_tlbstate.last_user_mm) != next->mm) {
+                        indirect_branch_prediction_barrier();
+                        this_cpu_write(cpu_tlbstate.last_user_mm, next->mm);
+                }
+        }
 }
 
 void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
@@ -121,22 +196,12 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
         unsigned cpu = smp_processor_id();
 
         if (likely(prev != next)) {
-                u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
-
                 /*
                  * Avoid user/user BTB poisoning by flushing the branch
                  * predictor when switching between processes. This stops
                  * one process from doing Spectre-v2 attacks on another.
-                 *
-                 * As an optimization, flush indirect branches only when
-                 * switching into a processes that can't be ptrace by the
-                 * current one (as in such case, attacker has much more
-                 * convenient way how to tamper with the next process than
-                 * branch buffer poisoning).
                  */
-                if (static_cpu_has(X86_FEATURE_USE_IBPB) &&
-                    ibpb_needed(tsk, last_ctx_id))
-                        indirect_branch_prediction_barrier();
+                cond_ibpb(tsk);
 
                 if (IS_ENABLED(CONFIG_VMAP_STACK)) {
                         /*
@@ -152,14 +217,6 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
                         set_pgd(pgd, init_mm.pgd[stack_pgd_index]);
                 }
 
-                /*
-                 * Record last user mm's context id, so we can avoid
-                 * flushing branch buffer with IBPB if we switch back
-                 * to the same user.
-                 */
-                if (next != &init_mm)
-                        this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
-
                 this_cpu_write(cpu_tlbstate.state, TLBSTATE_OK);
                 this_cpu_write(cpu_tlbstate.active_mm, next);
 
-- 
2.21.0
