// SPDX-License-Identifier: GPL-2.0
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/prctl.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/idle.h>
#include <linux/sched/debug.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/init.h>
#include <linux/export.h>
#include <linux/pm.h>
#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
#include <linux/utsname.h>
#include <linux/stackprotector.h>
#include <linux/cpuidle.h>
#include <linux/acpi.h>
#include <linux/elf-randomize.h>
#include <trace/events/power.h>
#include <linux/hw_breakpoint.h>
#include <asm/cpu.h>
#include <asm/apic.h>
#include <linux/uaccess.h>
#include <asm/mwait.h>
#include <asm/fpu/internal.h>
#include <asm/debugreg.h>
#include <asm/nmi.h>
#include <asm/tlbflush.h>
#include <asm/mce.h>
#include <asm/vm86.h>
#include <asm/switch_to.h>
#include <asm/desc.h>
#include <asm/prctl.h>
#include <asm/spec-ctrl.h>
#include <asm/io_bitmap.h>
#include <asm/proto.h>

#include "process.h"

/*
 * per-CPU TSS segments. Threads are completely 'soft' on Linux,
 * no more per-task TSS's. The TSS size is kept cacheline-aligned
 * so they are allowed to end up in the .data..cacheline_aligned
 * section. Since TSS's are completely CPU-local, we want them
 * on exact cacheline boundaries, to eliminate cacheline ping-pong.
 */
__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
	.x86_tss = {
		/*
		 * .sp0 is only used when entering ring 0 from a lower
		 * privilege level. Since the init task never runs anything
		 * but ring 0 code, there is no need for a valid value here.
		 * Poison it.
		 */
		.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,

		/*
		 * .sp1 is cpu_current_top_of_stack. The init task never
		 * runs user code, but cpu_current_top_of_stack should still
		 * be well defined before the first context switch.
		 */
		.sp1 = TOP_OF_INIT_STACK,

#ifdef CONFIG_X86_32
		.ss0 = __KERNEL_DS,
		.ss1 = __KERNEL_CS,
#endif
		.io_bitmap_base	= IO_BITMAP_OFFSET_INVALID,
	},
};
EXPORT_PER_CPU_SYMBOL(cpu_tss_rw);

DEFINE_PER_CPU(bool, __tss_limit_invalid);
EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid);

/*
 * this gets called so that we can store lazy state into memory and copy the
 * current task into the new thread.
 */
int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
{
	memcpy(dst, src, arch_task_struct_size);
#ifdef CONFIG_VM86
	dst->thread.vm86 = NULL;
#endif

	return fpu__copy(dst, src);
}

/*
 * Free thread data structures etc..
 */
void exit_thread(struct task_struct *tsk)
{
	struct thread_struct *t = &tsk->thread;
	struct fpu *fpu = &t->fpu;

	if (test_thread_flag(TIF_IO_BITMAP))
		io_bitmap_exit(tsk);

	free_vm86(t);

	fpu__drop(fpu);
}

static int set_new_tls(struct task_struct *p, unsigned long tls)
{
	struct user_desc __user *utls = (struct user_desc __user *)tls;

	if (in_ia32_syscall())
		return do_set_thread_area(p, -1, utls, 0);
	else
		return do_set_thread_area_64(p, ARCH_SET_FS, tls);
}

int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
		    unsigned long arg, struct task_struct *p, unsigned long tls)
{
	struct inactive_task_frame *frame;
	struct fork_frame *fork_frame;
	struct pt_regs *childregs;
	int ret = 0;

	childregs = task_pt_regs(p);
	fork_frame = container_of(childregs, struct fork_frame, regs);
	frame = &fork_frame->frame;

	frame->bp = 0;
	frame->ret_addr = (unsigned long) ret_from_fork;
	p->thread.sp = (unsigned long) fork_frame;
	p->thread.io_bitmap = NULL;
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

#ifdef CONFIG_X86_64
	savesegment(gs, p->thread.gsindex);
	p->thread.gsbase = p->thread.gsindex ? 0 : current->thread.gsbase;
	savesegment(fs, p->thread.fsindex);
	p->thread.fsbase = p->thread.fsindex ? 0 : current->thread.fsbase;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);
#else
	p->thread.sp0 = (unsigned long) (childregs + 1);
	/*
	 * Clear all status flags including IF and set the fixed bit. 64-bit
	 * does not have this initialization as the frame does not contain
	 * flags. There, the flags consistency (especially vs. AC) is
	 * ensured via objtool, which lacks 32-bit support.
	 */
	frame->flags = X86_EFLAGS_FIXED;
#endif

	/* Kernel thread? */
	if (unlikely(p->flags & PF_KTHREAD)) {
		memset(childregs, 0, sizeof(struct pt_regs));
		kthread_frame_init(frame, sp, arg);
		return 0;
	}

	frame->bx = 0;
	*childregs = *current_pt_regs();
	childregs->ax = 0;
	if (sp)
		childregs->sp = sp;

#ifdef CONFIG_X86_32
	task_user_gs(p) = get_user_gs(current_pt_regs());
#endif

	/* Set a new TLS for the child thread? */
	if (clone_flags & CLONE_SETTLS)
		ret = set_new_tls(p, tls);

	if (!ret && unlikely(test_tsk_thread_flag(current, TIF_IO_BITMAP)))
		io_bitmap_share(p);

	return ret;
}

void flush_thread(void)
{
	struct task_struct *tsk = current;

	flush_ptrace_hw_breakpoint(tsk);
	memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));

	fpu__clear_all(&tsk->thread.fpu);
}

void disable_TSC(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		cr4_set_bits(X86_CR4_TSD);
	preempt_enable();
}

static void enable_TSC(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOTSC))
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOTSC in the current running context.
		 */
		cr4_clear_bits(X86_CR4_TSD);
	preempt_enable();
}

int get_tsc_mode(unsigned long adr)
{
	unsigned int val;

	if (test_thread_flag(TIF_NOTSC))
		val = PR_TSC_SIGSEGV;
	else
		val = PR_TSC_ENABLE;

	return put_user(val, (unsigned int __user *)adr);
}

int set_tsc_mode(unsigned int val)
{
	if (val == PR_TSC_SIGSEGV)
		disable_TSC();
	else if (val == PR_TSC_ENABLE)
		enable_TSC();
	else
		return -EINVAL;

	return 0;
}
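
/*
 * Illustrative sketch (not part of the original file): get_tsc_mode() and
 * set_tsc_mode() back the generic prctl(PR_GET_TSC)/prctl(PR_SET_TSC)
 * interface, so a task could toggle its own RDTSC access roughly like:
 *
 *	#include <sys/prctl.h>
 *
 *	prctl(PR_SET_TSC, PR_TSC_SIGSEGV, 0, 0, 0);	// RDTSC now raises SIGSEGV
 *	prctl(PR_SET_TSC, PR_TSC_ENABLE, 0, 0, 0);	// RDTSC allowed again
 *
 * The call chain from the prctl() syscall into these helpers lives in the
 * generic prctl code and is only assumed here, not shown in this file.
 */
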
DEFINE_PER_CPU(u64, msr_misc_features_shadow);

static void set_cpuid_faulting(bool on)
{
	u64 msrval;

	msrval = this_cpu_read(msr_misc_features_shadow);
	msrval &= ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT;
	msrval |= (on << MSR_MISC_FEATURES_ENABLES_CPUID_FAULT_BIT);
	this_cpu_write(msr_misc_features_shadow, msrval);
	wrmsrl(MSR_MISC_FEATURES_ENABLES, msrval);
}

static void disable_cpuid(void)
{
	preempt_disable();
	if (!test_and_set_thread_flag(TIF_NOCPUID)) {
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOCPUID in the current running context.
		 */
		set_cpuid_faulting(true);
	}
	preempt_enable();
}

static void enable_cpuid(void)
{
	preempt_disable();
	if (test_and_clear_thread_flag(TIF_NOCPUID)) {
		/*
		 * Must flip the CPU state synchronously with
		 * TIF_NOCPUID in the current running context.
		 */
		set_cpuid_faulting(false);
	}
	preempt_enable();
}

static int get_cpuid_mode(void)
{
	return !test_thread_flag(TIF_NOCPUID);
}

static int set_cpuid_mode(struct task_struct *task, unsigned long cpuid_enabled)
{
	if (!boot_cpu_has(X86_FEATURE_CPUID_FAULT))
		return -ENODEV;

	if (cpuid_enabled)
		enable_cpuid();
	else
		disable_cpuid();

	return 0;
}

/*
 * Called immediately after a successful exec.
 */
void arch_setup_new_exec(void)
{
	/* If cpuid was previously disabled for this task, re-enable it. */
	if (test_thread_flag(TIF_NOCPUID))
		enable_cpuid();

	/*
	 * Don't inherit TIF_SSBD across exec boundary when
	 * PR_SPEC_DISABLE_NOEXEC is used.
	 */
	if (test_thread_flag(TIF_SSBD) &&
	    task_spec_ssb_noexec(current)) {
		clear_thread_flag(TIF_SSBD);
		task_clear_spec_ssb_disable(current);
		task_clear_spec_ssb_noexec(current);
		speculation_ctrl_update(task_thread_info(current)->flags);
	}
}

#ifdef CONFIG_X86_IOPL_IOPERM
static inline void tss_invalidate_io_bitmap(struct tss_struct *tss)
{
	/*
	 * Invalidate the I/O bitmap by moving io_bitmap_base outside the
	 * TSS limit so any subsequent I/O access from user space will
	 * trigger a #GP.
	 *
	 * This is correct even when VMEXIT rewrites the TSS limit
	 * to 0x67 as the only requirement is that the base points
	 * outside the limit.
	 */
	tss->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET_INVALID;
}

static inline void switch_to_bitmap(unsigned long tifp)
{
	/*
	 * Invalidate I/O bitmap if the previous task used it. This prevents
	 * any possible leakage of an active I/O bitmap.
	 *
	 * If the next task has an I/O bitmap it will handle it on exit to
	 * user mode.
	 */
	if (tifp & _TIF_IO_BITMAP)
		tss_invalidate_io_bitmap(this_cpu_ptr(&cpu_tss_rw));
}

static void tss_copy_io_bitmap(struct tss_struct *tss, struct io_bitmap *iobm)
{
	/*
	 * Copy at least the byte range of the incoming task's bitmap which
	 * covers the permitted I/O ports.
	 *
	 * If the previous task which used an I/O bitmap had more bits
	 * permitted, then the copy needs to cover those as well so they
	 * get turned off.
	 */
	memcpy(tss->io_bitmap.bitmap, iobm->bitmap,
	       max(tss->io_bitmap.prev_max, iobm->max));

	/*
	 * Store the new max and the sequence number of this bitmap
	 * and a pointer to the bitmap itself.
	 */
	tss->io_bitmap.prev_max = iobm->max;
	tss->io_bitmap.prev_sequence = iobm->sequence;
}

/**
 * tss_update_io_bitmap - Update I/O bitmap before exiting to usermode
 */
void native_tss_update_io_bitmap(void)
{
	struct tss_struct *tss = this_cpu_ptr(&cpu_tss_rw);
	struct thread_struct *t = &current->thread;
	u16 *base = &tss->x86_tss.io_bitmap_base;

	if (!test_thread_flag(TIF_IO_BITMAP)) {
		tss_invalidate_io_bitmap(tss);
		return;
	}

	if (IS_ENABLED(CONFIG_X86_IOPL_IOPERM) && t->iopl_emul == 3) {
		*base = IO_BITMAP_OFFSET_VALID_ALL;
	} else {
		struct io_bitmap *iobm = t->io_bitmap;

		/*
		 * Only copy bitmap data when the sequence number differs. The
		 * update time is accounted to the incoming task.
		 */
		if (tss->io_bitmap.prev_sequence != iobm->sequence)
			tss_copy_io_bitmap(tss, iobm);

		/* Enable the bitmap */
		*base = IO_BITMAP_OFFSET_VALID_MAP;
	}

	/*
	 * Make sure that the TSS limit is covering the IO bitmap. It might have
	 * been cut down by a VMEXIT to 0x67 which would cause a subsequent I/O
	 * access from user space to trigger a #GP because the bitmap is outside
	 * the TSS limit.
	 */
	refresh_tss_limit();
}
#else /* CONFIG_X86_IOPL_IOPERM */
static inline void switch_to_bitmap(unsigned long tifp) { }
#endif
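
/*
 * Illustrative sketch (assumption, not part of the original file): the
 * per-task io_bitmap consumed above is normally set up through the ioperm()
 * or iopl() syscalls, e.g.
 *
 *	#include <sys/io.h>
 *
 *	if (ioperm(0x70, 2, 1) == 0)		// allow access to ports 0x70-0x71
 *		outb(0x0a, 0x70);
 *
 * ioperm() sets TIF_IO_BITMAP on the task, and the bitmap is then copied
 * into this CPU's TSS by tss_update_io_bitmap() on the exit-to-user path.
 */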

#ifdef CONFIG_SMP

struct ssb_state {
	struct ssb_state	*shared_state;
	raw_spinlock_t		lock;
	unsigned int		disable_state;
	unsigned long		local_state;
};

#define LSTATE_SSB	0

static DEFINE_PER_CPU(struct ssb_state, ssb_state);

void speculative_store_bypass_ht_init(void)
{
	struct ssb_state *st = this_cpu_ptr(&ssb_state);
	unsigned int this_cpu = smp_processor_id();
	unsigned int cpu;

	st->local_state = 0;

	/*
	 * Shared state setup happens once on the first bringup
	 * of the CPU. It's not destroyed on CPU hotunplug.
	 */
	if (st->shared_state)
		return;

	raw_spin_lock_init(&st->lock);

	/*
	 * Go over HT siblings and check whether one of them has set up the
	 * shared state pointer already.
	 */
	for_each_cpu(cpu, topology_sibling_cpumask(this_cpu)) {
		if (cpu == this_cpu)
			continue;

		if (!per_cpu(ssb_state, cpu).shared_state)
			continue;

		/* Link it to the state of the sibling: */
		st->shared_state = per_cpu(ssb_state, cpu).shared_state;
		return;
	}

	/*
	 * First HT sibling to come up on the core. Link shared state of
	 * the first HT sibling to itself. The siblings on the same core
	 * which come up later will see the shared state pointer and link
	 * themselves to the state of this CPU.
	 */
	st->shared_state = st;
}

/*
 * Logic is: the first HT sibling to enable SSBD enables it for both
 * siblings in the core, and the last sibling to disable it disables it
 * for the whole core. This is how MSR_SPEC_CTRL works in "hardware":
 *
 *  CORE_SPEC_CTRL = THREAD0_SPEC_CTRL | THREAD1_SPEC_CTRL
 */
static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
{
	struct ssb_state *st = this_cpu_ptr(&ssb_state);
	u64 msr = x86_amd_ls_cfg_base;

	if (!static_cpu_has(X86_FEATURE_ZEN)) {
		msr |= ssbd_tif_to_amd_ls_cfg(tifn);
		wrmsrl(MSR_AMD64_LS_CFG, msr);
		return;
	}

	if (tifn & _TIF_SSBD) {
		/*
		 * Since this can race with prctl(), block reentry on the
		 * same CPU.
		 */
		if (__test_and_set_bit(LSTATE_SSB, &st->local_state))
			return;

		msr |= x86_amd_ls_cfg_ssbd_mask;

		raw_spin_lock(&st->shared_state->lock);
		/* First sibling enables SSBD: */
		if (!st->shared_state->disable_state)
			wrmsrl(MSR_AMD64_LS_CFG, msr);
		st->shared_state->disable_state++;
		raw_spin_unlock(&st->shared_state->lock);
	} else {
		if (!__test_and_clear_bit(LSTATE_SSB, &st->local_state))
			return;

		raw_spin_lock(&st->shared_state->lock);
		st->shared_state->disable_state--;
		if (!st->shared_state->disable_state)
			wrmsrl(MSR_AMD64_LS_CFG, msr);
		raw_spin_unlock(&st->shared_state->lock);
	}
}
#else
static __always_inline void amd_set_core_ssb_state(unsigned long tifn)
{
	u64 msr = x86_amd_ls_cfg_base | ssbd_tif_to_amd_ls_cfg(tifn);

	wrmsrl(MSR_AMD64_LS_CFG, msr);
}
#endif

static __always_inline void amd_set_ssb_virt_state(unsigned long tifn)
{
	/*
	 * SSBD has the same definition in SPEC_CTRL and VIRT_SPEC_CTRL,
	 * so ssbd_tif_to_spec_ctrl() just works.
	 */
	wrmsrl(MSR_AMD64_VIRT_SPEC_CTRL, ssbd_tif_to_spec_ctrl(tifn));
}

/*
 * Update the MSRs managing speculation control, during context switch.
 *
 * tifp: Previous task's thread flags
 * tifn: Next task's thread flags
 */
static __always_inline void __speculation_ctrl_update(unsigned long tifp,
						      unsigned long tifn)
{
	unsigned long tif_diff = tifp ^ tifn;
	u64 msr = x86_spec_ctrl_base;
	bool updmsr = false;

	lockdep_assert_irqs_disabled();

	/*
	 * If TIF_SSBD is different, select the proper mitigation
	 * method. Note that if SSBD mitigation is disabled or permanently
	 * enabled this branch can't be taken because nothing can set
	 * TIF_SSBD.
	 */
	if (tif_diff & _TIF_SSBD) {
		if (static_cpu_has(X86_FEATURE_VIRT_SSBD)) {
			amd_set_ssb_virt_state(tifn);
		} else if (static_cpu_has(X86_FEATURE_LS_CFG_SSBD)) {
			amd_set_core_ssb_state(tifn);
		} else if (static_cpu_has(X86_FEATURE_SPEC_CTRL_SSBD) ||
			   static_cpu_has(X86_FEATURE_AMD_SSBD)) {
			msr |= ssbd_tif_to_spec_ctrl(tifn);
			updmsr = true;
		}
	}

	/*
	 * Only evaluate TIF_SPEC_IB if conditional STIBP is enabled,
	 * otherwise avoid the MSR write.
	 */
	if (IS_ENABLED(CONFIG_SMP) &&
	    static_branch_unlikely(&switch_to_cond_stibp)) {
		updmsr |= !!(tif_diff & _TIF_SPEC_IB);
		msr |= stibp_tif_to_spec_ctrl(tifn);
	}

	if (updmsr)
		wrmsrl(MSR_IA32_SPEC_CTRL, msr);
}
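
/*
 * Illustrative sketch (assumption, not shown in this file): the TIF_SSBD and
 * TIF_SPEC_IB flags evaluated above are normally set through the speculation
 * control prctl interface, e.g.
 *
 *	#include <sys/prctl.h>
 *
 *	prctl(PR_SET_SPECULATION_CTRL, PR_SPEC_STORE_BYPASS,
 *	      PR_SPEC_DISABLE, 0, 0);
 *
 * The arch speculation code marks the task accordingly and then calls
 * speculation_ctrl_update_current()/speculation_ctrl_update() below so the
 * MSRs are brought in line with the new thread flags.
 */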

static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk)
{
	if (test_and_clear_tsk_thread_flag(tsk, TIF_SPEC_FORCE_UPDATE)) {
		if (task_spec_ssb_disable(tsk))
			set_tsk_thread_flag(tsk, TIF_SSBD);
		else
			clear_tsk_thread_flag(tsk, TIF_SSBD);

		if (task_spec_ib_disable(tsk))
			set_tsk_thread_flag(tsk, TIF_SPEC_IB);
		else
			clear_tsk_thread_flag(tsk, TIF_SPEC_IB);
	}
	/* Return the updated threadinfo flags */
	return task_thread_info(tsk)->flags;
}

void speculation_ctrl_update(unsigned long tif)
{
	unsigned long flags;

	/* Forced update. Make sure all relevant TIF flags are different */
	local_irq_save(flags);
	__speculation_ctrl_update(~tif, tif);
	local_irq_restore(flags);
}

/* Called from seccomp/prctl update */
void speculation_ctrl_update_current(void)
{
	preempt_disable();
	speculation_ctrl_update(speculation_ctrl_update_tif(current));
	preempt_enable();
}

void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p)
{
	unsigned long tifp, tifn;

	tifn = READ_ONCE(task_thread_info(next_p)->flags);
	tifp = READ_ONCE(task_thread_info(prev_p)->flags);

	switch_to_bitmap(tifp);

	propagate_user_return_notify(prev_p, next_p);

	if ((tifp & _TIF_BLOCKSTEP || tifn & _TIF_BLOCKSTEP) &&
	    arch_has_block_step()) {
		unsigned long debugctl, msk;

		rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
		debugctl &= ~DEBUGCTLMSR_BTF;
		msk = tifn & _TIF_BLOCKSTEP;
		debugctl |= (msk >> TIF_BLOCKSTEP) << DEBUGCTLMSR_BTF_SHIFT;
		wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
	}

	if ((tifp ^ tifn) & _TIF_NOTSC)
		cr4_toggle_bits_irqsoff(X86_CR4_TSD);

	if ((tifp ^ tifn) & _TIF_NOCPUID)
		set_cpuid_faulting(!!(tifn & _TIF_NOCPUID));

	if (likely(!((tifp | tifn) & _TIF_SPEC_FORCE_UPDATE))) {
		__speculation_ctrl_update(tifp, tifn);
	} else {
		speculation_ctrl_update_tif(prev_p);
		tifn = speculation_ctrl_update_tif(next_p);

		/* Enforce MSR update to ensure consistent state */
		__speculation_ctrl_update(~tifn, tifn);
	}

	if ((tifp ^ tifn) & _TIF_SLD)
		switch_to_sld(tifn);
}

/*
 * Idle related variables and functions
 */
unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
EXPORT_SYMBOL(boot_option_idle_override);

static void (*x86_idle)(void);

#ifndef CONFIG_SMP
static inline void play_dead(void)
{
	BUG();
}
#endif

void arch_cpu_idle_enter(void)
{
	tsc_verify_tsc_adjust(false);
	local_touch_nmi();
}

void arch_cpu_idle_dead(void)
{
	play_dead();
}

/*
 * Called from the generic idle code.
 */
void arch_cpu_idle(void)
{
	x86_idle();
}

/*
 * We use this if we don't have any better idle routine..
 */
void __cpuidle default_idle(void)
{
	trace_cpu_idle_rcuidle(1, smp_processor_id());
	safe_halt();
	trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
}
#if defined(CONFIG_APM_MODULE) || defined(CONFIG_HALTPOLL_CPUIDLE_MODULE)
EXPORT_SYMBOL(default_idle);
#endif

#ifdef CONFIG_XEN
bool xen_set_default_idle(void)
{
	bool ret = !!x86_idle;

	x86_idle = default_idle;

	return ret;
}
#endif

void stop_this_cpu(void *dummy)
{
	local_irq_disable();
	/*
	 * Remove this CPU:
	 */
	set_cpu_online(smp_processor_id(), false);
	disable_local_APIC();
	mcheck_cpu_clear(this_cpu_ptr(&cpu_info));

	/*
	 * Use wbinvd on processors that support SME. This provides support
	 * for performing a successful kexec when going from SME inactive
	 * to SME active (or vice-versa). The cache must be cleared so that
	 * if there are entries with the same physical address, both with and
	 * without the encryption bit, they don't race each other when flushed
	 * and potentially end up with the wrong entry being committed to
	 * memory.
	 */
	if (boot_cpu_has(X86_FEATURE_SME))
		native_wbinvd();
	for (;;) {
		/*
		 * Use native_halt() so that memory contents don't change
		 * (stack usage and variables) after possibly issuing the
		 * native_wbinvd() above.
		 */
		native_halt();
	}
}

/*
 * AMD Erratum 400 aware idle routine. We handle it the same way as C3 power
 * states (local apic timer and TSC stop).
 */
static void amd_e400_idle(void)
{
	/*
	 * We cannot use static_cpu_has_bug() here because X86_BUG_AMD_APIC_C1E
	 * gets set after static_cpu_has() places have been converted via
	 * alternatives.
	 */
	if (!boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
		default_idle();
		return;
	}

	tick_broadcast_enter();

	default_idle();

	/*
	 * The switch back from broadcast mode needs to be called with
	 * interrupts disabled.
	 */
	local_irq_disable();
	tick_broadcast_exit();
	local_irq_enable();
}

/*
 * Intel Core2 and older machines prefer MWAIT over HALT for C1.
 * We can't rely on cpuidle installing MWAIT, because it will not load
 * on systems that support only C1 -- so the boot default must be MWAIT.
 *
 * Some AMD machines are the opposite, they depend on using HALT.
 *
 * So for default C1, which is used during boot until cpuidle loads,
 * use MWAIT-C1 on Intel HW that has it, else use HALT.
 */
static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
{
	if (c->x86_vendor != X86_VENDOR_INTEL)
		return 0;

	if (!cpu_has(c, X86_FEATURE_MWAIT) || boot_cpu_has_bug(X86_BUG_MONITOR))
		return 0;

	return 1;
}

/*
 * MONITOR/MWAIT with no hints, used for default C1 state. This invokes MWAIT
 * with interrupts enabled and no flags, which is backwards compatible with the
 * original MWAIT implementation.
 */
static __cpuidle void mwait_idle(void)
{
	if (!current_set_polling_and_test()) {
		trace_cpu_idle_rcuidle(1, smp_processor_id());
		if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
			mb(); /* quirk */
			clflush((void *)&current_thread_info()->flags);
			mb(); /* quirk */
		}

		__monitor((void *)&current_thread_info()->flags, 0, 0);
		if (!need_resched())
			__sti_mwait(0, 0);
		else
			local_irq_enable();
		trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id());
	} else {
		local_irq_enable();
	}
	__current_clr_polling();
}

void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
	if (boot_option_idle_override == IDLE_POLL && smp_num_siblings > 1)
		pr_warn_once("WARNING: polling idle and HT enabled, performance may degrade\n");
#endif
	if (x86_idle || boot_option_idle_override == IDLE_POLL)
		return;

	if (boot_cpu_has_bug(X86_BUG_AMD_E400)) {
		pr_info("using AMD E400 aware idle routine\n");
		x86_idle = amd_e400_idle;
	} else if (prefer_mwait_c1_over_halt(c)) {
		pr_info("using mwait in idle threads\n");
		x86_idle = mwait_idle;
	} else
		x86_idle = default_idle;
}
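
/*
 * Illustrative note (assumption, not part of the original file): the wakeup
 * model behind mwait_idle() above is that the idle CPU arms MONITOR on its
 * own thread flags word, so a remote CPU can wake it by setting
 * TIF_NEED_RESCHED there instead of sending a reschedule IPI; the write to
 * the monitored cacheline terminates MWAIT. Roughly:
 *
 *	remote CPU				idle CPU
 *	----------				--------
 *						__monitor(&ti->flags, 0, 0);
 *	set_tsk_need_resched(idle_task);	__sti_mwait(0, 0);  // wakes up
 *
 * The remote side lives in the generic scheduler's polling-idle support and
 * is only sketched here.
 */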

void amd_e400_c1e_apic_setup(void)
{
	if (boot_cpu_has_bug(X86_BUG_AMD_APIC_C1E)) {
		pr_info("Switch to broadcast mode on CPU%d\n", smp_processor_id());
		local_irq_disable();
		tick_broadcast_force();
		local_irq_enable();
	}
}

void __init arch_post_acpi_subsys_init(void)
{
	u32 lo, hi;

	if (!boot_cpu_has_bug(X86_BUG_AMD_E400))
		return;

	/*
	 * AMD E400 detection needs to happen after ACPI has been enabled. If
	 * the machine is affected, K8_INTP_C1E_ACTIVE_MASK bits are set in
	 * MSR_K8_INT_PENDING_MSG.
	 */
	rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
	if (!(lo & K8_INTP_C1E_ACTIVE_MASK))
		return;

	boot_cpu_set_bug(X86_BUG_AMD_APIC_C1E);

	if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
		mark_tsc_unstable("TSC halt in AMD C1E");
	pr_info("System has AMD C1E enabled\n");
}

static int __init idle_setup(char *str)
{
	if (!str)
		return -EINVAL;

	if (!strcmp(str, "poll")) {
		pr_info("using polling idle threads\n");
		boot_option_idle_override = IDLE_POLL;
		cpu_idle_poll_ctrl(true);
	} else if (!strcmp(str, "halt")) {
		/*
		 * When the boot option idle=halt is added, halt is forced
		 * to be used for CPU idle. In such a case CPU C2/C3 won't
		 * be used again.
		 * To continue to load the CPU idle driver, don't touch
		 * the boot_option_idle_override.
		 */
		x86_idle = default_idle;
		boot_option_idle_override = IDLE_HALT;
	} else if (!strcmp(str, "nomwait")) {
		/*
		 * If the boot option "idle=nomwait" is added, it means
		 * that mwait will be disabled for the CPU C2/C3 states.
		 * In such a case it won't touch the variable of
		 * boot_option_idle_override.
		 */
		boot_option_idle_override = IDLE_NOMWAIT;
	} else
		return -1;

	return 0;
}
early_param("idle", idle_setup);
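
/*
 * Usage note (derived from idle_setup() above): the "idle=" kernel command
 * line parameter accepts "poll", "halt" or "nomwait"; for example, booting
 * with
 *
 *	idle=halt
 *
 * forces HALT as the default idle routine, as handled above.
 */
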
unsigned long arch_align_stack(unsigned long sp)
{
	if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
		sp -= get_random_int() % 8192;
	return sp & ~0xf;
}

unsigned long arch_randomize_brk(struct mm_struct *mm)
{
	return randomize_page(mm->brk, 0x02000000);
}

/*
 * Called from fs/proc with a reference on @p to find the function
 * which called into schedule(). This needs to be done carefully
 * because the task might wake up and we might look at a stack
 * changing under us.
 */
unsigned long get_wchan(struct task_struct *p)
{
	unsigned long start, bottom, top, sp, fp, ip, ret = 0;
	int count = 0;

	if (p == current || p->state == TASK_RUNNING)
		return 0;

	if (!try_get_task_stack(p))
		return 0;

	start = (unsigned long)task_stack_page(p);
	if (!start)
		goto out;

	/*
	 * Layout of the stack page:
	 *
	 * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
	 * PADDING
	 * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
	 * stack
	 * ----------- bottom = start
	 *
	 * The task's stack pointer points at the location where the
	 * framepointer is stored. The data on the stack is:
	 * ... IP FP ... IP FP
	 *
	 * We need to read FP and IP, so we need to adjust the upper
	 * bound by another unsigned long.
	 */
	top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
	top -= 2 * sizeof(unsigned long);
	bottom = start;

	sp = READ_ONCE(p->thread.sp);
	if (sp < bottom || sp > top)
		goto out;

	fp = READ_ONCE_NOCHECK(((struct inactive_task_frame *)sp)->bp);
	do {
		if (fp < bottom || fp > top)
			goto out;
		ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
		if (!in_sched_functions(ip)) {
			ret = ip;
			goto out;
		}
		fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
	} while (count++ < 16 && p->state != TASK_RUNNING);

out:
	put_task_stack(p);
	return ret;
}

long do_arch_prctl_common(struct task_struct *task, int option,
			  unsigned long cpuid_enabled)
{
	switch (option) {
	case ARCH_GET_CPUID:
		return get_cpuid_mode();
	case ARCH_SET_CPUID:
		return set_cpuid_mode(task, cpuid_enabled);
	}

	return -EINVAL;
}
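
/*
 * Illustrative sketch (assumption, not part of the original file): on x86-64
 * the ARCH_GET_CPUID/ARCH_SET_CPUID options handled above are reached through
 * the arch_prctl() syscall, so a task could opt into CPUID faulting roughly
 * like:
 *
 *	#include <asm/prctl.h>
 *	#include <sys/syscall.h>
 *	#include <unistd.h>
 *
 *	syscall(SYS_arch_prctl, ARCH_SET_CPUID, 0);	// CPUID now faults (SIGSEGV)
 *	syscall(SYS_arch_prctl, ARCH_GET_CPUID);	// returns 0 while disabled
 *
 * do_arch_prctl_common() itself is invoked from the arch_prctl() entry points
 * in the 64-bit and compat process code.
 */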