// SPDX-License-Identifier: GPL-2.0-only
/*
 *  Copyright (C) 1995  Linus Torvalds
 *
 *  Pentium III FXSR, SSE support
 *	Gareth Hughes <gareth@valinux.com>, May 2000
 *
 *  X86-64 port
 *	Andi Kleen.
 *
 *	CPU hotplug support - ashok.raj@intel.com
 */

/*
 * This file handles the architecture-dependent parts of process handling.
 */

#include <linux/cpu.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/sched/task.h>
#include <linux/sched/task_stack.h>
#include <linux/fs.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/elfcore.h>
#include <linux/smp.h>
#include <linux/slab.h>
#include <linux/user.h>
#include <linux/interrupt.h>
#include <linux/delay.h>
#include <linux/export.h>
#include <linux/ptrace.h>
#include <linux/notifier.h>
#include <linux/kprobes.h>
#include <linux/kdebug.h>
#include <linux/prctl.h>
#include <linux/uaccess.h>
#include <linux/io.h>
#include <linux/ftrace.h>
#include <linux/syscalls.h>

#include <asm/pgtable.h>
#include <asm/processor.h>
#include <asm/fpu/internal.h>
#include <asm/mmu_context.h>
#include <asm/prctl.h>
#include <asm/desc.h>
#include <asm/proto.h>
#include <asm/ia32.h>
#include <asm/syscalls.h>
#include <asm/debugreg.h>
#include <asm/switch_to.h>
#include <asm/xen/hypervisor.h>
#include <asm/vdso.h>
#include <asm/resctrl_sched.h>
#include <asm/unistd.h>
#include <asm/fsgsbase.h>
#ifdef CONFIG_IA32_EMULATION
/* Not included via unistd.h */
#include <asm/unistd_32_ia32.h>
#endif

#include "process.h"

/* Also prints some state that isn't saved in pt_regs. */
void __show_regs(struct pt_regs *regs, enum show_regs_mode mode)
{
	unsigned long cr0 = 0L, cr2 = 0L, cr3 = 0L, cr4 = 0L, fs, gs, shadowgs;
	unsigned long d0, d1, d2, d3, d6, d7;
	unsigned int fsindex, gsindex;
	unsigned int ds, es;

	show_iret_regs(regs);

	if (regs->orig_ax != -1)
		pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax);
	else
		pr_cont("\n");

	printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
	       regs->ax, regs->bx, regs->cx);
	printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
	       regs->dx, regs->si, regs->di);
	printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
	       regs->bp, regs->r8, regs->r9);
	printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
	       regs->r10, regs->r11, regs->r12);
	printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
	       regs->r13, regs->r14, regs->r15);

	if (mode == SHOW_REGS_SHORT)
		return;

	if (mode == SHOW_REGS_USER) {
		rdmsrl(MSR_FS_BASE, fs);
		rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);
		printk(KERN_DEFAULT "FS: %016lx GS: %016lx\n",
		       fs, shadowgs);
		return;
	}

	asm("movl %%ds,%0" : "=r" (ds));
	asm("movl %%es,%0" : "=r" (es));
	asm("movl %%fs,%0" : "=r" (fsindex));
	asm("movl %%gs,%0" : "=r" (gsindex));

	rdmsrl(MSR_FS_BASE, fs);
	rdmsrl(MSR_GS_BASE, gs);
	rdmsrl(MSR_KERNEL_GS_BASE, shadowgs);

	cr0 = read_cr0();
	cr2 = read_cr2();
	cr3 = __read_cr3();
	cr4 = __read_cr4();

	printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
	       fs, fsindex, gs, gsindex, shadowgs);
	printk(KERN_DEFAULT "CS: %04lx DS: %04x ES: %04x CR0: %016lx\n",
	       regs->cs, ds, es, cr0);
	printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n",
	       cr2, cr3, cr4);

	get_debugreg(d0, 0);
	get_debugreg(d1, 1);
	get_debugreg(d2, 2);
	get_debugreg(d3, 3);
	get_debugreg(d6, 6);
	get_debugreg(d7, 7);

	/* Only print out debug registers if they are in their non-default state. */
	if (!((d0 == 0) && (d1 == 0) && (d2 == 0) && (d3 == 0) &&
	      (d6 == DR6_RESERVED) && (d7 == 0x400))) {
		printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n",
		       d0, d1, d2);
		printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n",
		       d3, d6, d7);
	}

	if (boot_cpu_has(X86_FEATURE_OSPKE))
		printk(KERN_DEFAULT "PKRU: %08x\n", read_pkru());
}

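/*
 * Sanity check on thread teardown: a dying process must not still own a
 * modify_ldt()-created LDT; if it does, something leaked and we BUG().
 */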
void release_thread(struct task_struct *dead_task)
{
	if (dead_task->mm) {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
		if (dead_task->mm->context.ldt) {
			pr_warn("WARNING: dead process %s still has LDT? <%p/%d>\n",
				dead_task->comm,
				dead_task->mm->context.ldt->entries,
				dead_task->mm->context.ldt->nr_entries);
			BUG();
		}
#endif
	}
}

enum which_selector {
	FS,
	GS
};

/*
 * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are
 * not available. The goal is to be reasonably fast on non-FSGSBASE systems.
 * It's forcibly inlined because it'll generate better code and this function
 * is hot.
 */
static __always_inline void save_base_legacy(struct task_struct *prev_p,
					     unsigned short selector,
					     enum which_selector which)
{
	if (likely(selector == 0)) {
		/*
		 * On Intel (without X86_BUG_NULL_SEG), the segment base could
		 * be the pre-existing saved base or it could be zero. On AMD
		 * (with X86_BUG_NULL_SEG), the segment base could be almost
		 * anything.
		 *
		 * This branch is very hot (it's hit twice on almost every
		 * context switch between 64-bit programs), and avoiding
		 * the RDMSR helps a lot, so we just assume that whatever
		 * value is already saved is correct. This matches historical
		 * Linux behavior, so it won't break existing applications.
		 *
		 * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we
		 * report that the base is zero, it needs to actually be zero:
		 * see the corresponding logic in load_seg_legacy.
		 */
	} else {
		/*
		 * If the selector is 1, 2, or 3, then the base is zero on
		 * !X86_BUG_NULL_SEG CPUs and could be anything on
		 * X86_BUG_NULL_SEG CPUs. In the latter case, Linux
		 * has never attempted to preserve the base across context
		 * switches.
		 *
		 * If selector > 3, then it refers to a real segment, and
		 * saving the base isn't necessary.
		 */
		if (which == FS)
			prev_p->thread.fsbase = 0;
		else
			prev_p->thread.gsbase = 0;
	}
}

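/*
 * Snapshot the outgoing task's FS/GS selectors and decide (via
 * save_base_legacy()) whether the cached fsbase/gsbase can be trusted.
 */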
static __always_inline void save_fsgs(struct task_struct *task)
{
	savesegment(fs, task->thread.fsindex);
	savesegment(gs, task->thread.gsindex);
	save_base_legacy(task, task->thread.fsindex, FS);
	save_base_legacy(task, task->thread.gsindex, GS);
}

#if IS_ENABLED(CONFIG_KVM)
/*
 * While a process is running, current->thread.fsbase and
 * current->thread.gsbase may not match the corresponding CPU registers (see
 * save_base_legacy()). KVM wants an efficient way to save and restore FSBASE
 * and GSBASE. When FSGSBASE extensions are enabled, this will have to use
 * RD{FS,GS}BASE.
 */
void save_fsgs_for_kvm(void)
{
	save_fsgs(current);
}
EXPORT_SYMBOL_GPL(save_fsgs_for_kvm);
#endif

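/*
 * Load a selector into %fs or %gs.  %gs has to go through load_gs_index()
 * so that the kernel's GS base (used via SWAPGS) is preserved.
 */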
static __always_inline void loadseg(enum which_selector which,
				    unsigned short sel)
{
	if (which == FS)
		loadsegment(fs, sel);
	else
		load_gs_index(sel);
}

static __always_inline void load_seg_legacy(unsigned short prev_index,
					    unsigned long prev_base,
					    unsigned short next_index,
					    unsigned long next_base,
					    enum which_selector which)
{
	if (likely(next_index <= 3)) {
		/*
		 * The next task is using 64-bit TLS, is not using this
		 * segment at all, or is having fun with arcane CPU features.
		 */
		if (next_base == 0) {
			/*
			 * Nasty case: on AMD CPUs, we need to forcibly zero
			 * the base.
			 */
			if (static_cpu_has_bug(X86_BUG_NULL_SEG)) {
				loadseg(which, __USER_DS);
				loadseg(which, next_index);
			} else {
				/*
				 * We could try to exhaustively detect cases
				 * under which we can skip the segment load,
				 * but there's really only one case that matters
				 * for performance: if both the previous and
				 * next states are fully zeroed, we can skip
				 * the load.
				 *
				 * (This assumes that prev_base == 0 has no
				 * false positives. This is the case on
				 * Intel-style CPUs.)
				 */
				if (likely(prev_index | next_index | prev_base))
					loadseg(which, next_index);
			}
		} else {
			if (prev_index != next_index)
				loadseg(which, next_index);
			wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE,
			       next_base);
		}
	} else {
		/*
		 * The next task is using a real segment.  Loading the selector
		 * is sufficient.
		 */
		loadseg(which, next_index);
	}
}

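/*
 * Restore the incoming task's FS/GS selectors and bases from its saved
 * thread state, using the legacy (non-FSGSBASE) sequence for both.
 */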
static __always_inline void x86_fsgsbase_load(struct thread_struct *prev,
					      struct thread_struct *next)
{
	load_seg_legacy(prev->fsindex, prev->fsbase,
			next->fsindex, next->fsbase, FS);
	load_seg_legacy(prev->gsindex, prev->gsbase,
			next->gsindex, next->gsbase, GS);
}

static unsigned long x86_fsgsbase_read_task(struct task_struct *task,
					    unsigned short selector)
{
	unsigned short idx = selector >> 3;
	unsigned long base;

	if (likely((selector & SEGMENT_TI_MASK) == 0)) {
		if (unlikely(idx >= GDT_ENTRIES))
			return 0;

		/*
		 * There are no user segments in the GDT with nonzero bases
		 * other than the TLS segments.
		 */
		if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX)
			return 0;

		idx -= GDT_ENTRY_TLS_MIN;
		base = get_desc_base(&task->thread.tls_array[idx]);
	} else {
#ifdef CONFIG_MODIFY_LDT_SYSCALL
		struct ldt_struct *ldt;

		/*
		 * If performance here mattered, we could protect the LDT
		 * with RCU.  This is a slow path, though, so we can just
		 * take the mutex.
		 */
		mutex_lock(&task->mm->context.lock);
		ldt = task->mm->context.ldt;
		/* The task may have no LDT at all; treat that as base 0. */
		if (unlikely(!ldt || idx >= ldt->nr_entries))
			base = 0;
		else
			base = get_desc_base(ldt->entries + idx);
		mutex_unlock(&task->mm->context.lock);
#else
		base = 0;
#endif
	}

	return base;
}

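/*
 * Read a task's FS base: straight from the MSR if the task is current,
 * from the cached thread.fsbase if the selector is zero, or by decoding
 * the referenced GDT/LDT descriptor otherwise.  The GS variant below is
 * symmetric, except that current's user GSBASE lives in the inactive
 * (swapped-out) GS base MSR.
 */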
unsigned long x86_fsbase_read_task(struct task_struct *task)
{
	unsigned long fsbase;

	if (task == current)
		fsbase = x86_fsbase_read_cpu();
	else if (task->thread.fsindex == 0)
		fsbase = task->thread.fsbase;
	else
		fsbase = x86_fsgsbase_read_task(task, task->thread.fsindex);

	return fsbase;
}

unsigned long x86_gsbase_read_task(struct task_struct *task)
{
	unsigned long gsbase;

	if (task == current)
		gsbase = x86_gsbase_read_cpu_inactive();
	else if (task->thread.gsindex == 0)
		gsbase = task->thread.gsbase;
	else
		gsbase = x86_fsgsbase_read_task(task, task->thread.gsindex);

	return gsbase;
}

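/*
 * The write_task() helpers only update the cached thread state; they are
 * never valid on the running task (the live CPU registers would be left
 * stale), hence the WARN_ON_ONCE(task == current) checks.
 */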
void x86_fsbase_write_task(struct task_struct *task, unsigned long fsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.fsbase = fsbase;
}

void x86_gsbase_write_task(struct task_struct *task, unsigned long gsbase)
{
	WARN_ON_ONCE(task == current);

	task->thread.gsbase = gsbase;
}

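/*
 * Set up the thread state and pt_regs of a newly forked child: kernel
 * threads get a bare frame that jumps to the requested function, user
 * threads inherit a copy of the parent's registers.
 */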
int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
		    unsigned long arg, struct task_struct *p, unsigned long tls)
{
	int err;
	struct pt_regs *childregs;
	struct fork_frame *fork_frame;
	struct inactive_task_frame *frame;
	struct task_struct *me = current;

	childregs = task_pt_regs(p);
	fork_frame = container_of(childregs, struct fork_frame, regs);
	frame = &fork_frame->frame;

	frame->bp = 0;
	frame->ret_addr = (unsigned long) ret_from_fork;
	p->thread.sp = (unsigned long) fork_frame;
	p->thread.io_bitmap_ptr = NULL;

	savesegment(gs, p->thread.gsindex);
	p->thread.gsbase = p->thread.gsindex ? 0 : me->thread.gsbase;
	savesegment(fs, p->thread.fsindex);
	p->thread.fsbase = p->thread.fsindex ? 0 : me->thread.fsbase;
	savesegment(es, p->thread.es);
	savesegment(ds, p->thread.ds);
	memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));

	if (unlikely(p->flags & PF_KTHREAD)) {
		/* kernel thread */
		memset(childregs, 0, sizeof(struct pt_regs));
		frame->bx = sp;		/* function */
		frame->r12 = arg;
		return 0;
	}
	frame->bx = 0;
	*childregs = *current_pt_regs();

	childregs->ax = 0;
	if (sp)
		childregs->sp = sp;

	err = -ENOMEM;
	if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
		p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr,
						  IO_BITMAP_BYTES, GFP_KERNEL);
		if (!p->thread.io_bitmap_ptr) {
			p->thread.io_bitmap_max = 0;
			return -ENOMEM;
		}
		set_tsk_thread_flag(p, TIF_IO_BITMAP);
	}

	/*
	 * Set a new TLS for the child thread?
	 */
	if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
		if (in_ia32_syscall())
			err = do_set_thread_area(p, -1,
				(struct user_desc __user *)tls, 0);
		else
#endif
			err = do_arch_prctl_64(p, ARCH_SET_FS, tls);
		if (err)
			goto out;
	}
	err = 0;
out:
	if (err && p->thread.io_bitmap_ptr) {
		kfree(p->thread.io_bitmap_ptr);
		p->thread.io_bitmap_max = 0;
	}

	return err;
}

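/*
 * Reset the register state for a freshly exec'ed thread's first return to
 * user mode: fresh segments, a new ip/sp, and interrupts enabled.
 */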
static void
start_thread_common(struct pt_regs *regs, unsigned long new_ip,
		    unsigned long new_sp,
		    unsigned int _cs, unsigned int _ss, unsigned int _ds)
{
	WARN_ON_ONCE(regs != current_pt_regs());

	if (static_cpu_has(X86_BUG_NULL_SEG)) {
		/* Loading zero below won't clear the base. */
		loadsegment(fs, __USER_DS);
		load_gs_index(__USER_DS);
	}

	loadsegment(fs, 0);
	loadsegment(es, _ds);
	loadsegment(ds, _ds);
	load_gs_index(0);

	regs->ip = new_ip;
	regs->sp = new_sp;
	regs->cs = _cs;
	regs->ss = _ss;
	regs->flags = X86_EFLAGS_IF;
	force_iret();
}

void
start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    __USER_CS, __USER_DS, 0);
}
EXPORT_SYMBOL_GPL(start_thread);

#ifdef CONFIG_COMPAT
void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp)
{
	start_thread_common(regs, new_ip, new_sp,
			    test_thread_flag(TIF_X32)
			    ? __USER_CS : __USER32_CS,
			    __USER_DS, __USER_DS);
}
#endif

/*
 * switch_to(x,y) should switch tasks from x to y.
 *
 * This could still be optimized:
 * - fold all the options into a flag word and test it with a single test.
 * - could test fs/gs bitsliced
 *
 * Kprobes are not supported here; set the probe on schedule instead.
 * The function graph tracer is not supported here either.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
	struct thread_struct *prev = &prev_p->thread;
	struct thread_struct *next = &next_p->thread;
	struct fpu *prev_fpu = &prev->fpu;
	struct fpu *next_fpu = &next->fpu;
	int cpu = smp_processor_id();

	WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) &&
		     this_cpu_read(irq_count) != -1);

	if (!test_thread_flag(TIF_NEED_FPU_LOAD))
		switch_fpu_prepare(prev_fpu, cpu);

	/*
	 * We must save %fs and %gs before load_TLS() because
	 * %fs and %gs may be cleared by load_TLS().
	 *
	 * (e.g. xen_load_tls())
	 */
	save_fsgs(prev_p);

	/*
	 * Load TLS before restoring any segments so that segment loads
	 * reference the correct GDT entries.
	 */
	load_TLS(next, cpu);

	/*
	 * Leave lazy mode, flushing any hypercalls made here.  This
	 * must be done after loading TLS entries in the GDT but before
	 * loading segments that might reference them.
	 */
	arch_end_context_switch(next_p);

	/*
	 * Switch DS and ES.
	 *
	 * Reading them only returns the selectors, but writing them (if
	 * nonzero) loads the full descriptor from the GDT or LDT.  The
	 * LDT for next is loaded in switch_mm, and the GDT is loaded
	 * above.
	 *
	 * We therefore need to write new values to the segment
	 * registers on every context switch unless both the new and old
	 * values are zero.
	 *
	 * Note that we don't need to do anything for CS and SS, as
	 * those are saved and restored as part of pt_regs.
	 */
	savesegment(es, prev->es);
	if (unlikely(next->es | prev->es))
		loadsegment(es, next->es);

	savesegment(ds, prev->ds);
	if (unlikely(next->ds | prev->ds))
		loadsegment(ds, next->ds);

	x86_fsgsbase_load(prev, next);

	/*
	 * Switch the PDA and FPU contexts.
	 */
	this_cpu_write(current_task, next_p);
	this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));

	switch_fpu_finish(next_fpu);

	/* Reload sp0. */
	update_task_stack(next_p);

	switch_to_extra(prev_p, next_p);

#ifdef CONFIG_XEN_PV
	/*
	 * On Xen PV, IOPL bits in pt_regs->flags have no effect, and
	 * current_pt_regs()->flags may not match the current task's
	 * intended IOPL.  We need to switch it manually.
	 */
	if (unlikely(static_cpu_has(X86_FEATURE_XENPV) &&
		     prev->iopl != next->iopl))
		xen_set_iopl_mask(next->iopl);
#endif

	if (static_cpu_has_bug(X86_BUG_SYSRET_SS_ATTRS)) {
		/*
		 * AMD CPUs have a misfeature: SYSRET sets the SS selector but
		 * does not update the cached descriptor.  As a result, if we
		 * do SYSRET while SS is NULL, we'll end up in user mode with
		 * SS apparently equal to __USER_DS but actually unusable.
		 *
		 * The straightforward workaround would be to fix it up just
		 * before SYSRET, but that would slow down the system call
		 * fast paths.  Instead, we ensure that SS is never NULL in
		 * system call context.  We do this by replacing NULL SS
		 * selectors at every context switch.  SYSCALL sets up a valid
		 * SS, so the only way to get NULL is to re-enter the kernel
		 * from CPL 3 through an interrupt.  Since that can't happen
		 * in the same task as a running syscall, we are guaranteed to
		 * context switch between every interrupt vector entry and a
		 * subsequent SYSRET.
		 *
		 * We read SS first because SS reads are much faster than
		 * writes.  Out of caution, we force SS to __KERNEL_DS even if
		 * it previously had a different non-NULL value.
		 */
		unsigned short ss_sel;
		savesegment(ss, ss_sel);
		if (ss_sel != __KERNEL_DS)
			loadsegment(ss, __KERNEL_DS);
	}

	/* Load the Intel cache allocation PQR MSR. */
	resctrl_sched_in();

	return prev_p;
}

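/* Mark the current task as a native 64-bit process after an exec. */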
void set_personality_64bit(void)
{
	/* inherit personality from parent */

	/* Make sure to be in 64bit mode */
	clear_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_ADDR32);
	clear_thread_flag(TIF_X32);
	/* Pretend that this comes from a 64bit execve */
	task_pt_regs(current)->orig_ax = __NR_execve;
	current_thread_info()->status &= ~TS_COMPAT;

	/* Ensure the corresponding mm is not marked. */
	if (current->mm)
		current->mm->context.ia32_compat = 0;

	/*
	 * TBD: this overwrites the user's setup and should really use two
	 * bits.  But 64-bit processes have always behaved this way, so it's
	 * not too bad.  The main problem is just that 32-bit children are
	 * affected again.
	 */
	current->personality &= ~READ_IMPLIES_EXEC;
}

static void __set_personality_x32(void)
{
#ifdef CONFIG_X86_X32
	clear_thread_flag(TIF_IA32);
	set_thread_flag(TIF_X32);
	if (current->mm)
		current->mm->context.ia32_compat = TIF_X32;
	current->personality &= ~READ_IMPLIES_EXEC;
	/*
	 * in_32bit_syscall() uses the presence of the x32 syscall bit
	 * flag to determine compat status.  The x86 mmap() code relies on
	 * the syscall bitness, so set the x32 syscall bit right here to make
	 * in_32bit_syscall() work during exec().
	 *
	 * Pretend to come from an x32 execve.
	 */
	task_pt_regs(current)->orig_ax = __NR_x32_execve | __X32_SYSCALL_BIT;
	current_thread_info()->status &= ~TS_COMPAT;
#endif
}

static void __set_personality_ia32(void)
{
#ifdef CONFIG_IA32_EMULATION
	set_thread_flag(TIF_IA32);
	clear_thread_flag(TIF_X32);
	if (current->mm)
		current->mm->context.ia32_compat = TIF_IA32;
	current->personality |= force_personality32;
	/* Prepare the first "return" to user space */
	task_pt_regs(current)->orig_ax = __NR_ia32_execve;
	current_thread_info()->status |= TS_COMPAT;
#endif
}

void set_personality_ia32(bool x32)
{
	/* Make sure to be in 32bit mode */
	set_thread_flag(TIF_ADDR32);

	if (x32)
		__set_personality_x32();
	else
		__set_personality_ia32();
}
EXPORT_SYMBOL_GPL(set_personality_ia32);

#ifdef CONFIG_CHECKPOINT_RESTORE
static long prctl_map_vdso(const struct vdso_image *image, unsigned long addr)
{
	int ret;

	ret = map_vdso_once(image, addr);
	if (ret)
		return ret;

	return (long)image->size;
}
#endif

long do_arch_prctl_64(struct task_struct *task, int option, unsigned long arg2)
{
	int ret = 0;

	switch (option) {
	case ARCH_SET_GS: {
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();
		/*
		 * ARCH_SET_GS has always overwritten the index
		 * and the base.  Zero is the most sensible value
		 * to put in the index, and is the only value that
		 * makes any sense if FSGSBASE is unavailable.
		 */
		if (task == current) {
			loadseg(GS, 0);
			x86_gsbase_write_cpu_inactive(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.gsbase.
			 */
			task->thread.gsbase = arg2;

		} else {
			task->thread.gsindex = 0;
			x86_gsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_SET_FS: {
		/*
		 * Not strictly needed for %fs, but do it for symmetry
		 * with %gs.
		 */
		if (unlikely(arg2 >= TASK_SIZE_MAX))
			return -EPERM;

		preempt_disable();
		/*
		 * Set the selector to 0 for the same reason
		 * as %gs above.
		 */
		if (task == current) {
			loadseg(FS, 0);
			x86_fsbase_write_cpu(arg2);

			/*
			 * On non-FSGSBASE systems, save_base_legacy() expects
			 * that we also fill in thread.fsbase.
			 */
			task->thread.fsbase = arg2;
		} else {
			task->thread.fsindex = 0;
			x86_fsbase_write_task(task, arg2);
		}
		preempt_enable();
		break;
	}
	case ARCH_GET_FS: {
		unsigned long base = x86_fsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}
	case ARCH_GET_GS: {
		unsigned long base = x86_gsbase_read_task(task);

		ret = put_user(base, (unsigned long __user *)arg2);
		break;
	}

#ifdef CONFIG_CHECKPOINT_RESTORE
# ifdef CONFIG_X86_X32_ABI
	case ARCH_MAP_VDSO_X32:
		return prctl_map_vdso(&vdso_image_x32, arg2);
# endif
# if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
	case ARCH_MAP_VDSO_32:
		return prctl_map_vdso(&vdso_image_32, arg2);
# endif
	case ARCH_MAP_VDSO_64:
		return prctl_map_vdso(&vdso_image_64, arg2);
#endif

	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	long ret;

	ret = do_arch_prctl_64(current, option, arg2);
	if (ret == -EINVAL)
		ret = do_arch_prctl_common(current, option, arg2);

	return ret;
}

#ifdef CONFIG_IA32_EMULATION
COMPAT_SYSCALL_DEFINE2(arch_prctl, int, option, unsigned long, arg2)
{
	return do_arch_prctl_common(current, option, arg2);
}
#endif

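/* Return the user stack pointer that a task saved on kernel entry. */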
unsigned long KSTK_ESP(struct task_struct *task)
{
	return task_pt_regs(task)->sp;
}