]>
Commit | Line | Data |
---|---|---|
003ba957 | 1 | // SPDX-License-Identifier: GPL-2.0-only |
2aae950b | 2 | /* |
2aae950b | 3 | * Copyright 2007 Andi Kleen, SUSE Labs. |
1c0c1b93 AL |
4 | * |
5 | * This contains most of the x86 vDSO kernel-side code. | |
2aae950b AK |
6 | */ |
7 | #include <linux/mm.h> | |
4e950f6f | 8 | #include <linux/err.h> |
2aae950b | 9 | #include <linux/sched.h> |
68db0cf1 | 10 | #include <linux/sched/task_stack.h> |
5a0e3ad6 | 11 | #include <linux/slab.h> |
2aae950b AK |
12 | #include <linux/init.h> |
13 | #include <linux/random.h> | |
3fa89ca7 | 14 | #include <linux/elf.h> |
d4f829dd | 15 | #include <linux/cpu.h> |
b059a453 | 16 | #include <linux/ptrace.h> |
af34ebeb DS |
17 | #include <linux/time_namespace.h> |
18 | ||
cc1e24fd | 19 | #include <asm/pvclock.h> |
2aae950b AK |
20 | #include <asm/vgtod.h> |
21 | #include <asm/proto.h> | |
7f3646aa | 22 | #include <asm/vdso.h> |
1c0c1b93 | 23 | #include <asm/vvar.h> |
af34ebeb | 24 | #include <asm/tlb.h> |
aafade24 | 25 | #include <asm/page.h> |
d4f829dd | 26 | #include <asm/desc.h> |
cd4d09ec | 27 | #include <asm/cpufeature.h> |
dd2cb348 | 28 | #include <clocksource/hyperv_timer.h> |
2aae950b | 29 | |
64b302ab DS |
30 | #undef _ASM_X86_VVAR_H |
31 | #define EMIT_VVAR(name, offset) \ | |
32 | const size_t name ## _offset = offset; | |
33 | #include <asm/vvar.h> | |
34 | ||
35 | struct vdso_data *arch_get_vdso_data(void *vvar_page) | |
36 | { | |
37 | return (struct vdso_data *)(vvar_page + _vdso_data_offset); | |
38 | } | |
39 | #undef EMIT_VVAR | |
40 | ||
eec399dd TG |
41 | unsigned int vclocks_used __read_mostly; |
42 | ||
b4b541a6 | 43 | #if defined(CONFIG_X86_64) |
3d7ee969 | 44 | unsigned int __read_mostly vdso64_enabled = 1; |
b4b541a6 | 45 | #endif |
1a21d4e0 | 46 | |
4c382d72 | 47 | int __init init_vdso_image(const struct vdso_image *image) |
1a21d4e0 | 48 | { |
4c382d72 | 49 | BUILD_BUG_ON(VDSO_CLOCKMODE_MAX >= 32); |
6f121e54 | 50 | BUG_ON(image->size % PAGE_SIZE != 0); |
1a21d4e0 | 51 | |
6f121e54 AL |
52 | apply_alternatives((struct alt_instr *)(image->data + image->alt), |
53 | (struct alt_instr *)(image->data + image->alt + | |
54 | image->alt_len)); | |
4c382d72 BG |
55 | |
56 | return 0; | |
1a21d4e0 | 57 | } |
1b3f2a72 | 58 | |
70ddf651 | 59 | static const struct vm_special_mapping vvar_mapping; |
2aae950b AK |
60 | struct linux_binprm; |
61 | ||
b13fd1dc | 62 | static vm_fault_t vdso_fault(const struct vm_special_mapping *sm, |
05ef76b2 AL |
63 | struct vm_area_struct *vma, struct vm_fault *vmf) |
64 | { | |
65 | const struct vdso_image *image = vma->vm_mm->context.vdso_image; | |
66 | ||
67 | if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size) | |
68 | return VM_FAULT_SIGBUS; | |
69 | ||
70 | vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT)); | |
71 | get_page(vmf->page); | |
72 | return 0; | |
73 | } | |
74 | ||
/*
 * If the current task is inside an ia32 syscall and its saved instruction
 * pointer sits on the int80 landing pad of the old 32-bit vDSO location,
 * move it to the same landing pad inside @new_vma.
 *
 * Does nothing unless @image is the 32-bit image; compiles to an empty
 * function when neither CONFIG_X86_32 nor CONFIG_IA32_EMULATION is set.
 */
static void vdso_fix_landing(const struct vdso_image *image,
			     struct vm_area_struct *new_vma)
{
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
	struct pt_regs *regs;
	unsigned long pad_off;
	unsigned long old_base;

	if (!in_ia32_syscall() || image != &vdso_image_32)
		return;

	regs = current_pt_regs();
	pad_off = image->sym_int80_landing_pad;
	old_base = (unsigned long)current->mm->context.vdso;

	/* Fix up the userspace landing address - see do_fast_syscall_32 */
	if (regs->ip == old_base + pad_off)
		regs->ip = new_vma->vm_start + pad_off;
#endif
}
91 | ||
92 | static int vdso_mremap(const struct vm_special_mapping *sm, | |
93 | struct vm_area_struct *new_vma) | |
94 | { | |
b059a453 DS |
95 | const struct vdso_image *image = current->mm->context.vdso_image; |
96 | ||
b059a453 DS |
97 | vdso_fix_landing(image, new_vma); |
98 | current->mm->context.vdso = (void __user *)new_vma->vm_start; | |
99 | ||
100 | return 0; | |
101 | } | |
05ef76b2 | 102 | |
#ifdef CONFIG_TIME_NS
/*
 * The layout of the vvar pages depends on whether a task lives in the root
 * time namespace or in a non-root one. When a task changes its namespace,
 * the page tables of its VVAR mapping are cleared so that the pages will be
 * re-faulted with the layout matching the new namespace.
 * See also the comment near timens_setup_vdso_data() for details.
 *
 * Always returns 0.
 */
int vdso_join_timens(struct task_struct *task, struct time_namespace *ns)
{
	struct mm_struct *mm = task->mm;
	struct vm_area_struct *vma;
	VMA_ITERATOR(vmi, mm, 0);

	mmap_read_lock(mm);
	for_each_vma(vmi, vma) {
		if (!vma_is_special_mapping(vma, &vvar_mapping))
			continue;

		zap_vma_pages(vma);
	}
	mmap_read_unlock(mm);

	return 0;
}
#endif
127 | ||
b13fd1dc | 128 | static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, |
a48a7042 AL |
129 | struct vm_area_struct *vma, struct vm_fault *vmf) |
130 | { | |
131 | const struct vdso_image *image = vma->vm_mm->context.vdso_image; | |
af34ebeb | 132 | unsigned long pfn; |
a48a7042 | 133 | long sym_offset; |
a48a7042 AL |
134 | |
135 | if (!image) | |
136 | return VM_FAULT_SIGBUS; | |
137 | ||
138 | sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) + | |
139 | image->sym_vvar_start; | |
140 | ||
141 | /* | |
142 | * Sanity check: a symbol offset of zero means that the page | |
143 | * does not exist for this vdso image, not that the page is at | |
144 | * offset zero relative to the text mapping. This should be | |
145 | * impossible here, because sym_offset should only be zero for | |
146 | * the page past the end of the vvar mapping. | |
147 | */ | |
148 | if (sym_offset == 0) | |
149 | return VM_FAULT_SIGBUS; | |
150 | ||
151 | if (sym_offset == image->sym_vvar_page) { | |
af34ebeb DS |
152 | struct page *timens_page = find_timens_vvar_page(vma); |
153 | ||
154 | pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT; | |
155 | ||
156 | /* | |
157 | * If a task belongs to a time namespace then a namespace | |
158 | * specific VVAR is mapped with the sym_vvar_page offset and | |
159 | * the real VVAR page is mapped with the sym_timens_page | |
160 | * offset. | |
161 | * See also the comment near timens_setup_vdso_data(). | |
162 | */ | |
e6b28ec6 DS |
163 | if (timens_page) { |
164 | unsigned long addr; | |
165 | vm_fault_t err; | |
166 | ||
167 | /* | |
168 | * Optimization: inside time namespace pre-fault | |
169 | * VVAR page too. As on timens page there are only | |
170 | * offsets for clocks on VVAR, it'll be faulted | |
171 | * shortly by VDSO code. | |
172 | */ | |
173 | addr = vmf->address + (image->sym_timens_page - sym_offset); | |
174 | err = vmf_insert_pfn(vma, addr, pfn); | |
175 | if (unlikely(err & VM_FAULT_ERROR)) | |
176 | return err; | |
177 | ||
af34ebeb | 178 | pfn = page_to_pfn(timens_page); |
e6b28ec6 | 179 | } |
af34ebeb DS |
180 | |
181 | return vmf_insert_pfn(vma, vmf->address, pfn); | |
a48a7042 AL |
182 | } else if (sym_offset == image->sym_pvclock_page) { |
183 | struct pvclock_vsyscall_time_info *pvti = | |
9f08890a | 184 | pvclock_get_pvti_cpu0_va(); |
b95a8a27 | 185 | if (pvti && vclock_was_used(VDSO_CLOCKMODE_PVCLOCK)) { |
b13fd1dc MW |
186 | return vmf_insert_pfn_prot(vma, vmf->address, |
187 | __pa(pvti) >> PAGE_SHIFT, | |
188 | pgprot_decrypted(vma->vm_page_prot)); | |
a48a7042 | 189 | } |
90b20432 | 190 | } else if (sym_offset == image->sym_hvclock_page) { |
364adc45 | 191 | pfn = hv_get_tsc_pfn(); |
90b20432 | 192 | |
364adc45 SK |
193 | if (pfn && vclock_was_used(VDSO_CLOCKMODE_HVCLOCK)) |
194 | return vmf_insert_pfn(vma, vmf->address, pfn); | |
af34ebeb DS |
195 | } else if (sym_offset == image->sym_timens_page) { |
196 | struct page *timens_page = find_timens_vvar_page(vma); | |
197 | ||
198 | if (!timens_page) | |
199 | return VM_FAULT_SIGBUS; | |
200 | ||
201 | pfn = __pa_symbol(&__vvar_page) >> PAGE_SHIFT; | |
202 | return vmf_insert_pfn(vma, vmf->address, pfn); | |
a48a7042 AL |
203 | } |
204 | ||
a48a7042 AL |
205 | return VM_FAULT_SIGBUS; |
206 | } | |
207 | ||
2eefd878 DS |
208 | static const struct vm_special_mapping vdso_mapping = { |
209 | .name = "[vdso]", | |
210 | .fault = vdso_fault, | |
211 | .mremap = vdso_mremap, | |
212 | }; | |
213 | static const struct vm_special_mapping vvar_mapping = { | |
214 | .name = "[vvar]", | |
215 | .fault = vvar_fault, | |
216 | }; | |
217 | ||
576ebfef DS |
218 | /* |
219 | * Add vdso and vvar mappings to current process. | |
220 | * @image - blob to map | |
221 | * @addr - request a specific address (zero to map at free addr) | |
222 | */ | |
223 | static int map_vdso(const struct vdso_image *image, unsigned long addr) | |
2aae950b AK |
224 | { |
225 | struct mm_struct *mm = current->mm; | |
18d0a6fd | 226 | struct vm_area_struct *vma; |
576ebfef | 227 | unsigned long text_start; |
18d0a6fd | 228 | int ret = 0; |
b059a453 | 229 | |
d8ed45c5 | 230 | if (mmap_write_lock_killable(mm)) |
69048176 | 231 | return -EINTR; |
18d0a6fd | 232 | |
e6577a7c AL |
233 | addr = get_unmapped_area(NULL, addr, |
234 | image->size - image->sym_vvar_start, 0, 0); | |
2aae950b AK |
235 | if (IS_ERR_VALUE(addr)) { |
236 | ret = addr; | |
237 | goto up_fail; | |
238 | } | |
239 | ||
e6577a7c | 240 | text_start = addr - image->sym_vvar_start; |
f7b6eb3f | 241 | |
18d0a6fd AL |
242 | /* |
243 | * MAYWRITE to allow gdb to COW and set breakpoints | |
244 | */ | |
a62c34bd | 245 | vma = _install_special_mapping(mm, |
e6577a7c | 246 | text_start, |
a62c34bd AL |
247 | image->size, |
248 | VM_READ|VM_EXEC| | |
249 | VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC, | |
b059a453 | 250 | &vdso_mapping); |
18d0a6fd | 251 | |
a62c34bd AL |
252 | if (IS_ERR(vma)) { |
253 | ret = PTR_ERR(vma); | |
18d0a6fd | 254 | goto up_fail; |
a62c34bd | 255 | } |
18d0a6fd AL |
256 | |
257 | vma = _install_special_mapping(mm, | |
e6577a7c AL |
258 | addr, |
259 | -image->sym_vvar_start, | |
a48a7042 AL |
260 | VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP| |
261 | VM_PFNMAP, | |
a62c34bd | 262 | &vvar_mapping); |
18d0a6fd AL |
263 | |
264 | if (IS_ERR(vma)) { | |
265 | ret = PTR_ERR(vma); | |
897ab3e0 | 266 | do_munmap(mm, text_start, image->size, NULL); |
67dece7d DS |
267 | } else { |
268 | current->mm->context.vdso = (void __user *)text_start; | |
269 | current->mm->context.vdso_image = image; | |
f7b6eb3f | 270 | } |
2aae950b | 271 | |
2aae950b | 272 | up_fail: |
d8ed45c5 | 273 | mmap_write_unlock(mm); |
2aae950b AK |
274 | return ret; |
275 | } | |
276 | ||
2eefd878 DS |
277 | int map_vdso_once(const struct vdso_image *image, unsigned long addr) |
278 | { | |
279 | struct mm_struct *mm = current->mm; | |
280 | struct vm_area_struct *vma; | |
a3884621 | 281 | VMA_ITERATOR(vmi, mm, 0); |
2eefd878 | 282 | |
d8ed45c5 | 283 | mmap_write_lock(mm); |
2eefd878 DS |
284 | /* |
285 | * Check if we have already mapped vdso blob - fail to prevent | |
163b0991 | 286 | * abusing from userspace install_special_mapping, which may |
2eefd878 DS |
287 | * not do accounting and rlimit right. |
288 | * We could search vma near context.vdso, but it's a slowpath, | |
a97673a1 | 289 | * so let's explicitly check all VMAs to be completely sure. |
2eefd878 | 290 | */ |
a3884621 | 291 | for_each_vma(vmi, vma) { |
2eefd878 DS |
292 | if (vma_is_special_mapping(vma, &vdso_mapping) || |
293 | vma_is_special_mapping(vma, &vvar_mapping)) { | |
d8ed45c5 | 294 | mmap_write_unlock(mm); |
2eefd878 DS |
295 | return -EEXIST; |
296 | } | |
297 | } | |
d8ed45c5 | 298 | mmap_write_unlock(mm); |
2eefd878 DS |
299 | |
300 | return map_vdso(image, addr); | |
301 | } | |
302 | ||
#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
/*
 * Map the 32-bit vDSO image at any free address, but only when
 * vdso32_enabled is exactly 1; every other value means "disabled" and
 * yields 0 without mapping anything.
 */
static int load_vdso32(void)
{
	if (vdso32_enabled == 1)
		return map_vdso(&vdso_image_32, 0);

	return 0;
}
#endif
312 | ||
#ifdef CONFIG_X86_64
/*
 * Map the 64-bit vDSO image at any free address unless vdso64_enabled is
 * zero, in which case nothing is mapped and 0 is returned.
 */
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	return vdso64_enabled ? map_vdso(&vdso_image_64, 0) : 0;
}

#ifdef CONFIG_COMPAT
/*
 * Compat variant: with CONFIG_X86_X32_ABI and @x32 set, map the x32 image
 * (subject to vdso64_enabled); otherwise load the 32-bit vDSO when
 * CONFIG_IA32_EMULATION is set, or do nothing and return 0.
 */
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
				       int uses_interp, bool x32)
{
#ifdef CONFIG_X86_X32_ABI
	if (x32)
		return vdso64_enabled ? map_vdso(&vdso_image_x32, 0) : 0;
#endif
#ifdef CONFIG_IA32_EMULATION
	return load_vdso32();
#else
	return 0;
#endif
}
#endif
#else
/* Without CONFIG_X86_64, only the 32-bit vDSO is loaded. */
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	return load_vdso32();
}
#endif
346 | ||
c5c87812 GKB |
347 | bool arch_syscall_is_vdso_sigreturn(struct pt_regs *regs) |
348 | { | |
349 | #if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION) | |
350 | const struct vdso_image *image = current->mm->context.vdso_image; | |
351 | unsigned long vdso = (unsigned long) current->mm->context.vdso; | |
352 | ||
353 | if (in_ia32_syscall() && image == &vdso_image_32) { | |
354 | if (regs->ip == vdso + image->sym_vdso32_sigreturn_landing_pad || | |
355 | regs->ip == vdso + image->sym_vdso32_rt_sigreturn_landing_pad) | |
356 | return true; | |
357 | } | |
358 | #endif | |
359 | return false; | |
360 | } | |
361 | ||
#ifdef CONFIG_X86_64
/*
 * Handler for the "vdso=" boot parameter: store the numeric value of the
 * argument (base auto-detected by simple_strtoul) in vdso64_enabled.
 *
 * Always returns 1.
 */
static __init int vdso_setup(char *s)
{
	unsigned long requested = simple_strtoul(s, NULL, 0);

	vdso64_enabled = requested;
	return 1;
}
__setup("vdso=", vdso_setup);
#endif /* CONFIG_X86_64 */