/*
 * Copyright 2007 Andi Kleen, SUSE Labs.
 * Subject to the GPL, v.2
 *
 * This contains most of the x86 vDSO kernel-side code.
 */
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
#include <linux/ptrace.h>
#include <asm/pvclock.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
#include <asm/vvar.h>
#include <asm/page.h>
#include <asm/desc.h>
#include <asm/cpufeature.h>

#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
#endif

void __init init_vdso_image(const struct vdso_image *image)
{
	BUG_ON(image->size % PAGE_SIZE != 0);

	apply_alternatives((struct alt_instr *)(image->data + image->alt),
			   (struct alt_instr *)(image->data + image->alt +
						image->alt_len));
}

struct linux_binprm;

/*
 * Put the vdso above the (randomized) stack with another randomized
 * offset.  This way there is no hole in the middle of the address space.
 * To save memory, make sure the vdso still shares a page-table page with
 * the stack top (hence the PMD rounding below).  This doesn't give that
 * many random bits.
 *
 * Note that this algorithm is imperfect: the distribution of the vdso
 * start address within a PMD is biased toward the end.
 *
 * Only used for the 64-bit and x32 vdsos.
 */
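/*
 * Illustrative numbers (not from the original source): for a stack top
 * of 0x7ffc12345678 and a 3-page blob (len = 0x3000),
 *
 *	start = PAGE_ALIGN(0x7ffc12345678)            = 0x7ffc12346000
 *	end   = round_up(start + len, PMD_SIZE) - len = 0x7ffc123fd000
 *
 * which leaves (end - start)/PAGE_SIZE + 1 = 184 candidate page slots,
 * i.e. roughly 7.5 bits of randomness.
 */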
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
#ifdef CONFIG_X86_32
	return 0;
#else
	unsigned long addr, end;
	unsigned offset;

	/*
	 * Round up the start address.  It can start out unaligned as a result
	 * of stack start randomization.
	 */
	start = PAGE_ALIGN(start);

	/* Round the lowest possible end address up to a PMD boundary. */
	end = (start + len + PMD_SIZE - 1) & PMD_MASK;
	if (end >= TASK_SIZE_MAX)
		end = TASK_SIZE_MAX;
	end -= len;

	if (end > start) {
		offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
		addr = start + (offset << PAGE_SHIFT);
	} else {
		addr = start;
	}

	/*
	 * Forcibly align the final address in case we have a hardware
	 * issue that requires alignment for performance reasons.
	 */
	addr = align_vdso_addr(addr);

	return addr;
#endif
}

static int vdso_fault(const struct vm_special_mapping *sm,
		      struct vm_area_struct *vma, struct vm_fault *vmf)
{
	const struct vdso_image *image = vma->vm_mm->context.vdso_image;

	if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
		return VM_FAULT_SIGBUS;

	vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
	get_page(vmf->page);
	return 0;
}

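/*
 * The vdso VMA can be moved with mremap() (CRIU, for example, does this
 * when restoring a process image).  If a 32-bit task is sitting in the
 * vdso's syscall landing pad while the VMA moves, its saved IP must be
 * rewritten to the corresponding address inside the new VMA.
 */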
static void vdso_fix_landing(const struct vdso_image *image,
		struct vm_area_struct *new_vma)
{
#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
	if (in_ia32_syscall() && image == &vdso_image_32) {
		struct pt_regs *regs = current_pt_regs();
		unsigned long vdso_land = image->sym_int80_landing_pad;
		unsigned long old_land_addr = vdso_land +
			(unsigned long)current->mm->context.vdso;

		/* Fix up the userspace landing address - see do_fast_syscall_32() */
		if (regs->ip == old_land_addr)
			regs->ip = new_vma->vm_start + vdso_land;
	}
#endif
}

static int vdso_mremap(const struct vm_special_mapping *sm,
		struct vm_area_struct *new_vma)
{
	unsigned long new_size = new_vma->vm_end - new_vma->vm_start;
	const struct vdso_image *image = current->mm->context.vdso_image;

	if (image->size != new_size)
		return -EINVAL;

	if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
		return -EFAULT;

	vdso_fix_landing(image, new_vma);
	current->mm->context.vdso = (void __user *)new_vma->vm_start;

	return 0;
}

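/*
 * The vvar area has no backing pages of its own: each fault inserts the
 * PFN of a shared kernel page (the vvar page, or the pvclock page when
 * a paravirt clock is in use) directly into the page tables.
 */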
static int vvar_fault(const struct vm_special_mapping *sm,
		      struct vm_area_struct *vma, struct vm_fault *vmf)
{
	const struct vdso_image *image = vma->vm_mm->context.vdso_image;
	long sym_offset;
	int ret = -EFAULT;

	if (!image)
		return VM_FAULT_SIGBUS;

	sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
		image->sym_vvar_start;

	/*
	 * Sanity check: a symbol offset of zero means that the page
	 * does not exist for this vdso image, not that the page is at
	 * offset zero relative to the text mapping.  This should be
	 * impossible here, because sym_offset should only be zero for
	 * the page past the end of the vvar mapping.
	 */
	if (sym_offset == 0)
		return VM_FAULT_SIGBUS;

	if (sym_offset == image->sym_vvar_page) {
		ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
				    __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
	} else if (sym_offset == image->sym_pvclock_page) {
		struct pvclock_vsyscall_time_info *pvti =
			pvclock_pvti_cpu0_va();
		if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
			ret = vm_insert_pfn(
				vma,
				(unsigned long)vmf->virtual_address,
				__pa(pvti) >> PAGE_SHIFT);
		}
	}

	if (ret == 0 || ret == -EBUSY)
		return VM_FAULT_NOPAGE;

	return VM_FAULT_SIGBUS;
}

static const struct vm_special_mapping vdso_mapping = {
	.name = "[vdso]",
	.fault = vdso_fault,
	.mremap = vdso_mremap,
};
static const struct vm_special_mapping vvar_mapping = {
	.name = "[vvar]",
	.fault = vvar_fault,
};

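/*
 * Layout note: image->sym_vvar_start is negative, so the vvar area lives
 * directly below the vdso text.  A single get_unmapped_area() call of
 * image->size - image->sym_vvar_start bytes therefore reserves room for
 * both mappings at once.
 */
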
/*
 * Add vdso and vvar mappings to current process.
 * @image - blob to map
 * @addr - request a specific address (zero to map at free addr)
 */
static int map_vdso(const struct vdso_image *image, unsigned long addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long text_start;
	int ret = 0;

	if (down_write_killable(&mm->mmap_sem))
		return -EINTR;

	addr = get_unmapped_area(NULL, addr,
				 image->size - image->sym_vvar_start, 0, 0);
	if (IS_ERR_VALUE(addr)) {
		ret = addr;
		goto up_fail;
	}

	text_start = addr - image->sym_vvar_start;
	current->mm->context.vdso = (void __user *)text_start;
	current->mm->context.vdso_image = image;

	/*
	 * MAYWRITE to allow gdb to COW and set breakpoints
	 */
	vma = _install_special_mapping(mm,
				       text_start,
				       image->size,
				       VM_READ|VM_EXEC|
				       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
				       &vdso_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto up_fail;
	}

	vma = _install_special_mapping(mm,
				       addr,
				       -image->sym_vvar_start,
				       VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
				       VM_PFNMAP,
				       &vvar_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		do_munmap(mm, text_start, image->size);
	}

up_fail:
	if (ret) {
		current->mm->context.vdso = NULL;
		current->mm->context.vdso_image = NULL;
	}

	up_write(&mm->mmap_sem);
	return ret;
}

static int map_vdso_randomized(const struct vdso_image *image)
{
	unsigned long addr = vdso_addr(current->mm->start_stack,
				       image->size - image->sym_vvar_start);
	return map_vdso(image, addr);
}

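/*
 * Entry point for the ARCH_MAP_VDSO_* arch_prctl() calls (used, for
 * example, by CRIU to map a vdso blob at restore time); it refuses to
 * map a second copy if one is already present.
 */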
int map_vdso_once(const struct vdso_image *image, unsigned long addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;

	down_write(&mm->mmap_sem);
	/*
	 * Check if we have already mapped a vdso blob - fail to prevent
	 * userspace from abusing install_special_mapping(), which may
	 * not do accounting and rlimits right.
	 * We could search the vma near context.vdso, but it's a slowpath,
	 * so let's explicitly check all VMAs to be completely sure.
	 */
	for (vma = mm->mmap; vma; vma = vma->vm_next) {
		if (vma_is_special_mapping(vma, &vdso_mapping) ||
				vma_is_special_mapping(vma, &vvar_mapping)) {
			up_write(&mm->mmap_sem);
			return -EEXIST;
		}
	}
	up_write(&mm->mmap_sem);

	return map_vdso(image, addr);
}

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static int load_vdso32(void)
{
	if (vdso32_enabled != 1)  /* Other values all mean "disabled" */
		return 0;

	return map_vdso(&vdso_image_32, 0);
}
#endif

#ifdef CONFIG_X86_64
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	if (!vdso64_enabled)
		return 0;

	return map_vdso_randomized(&vdso_image_64);
}

#ifdef CONFIG_COMPAT
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
				       int uses_interp)
{
#ifdef CONFIG_X86_X32_ABI
	if (test_thread_flag(TIF_X32)) {
		if (!vdso64_enabled)
			return 0;
		return map_vdso_randomized(&vdso_image_x32);
	}
#endif
#ifdef CONFIG_IA32_EMULATION
	return load_vdso32();
#else
	return 0;
#endif
}
#endif
#else
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	return load_vdso32();
}
#endif

#ifdef CONFIG_X86_64
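/*
 * Parse the "vdso=" boot parameter; for example, booting with vdso=0
 * disables mapping the 64-bit vdso into new processes.
 */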
static __init int vdso_setup(char *s)
{
	vdso64_enabled = simple_strtoul(s, NULL, 0);
	return 0;
}
__setup("vdso=", vdso_setup);
#endif

#ifdef CONFIG_X86_64
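/*
 * vgetcpu support: the current CPU and node numbers are published where
 * user space can read them cheaply - in the TSC_AUX MSR when RDTSCP is
 * available, and in the limit field of a per-CPU GDT segment that user
 * code can query with LSL.
 */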
static void vgetcpu_cpu_init(void *arg)
{
	int cpu = smp_processor_id();
	struct desc_struct d = { };
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node(cpu);
#endif
	if (static_cpu_has(X86_FEATURE_RDTSCP))
		write_rdtscp_aux((node << 12) | cpu);

	/*
	 * Store the cpu number in the segment limit so that it can be
	 * loaded quickly in user space in vgetcpu.  (12 bits for the CPU
	 * and 8 bits for the node)
	 */
	d.limit0 = cpu | ((node & 0xf) << 12);
	d.limit = node >> 4;
	d.type = 5;	/* RO data, expand down, accessed */
	d.dpl = 3;	/* Visible to user code */
	d.s = 1;	/* Not a system segment */
	d.p = 1;	/* Present */
	d.d = 1;	/* 32-bit */

	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
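
/*
 * Worked example of the limit encoding above: for cpu 5 on node 1 the
 * 20-bit segment limit becomes 0x01005 (d.limit0 = 0x1005, d.limit = 0),
 * so user code recovers cpu = limit & 0xfff and node = limit >> 12.
 */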

static int vgetcpu_online(unsigned int cpu)
{
	return smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);
}

static int __init init_vdso(void)
{
	init_vdso_image(&vdso_image_64);

#ifdef CONFIG_X86_X32_ABI
	init_vdso_image(&vdso_image_x32);
#endif

	/* notifier priority > KVM */
	return cpuhp_setup_state(CPUHP_AP_X86_VDSO_VMA_ONLINE,
				 "AP_X86_VDSO_VMA_ONLINE", vgetcpu_online, NULL);
}
subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */