/*
 * Copyright 2007 Andi Kleen, SUSE Labs.
 * Subject to the GPL, v.2
 *
 * This contains most of the x86 vDSO kernel-side code.
 */
#include <linux/mm.h>
#include <linux/err.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/random.h>
#include <linux/elf.h>
#include <linux/cpu.h>
#include <asm/pvclock.h>
#include <asm/vgtod.h>
#include <asm/proto.h>
#include <asm/vdso.h>
#include <asm/vvar.h>
#include <asm/page.h>
#include <asm/hpet.h>
#include <asm/desc.h>
#include <asm/cpufeature.h>

#if defined(CONFIG_X86_64)
unsigned int __read_mostly vdso64_enabled = 1;
#endif

/* Sanity-check the vDSO image and patch in instruction alternatives at boot. */
void __init init_vdso_image(const struct vdso_image *image)
{
	BUG_ON(image->size % PAGE_SIZE != 0);

	apply_alternatives((struct alt_instr *)(image->data + image->alt),
			   (struct alt_instr *)(image->data + image->alt +
						image->alt_len));
}

struct linux_binprm;

/*
 * Put the vdso above the (randomized) stack with another randomized
 * offset.  This way there is no hole in the middle of address space.
 * To save memory make sure it is still in the same PTE as the stack
 * top.  This doesn't give that many random bits.
 *
 * Note that this algorithm is imperfect: the distribution of the vdso
 * start address within a PMD is biased toward the end.
 *
 * Only used for the 64-bit and x32 vdsos.
 */
static unsigned long vdso_addr(unsigned long start, unsigned len)
{
#ifdef CONFIG_X86_32
	return 0;
#else
	unsigned long addr, end;
	unsigned offset;

	/*
	 * Round up the start address.  It can start out unaligned as a result
	 * of stack start randomization.
	 */
	start = PAGE_ALIGN(start);

	/* Round the lowest possible end address up to a PMD boundary. */
	end = (start + len + PMD_SIZE - 1) & PMD_MASK;
	if (end >= TASK_SIZE_MAX)
		end = TASK_SIZE_MAX;
	end -= len;

	if (end > start) {
		offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1);
		addr = start + (offset << PAGE_SHIFT);
	} else {
		addr = start;
	}

	/*
	 * Forcibly align the final address in case we have a hardware
	 * issue that requires alignment for performance reasons.
	 */
	addr = align_vdso_addr(addr);

	return addr;
#endif
}
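
/*
 * Worked example (illustrative, not part of the original file): if the
 * randomized stack top is 0x7ffc12345678, PAGE_ALIGN() rounds the start
 * up to 0x7ffc12346000; the highest candidate end is rounded up to the
 * next 2 MiB PMD boundary (and clamped to TASK_SIZE_MAX), the mapping
 * length is subtracted back out, and a page-granular random offset is
 * then chosen within [start, end], so the vdso still fits below that
 * boundary.
 */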

static int vdso_fault(const struct vm_special_mapping *sm,
		      struct vm_area_struct *vma, struct vm_fault *vmf)
{
	const struct vdso_image *image = vma->vm_mm->context.vdso_image;

	if (!image || (vmf->pgoff << PAGE_SHIFT) >= image->size)
		return VM_FAULT_SIGBUS;

	vmf->page = virt_to_page(image->data + (vmf->pgoff << PAGE_SHIFT));
	get_page(vmf->page);
	return 0;
}

static const struct vm_special_mapping text_mapping = {
	.name = "[vdso]",
	.fault = vdso_fault,
};

static int vvar_fault(const struct vm_special_mapping *sm,
		      struct vm_area_struct *vma, struct vm_fault *vmf)
{
	const struct vdso_image *image = vma->vm_mm->context.vdso_image;
	long sym_offset;
	int ret = -EFAULT;

	if (!image)
		return VM_FAULT_SIGBUS;

	sym_offset = (long)(vmf->pgoff << PAGE_SHIFT) +
		image->sym_vvar_start;

	/*
	 * Sanity check: a symbol offset of zero means that the page
	 * does not exist for this vdso image, not that the page is at
	 * offset zero relative to the text mapping.  This should be
	 * impossible here, because sym_offset should only be zero for
	 * the page past the end of the vvar mapping.
	 */
	if (sym_offset == 0)
		return VM_FAULT_SIGBUS;

	if (sym_offset == image->sym_vvar_page) {
		ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address,
				    __pa_symbol(&__vvar_page) >> PAGE_SHIFT);
	} else if (sym_offset == image->sym_hpet_page) {
#ifdef CONFIG_HPET_TIMER
		if (hpet_address && vclock_was_used(VCLOCK_HPET)) {
			ret = vm_insert_pfn_prot(
				vma,
				(unsigned long)vmf->virtual_address,
				hpet_address >> PAGE_SHIFT,
				pgprot_noncached(PAGE_READONLY));
		}
#endif
	} else if (sym_offset == image->sym_pvclock_page) {
		struct pvclock_vsyscall_time_info *pvti =
			pvclock_pvti_cpu0_va();
		if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
			ret = vm_insert_pfn(
				vma,
				(unsigned long)vmf->virtual_address,
				__pa(pvti) >> PAGE_SHIFT);
		}
	}

	if (ret == 0 || ret == -EBUSY)
		return VM_FAULT_NOPAGE;

	return VM_FAULT_SIGBUS;
}
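
/*
 * Illustrative note (not from the original source): sym_vvar_start is a
 * negative offset from the vdso text, so the vvar area sits below the
 * text mapping.  For a fault at vvar pgoff N, sym_offset works out to
 * N * PAGE_SIZE + sym_vvar_start, which is then matched against the
 * per-image symbol offsets (sym_vvar_page, sym_hpet_page,
 * sym_pvclock_page) to pick the physical page to insert.
 */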

static int map_vdso(const struct vdso_image *image, bool calculate_addr)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	unsigned long addr, text_start;
	int ret = 0;
	static const struct vm_special_mapping vvar_mapping = {
		.name = "[vvar]",
		.fault = vvar_fault,
	};

	if (calculate_addr) {
		addr = vdso_addr(current->mm->start_stack,
				 image->size - image->sym_vvar_start);
	} else {
		addr = 0;
	}

	down_write(&mm->mmap_sem);

	addr = get_unmapped_area(NULL, addr,
				 image->size - image->sym_vvar_start, 0, 0);
	if (IS_ERR_VALUE(addr)) {
		ret = addr;
		goto up_fail;
	}

	text_start = addr - image->sym_vvar_start;
	current->mm->context.vdso = (void __user *)text_start;
	current->mm->context.vdso_image = image;

	/*
	 * MAYWRITE to allow gdb to COW and set breakpoints
	 */
	vma = _install_special_mapping(mm,
				       text_start,
				       image->size,
				       VM_READ|VM_EXEC|
				       VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
				       &text_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto up_fail;
	}

	vma = _install_special_mapping(mm,
				       addr,
				       -image->sym_vvar_start,
				       VM_READ|VM_MAYREAD|VM_IO|VM_DONTDUMP|
				       VM_PFNMAP,
				       &vvar_mapping);

	if (IS_ERR(vma)) {
		ret = PTR_ERR(vma);
		goto up_fail;
	}

up_fail:
	if (ret)
		current->mm->context.vdso = NULL;

	up_write(&mm->mmap_sem);
	return ret;
}
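
/*
 * Illustrative layout (not from the original source): a single region of
 * image->size - image->sym_vvar_start bytes is reserved, with the [vvar]
 * special mapping at its base and the [vdso] text immediately above it;
 * this is roughly how the two entries appear in /proc/<pid>/maps.
 */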

#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
static int load_vdso32(void)
{
	if (vdso32_enabled != 1)  /* Other values all mean "disabled" */
		return 0;

	return map_vdso(&vdso_image_32, false);
}
#endif

#ifdef CONFIG_X86_64
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	if (!vdso64_enabled)
		return 0;

	return map_vdso(&vdso_image_64, true);
}

#ifdef CONFIG_COMPAT
int compat_arch_setup_additional_pages(struct linux_binprm *bprm,
				       int uses_interp)
{
#ifdef CONFIG_X86_X32_ABI
	if (test_thread_flag(TIF_X32)) {
		if (!vdso64_enabled)
			return 0;

		return map_vdso(&vdso_image_x32, true);
	}
#endif
#ifdef CONFIG_IA32_EMULATION
	return load_vdso32();
#else
	return 0;
#endif
}
#endif
#else
int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
{
	return load_vdso32();
}
#endif

#ifdef CONFIG_X86_64
static __init int vdso_setup(char *s)
{
	vdso64_enabled = simple_strtoul(s, NULL, 0);
	return 0;
}
__setup("vdso=", vdso_setup);
#endif
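
/*
 * Usage note (illustrative, not from the original source): booting with
 * "vdso=0" on the kernel command line clears vdso64_enabled, so
 * arch_setup_additional_pages() skips mapping the 64-bit vDSO; "vdso=1"
 * matches the built-in default and keeps it enabled.
 */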

#ifdef CONFIG_X86_64
static void vgetcpu_cpu_init(void *arg)
{
	int cpu = smp_processor_id();
	struct desc_struct d = { };
	unsigned long node = 0;
#ifdef CONFIG_NUMA
	node = cpu_to_node(cpu);
#endif
	if (cpu_has(&cpu_data(cpu), X86_FEATURE_RDTSCP))
		write_rdtscp_aux((node << 12) | cpu);

	/*
	 * Store cpu number in limit so that it can be loaded
	 * quickly in user space in vgetcpu. (12 bits for the CPU
	 * and 8 bits for the node)
	 */
	d.limit0 = cpu | ((node & 0xf) << 12);
	d.limit = node >> 4;
	d.type = 5;		/* RO data, expand down, accessed */
	d.dpl = 3;		/* Visible to user code */
	d.s = 1;		/* Not a system segment */
	d.p = 1;		/* Present */
	d.d = 1;		/* 32-bit */

	write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_PER_CPU, &d, DESCTYPE_S);
}
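
/*
 * Illustrative sketch (an assumption, not part of this file): the vDSO
 * side of vgetcpu can recover the values stored above by reading the
 * segment limit of the per-CPU GDT entry with LSL, roughly:
 *
 *	unsigned int p;
 *	asm("lsl %1,%0" : "=r" (p) : "r" (GDT_ENTRY_PER_CPU * 8 + 3));
 *	unsigned int cpu  = p & 0xfff;		// low 12 bits
 *	unsigned int node = (p >> 12) & 0xff;	// next 8 bits
 *
 * RDTSCP-capable CPUs can instead read the same encoding from
 * IA32_TSC_AUX, which write_rdtscp_aux() programs above.
 */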

static int
vgetcpu_cpu_notifier(struct notifier_block *n, unsigned long action, void *arg)
{
	long cpu = (long)arg;

	if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN)
		smp_call_function_single(cpu, vgetcpu_cpu_init, NULL, 1);

	return NOTIFY_DONE;
}

static int __init init_vdso(void)
{
	init_vdso_image(&vdso_image_64);

#ifdef CONFIG_X86_X32_ABI
	init_vdso_image(&vdso_image_x32);
#endif

	cpu_notifier_register_begin();

	on_each_cpu(vgetcpu_cpu_init, NULL, 1);
	/* notifier priority > KVM */
	__hotcpu_notifier(vgetcpu_cpu_notifier, 30);

	cpu_notifier_register_done();

	return 0;
}
subsys_initcall(init_vdso);
#endif /* CONFIG_X86_64 */